xref: /qemu/target/arm/tcg/sve_helper.c (revision 0baf907b718e1602383b973de7822c25db4c4a36)
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/page-protection.h"
24 #include "exec/helper-proto.h"
25 #include "exec/target_page.h"
26 #include "exec/tlb-flags.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg.h"
30 #include "vec_internal.h"
31 #include "sve_ldst_internal.h"
32 #include "accel/tcg/cpu-ldst.h"
33 #include "accel/tcg/helper-retaddr.h"
34 #include "accel/tcg/cpu-ops.h"
35 #include "accel/tcg/probe.h"
36 #ifdef CONFIG_USER_ONLY
37 #include "user/page-protection.h"
38 #endif
39 
40 
41 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
42  *
43  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
44  * and bit 0 set if C is set.  Compare the definitions of these variables
45  * within CPUARMState.
46  */
47 
48 /* For no G bits set, NZCV = C.  */
49 #define PREDTEST_INIT  1
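
/*
 * Illustrative example of the packing above: for a single predicate word
 * with G = 0b0110 and D = 0b0010, the first active element (bit 1) is set
 * in D so N = 1, some active element of D is set so Z = 0, and the last
 * active element (bit 2) is clear in D so C = 1.  Packed as described,
 * that is 0x80000003: bit 31 (N), bit 1 (!Z), bit 0 (C).
 */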
50 
51 /* This is an iterative function, called for each Pd and Pg word
52  * moving forward.
53  */
54 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
55 {
56     if (likely(g)) {
57         /* Compute N from first D & G.
58            Use bit 2 to signal first G bit seen.  */
59         if (!(flags & 4)) {
60             flags |= ((d & (g & -g)) != 0) << 31;
61             flags |= 4;
62         }
63 
64         /* Accumulate Z from each D & G.  */
65         flags |= ((d & g) != 0) << 1;
66 
67         /* Compute C from last !(D & G).  Replace previous.  */
68         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
69     }
70     return flags;
71 }
72 
73 /* This is an iterative function, called for each Pd and Pg word
74  * moving backward.
75  */
76 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
77 {
78     if (likely(g)) {
79         /* Compute C from first (i.e last) !(D & G).
80            Use bit 2 to signal first G bit seen.  */
81         if (!(flags & 4)) {
82             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
83             flags |= (d & pow2floor(g)) == 0;
84         }
85 
86         /* Accumulate Z from each D & G.  */
87         flags |= ((d & g) != 0) << 1;
88 
89         /* Compute N from last (i.e first) D & G.  Replace previous.  */
90         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
91     }
92     return flags;
93 }
94 
95 /* The same for a single word predicate.  */
96 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
97 {
98     return iter_predtest_fwd(d, g, PREDTEST_INIT);
99 }
100 
101 /* The same for a multi-word predicate.  */
102 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
103 {
104     uint32_t flags = PREDTEST_INIT;
105     uint64_t *d = vd, *g = vg;
106     uintptr_t i = 0;
107 
108     do {
109         flags = iter_predtest_fwd(d[i], g[i], flags);
110     } while (++i < words);
111 
112     return flags;
113 }
114 
115 /* Similarly for single word elements.  */
116 static inline uint64_t expand_pred_s(uint8_t byte)
117 {
118     static const uint64_t word[] = {
119         [0x01] = 0x00000000ffffffffull,
120         [0x10] = 0xffffffff00000000ull,
121         [0x11] = 0xffffffffffffffffull,
122     };
123     return word[byte & 0x11];
124 }
125 
126 #define LOGICAL_PPPP(NAME, FUNC) \
127 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
128 {                                                                         \
129     uintptr_t opr_sz = simd_oprsz(desc);                                  \
130     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
131     uintptr_t i;                                                          \
132     for (i = 0; i < opr_sz / 8; ++i) {                                    \
133         d[i] = FUNC(n[i], m[i], g[i]);                                    \
134     }                                                                     \
135 }
136 
137 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
138 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
139 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
140 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
141 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
142 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
143 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
144 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
145 
146 LOGICAL_PPPP(sve_and_pppp, DO_AND)
147 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
148 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
149 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
150 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
151 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
152 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
153 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
154 
155 #undef DO_AND
156 #undef DO_BIC
157 #undef DO_EOR
158 #undef DO_ORR
159 #undef DO_ORN
160 #undef DO_NOR
161 #undef DO_NAND
162 #undef DO_SEL
163 #undef LOGICAL_PPPP
164 
165 /* Fully general three-operand expander, controlled by a predicate.
166  * This is complicated by the host-endian storage of the register file.
167  */
168 /* ??? I don't expect the compiler could ever vectorize this itself.
169  * With some tables we can convert bit masks to byte masks, and with
170  * extra care wrt byte/word ordering we could use gcc generic vectors
171  * and do 16 bytes at a time.
172  */
173 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
174 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
175 {                                                                       \
176     intptr_t i, opr_sz = simd_oprsz(desc);                              \
177     for (i = 0; i < opr_sz; ) {                                         \
178         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
179         do {                                                            \
180             if (pg & 1) {                                               \
181                 TYPE nn = *(TYPE *)(vn + H(i));                         \
182                 TYPE mm = *(TYPE *)(vm + H(i));                         \
183                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
184             }                                                           \
185             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
186         } while (i & 15);                                               \
187     }                                                                   \
188 }
189 
190 /* Similarly, specialized for 64-bit operands.  */
191 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
192 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
193 {                                                               \
194     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
195     TYPE *d = vd, *n = vn, *m = vm;                             \
196     uint8_t *pg = vg;                                           \
197     for (i = 0; i < opr_sz; i += 1) {                           \
198         if (pg[H1(i)] & 1) {                                    \
199             TYPE nn = n[i], mm = m[i];                          \
200             d[i] = OP(nn, mm);                                  \
201         }                                                       \
202     }                                                           \
203 }
204 
205 #define DO_AND(N, M)  (N & M)
206 #define DO_EOR(N, M)  (N ^ M)
207 #define DO_ORR(N, M)  (N | M)
208 #define DO_BIC(N, M)  (N & ~M)
209 #define DO_ADD(N, M)  (N + M)
210 #define DO_SUB(N, M)  (N - M)
211 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
212 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
213 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
214 #define DO_MUL(N, M)  (N * M)
215 
216 
217 /*
218  * We must avoid the C undefined behaviour cases: division by
219  * zero and signed division of INT_MIN by -1. Both of these
220  * have architecturally defined required results for Arm.
221  * We special case all signed divisions by -1 to avoid having
222  * to deduce the minimum integer for the type involved.
223  */
224 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
225 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
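
/*
 * For example: SDIV and UDIV with a divisor of 0 yield 0, and SDIV of the
 * most negative value by -1 yields the most negative value again, which is
 * the result the -N special case above is intended to produce.
 */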
226 
227 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
228 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
229 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
230 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
231 
232 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
233 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
234 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
235 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
236 
237 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
238 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
239 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
240 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
241 
242 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
243 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
244 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
245 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
246 
247 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
248 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
249 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
250 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
251 
252 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
253 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
254 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
255 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
256 
257 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
258 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
259 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
260 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
261 
262 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
263 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
264 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
265 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
266 
267 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
268 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
269 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
270 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
271 
272 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
273 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
274 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
275 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
276 
277 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
278 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
279 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
280 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
281 
282 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
283 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
284 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
285 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
286 
287 /* Because the computation type is at least twice as large as required,
288    these work for both signed and unsigned source types.  */
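
/*
 * For example, do_mulh_b below is used for both sve_smulh_zpzz_b (its
 * int8_t arguments sign-extend into the int32_t parameters) and
 * sve_umulh_zpzz_b (uint8_t arguments zero-extend), so one 32-bit multiply
 * yields the correct high 8 bits in both cases.
 */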
289 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
290 {
291     return (n * m) >> 8;
292 }
293 
294 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
295 {
296     return (n * m) >> 16;
297 }
298 
299 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
300 {
301     return (n * m) >> 32;
302 }
303 
304 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
305 {
306     uint64_t lo, hi;
307     muls64(&lo, &hi, n, m);
308     return hi;
309 }
310 
311 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
312 {
313     uint64_t lo, hi;
314     mulu64(&lo, &hi, n, m);
315     return hi;
316 }
317 
318 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
319 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
320 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
321 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
322 
323 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
324 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
325 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
326 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
327 
328 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
329 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
330 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
331 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
332 
333 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
334 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
335 
336 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
337 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
338 
339 /* Note that all bits of the shift are significant
340    and not modulo the element size.  */
341 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
342 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
343 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
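
/*
 * For example, with byte elements a shift count of 8 or more yields 0 for
 * LSR/LSL, while ASR clamps the count to 7 so the result is the sign bit
 * replicated across the element.
 */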
344 
345 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
348 
349 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
350 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
351 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
352 
353 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
354 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
355 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
356 
357 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
358 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
359 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
360 
361 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
362 {
363     int8_t n1 = n, n2 = n >> 8;
364     return m + n1 + n2;
365 }
366 
367 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
368 {
369     int16_t n1 = n, n2 = n >> 16;
370     return m + n1 + n2;
371 }
372 
373 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
374 {
375     int32_t n1 = n, n2 = n >> 32;
376     return m + n1 + n2;
377 }
378 
379 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
380 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
381 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
382 
383 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
384 {
385     uint8_t n1 = n, n2 = n >> 8;
386     return m + n1 + n2;
387 }
388 
389 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
390 {
391     uint16_t n1 = n, n2 = n >> 16;
392     return m + n1 + n2;
393 }
394 
395 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
396 {
397     uint32_t n1 = n, n2 = n >> 32;
398     return m + n1 + n2;
399 }
400 
401 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
402 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
403 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
404 
405 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
406 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
407 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
408 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
409 
410 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
411 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
412 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
413 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
414 
415 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
416 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
417 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
418 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
419 
420 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
421 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
422 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
423 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
424 
425 /*
426  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
427  * We pass in a pointer to a dummy saturation field to trigger
428  * the saturating arithmetic but discard the information about
429  * whether it has occurred.
430  */
431 #define do_sqshl_b(n, m) \
432    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
433 #define do_sqshl_h(n, m) \
434    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
435 #define do_sqshl_s(n, m) \
436    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
437 #define do_sqshl_d(n, m) \
438    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
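
/*
 * For example, do_sqshl_b(0x40, 2) overflows the signed 8-bit range and
 * saturates to 0x7f; the saturation status written to 'discard' is dropped.
 */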
439 
440 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
441 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
442 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
443 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
444 
445 #define do_uqshl_b(n, m) \
446    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
447 #define do_uqshl_h(n, m) \
448    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
449 #define do_uqshl_s(n, m) \
450    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
451 #define do_uqshl_d(n, m) \
452    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
453 
454 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
455 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
456 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
457 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
458 
459 #define do_sqrshl_b(n, m) \
460    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
461 #define do_sqrshl_h(n, m) \
462    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
463 #define do_sqrshl_s(n, m) \
464    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
465 #define do_sqrshl_d(n, m) \
466    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
467 
468 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
469 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
470 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
471 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
472 
473 #undef do_sqrshl_d
474 
475 #define do_uqrshl_b(n, m) \
476    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
477 #define do_uqrshl_h(n, m) \
478    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
479 #define do_uqrshl_s(n, m) \
480    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
481 #define do_uqrshl_d(n, m) \
482    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
483 
484 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
485 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
486 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
487 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
488 
489 #undef do_uqrshl_d
490 
491 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
492 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
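
/*
 * The BHS forms can widen to int64_t, so a plain add-then-shift works.  The
 * 64-bit form cannot widen: (n >> 1) + (m >> 1) loses one low bit from each
 * operand, and the (n & m & 1) term restores the carry those bits would have
 * produced, e.g. n = 3, m = 1: (1 + 0) + 1 == 2 == (3 + 1) >> 1.  The
 * rounding (DO_RHADD_D) and borrow (DO_HSUB_D) terms below follow the same
 * idea.
 */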
493 
494 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
495 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
496 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
497 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
498 
499 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
500 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
501 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
502 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
503 
504 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
505 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
506 
507 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
508 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
509 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
510 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
511 
512 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
513 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
514 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
515 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
516 
517 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
518 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
519 
520 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
521 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
522 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
523 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
524 
525 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
526 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
527 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
528 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
529 
530 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
531 {
532     return val >= max ? max : val <= min ? min : val;
533 }
534 
535 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
536 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
537 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
538 
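/*
 * In the 64-bit case below we cannot widen, so detect signed overflow
 * directly: ((r ^ n) & ~(n ^ m)) < 0 is true exactly when n and m have the
 * same sign and r has the opposite sign, i.e. the addition overflowed.
 */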
539 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
540 {
541     int64_t r = n + m;
542     if (((r ^ n) & ~(n ^ m)) < 0) {
543         /* Signed overflow.  */
544         return r < 0 ? INT64_MAX : INT64_MIN;
545     }
546     return r;
547 }
548 
549 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
550 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
551 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
552 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
553 
554 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
555 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
556 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
557 
558 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
559 {
560     uint64_t r = n + m;
561     return r < n ? UINT64_MAX : r;
562 }
563 
564 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
565 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
566 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
567 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
568 
569 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
570 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
571 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
572 
573 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
574 {
575     int64_t r = n - m;
576     if (((r ^ n) & (n ^ m)) < 0) {
577         /* Signed overflow.  */
578         return r < 0 ? INT64_MAX : INT64_MIN;
579     }
580     return r;
581 }
582 
583 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
584 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
585 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
586 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
587 
588 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
589 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
590 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
591 
592 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
593 {
594     return n > m ? n - m : 0;
595 }
596 
597 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
598 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
599 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
600 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
601 
602 #define DO_SUQADD_B(n, m) \
603     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
604 #define DO_SUQADD_H(n, m) \
605     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
606 #define DO_SUQADD_S(n, m) \
607     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
608 
609 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
610 {
611     uint64_t r = n + m;
612 
613     if (n < 0) {
614         /* Note that m - abs(n) cannot underflow. */
615         if (r > INT64_MAX) {
616             /* Result is either very large positive or negative. */
617             if (m > -n) {
618                 /* m > abs(n), so r is a very large positive. */
619                 return INT64_MAX;
620             }
621             /* Result is negative. */
622         }
623     } else {
624         /* Both inputs are positive: check for overflow.  */
625         if (r < m || r > INT64_MAX) {
626             return INT64_MAX;
627         }
628     }
629     return r;
630 }
631 
632 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
633 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
634 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
635 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
636 
637 #define DO_USQADD_B(n, m) \
638     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
639 #define DO_USQADD_H(n, m) \
640     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
641 #define DO_USQADD_S(n, m) \
642     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
643 
644 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
645 {
646     uint64_t r = n + m;
647 
648     if (m < 0) {
649         return n < -m ? 0 : r;
650     }
651     return r < n ? UINT64_MAX : r;
652 }
653 
654 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
655 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
656 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
657 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
658 
659 #undef DO_ZPZZ
660 #undef DO_ZPZZ_D
661 
662 /*
663  * Three operand expander, operating on element pairs.
664  * If the slot I is even, the elements come from VN {I, I+1}.
665  * If the slot I is odd, the elements come from VM {I-1, I}.
666  * Load all of the input elements in each pair before overwriting output.
667  */
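/*
 * For example, pairwise ADD on four active elements produces
 * d = { n[0]+n[1], m[0]+m[1], n[2]+n[3], m[2]+m[3] }.
 */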
668 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
669 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
670 {                                                               \
671     intptr_t i, opr_sz = simd_oprsz(desc);                      \
672     for (i = 0; i < opr_sz; ) {                                 \
673         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
674         do {                                                    \
675             TYPE n0 = *(TYPE *)(vn + H(i));                     \
676             TYPE m0 = *(TYPE *)(vm + H(i));                     \
677             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
678             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
679             if (pg & 1) {                                       \
680                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
681             }                                                   \
682             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
683             if (pg & 1) {                                       \
684                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
685             }                                                   \
686             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
687         } while (i & 15);                                       \
688     }                                                           \
689 }
690 
691 /* Similarly, specialized for 64-bit operands.  */
692 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
693 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
694 {                                                               \
695     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
696     TYPE *d = vd, *n = vn, *m = vm;                             \
697     uint8_t *pg = vg;                                           \
698     for (i = 0; i < opr_sz; i += 2) {                           \
699         TYPE n0 = n[i], n1 = n[i + 1];                          \
700         TYPE m0 = m[i], m1 = m[i + 1];                          \
701         if (pg[H1(i)] & 1) {                                    \
702             d[i] = OP(n0, n1);                                  \
703         }                                                       \
704         if (pg[H1(i + 1)] & 1) {                                \
705             d[i + 1] = OP(m0, m1);                              \
706         }                                                       \
707     }                                                           \
708 }
709 
710 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
711 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
712 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
713 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
714 
715 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
716 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
717 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
718 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
719 
720 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
721 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
722 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
723 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
724 
725 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
726 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
727 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
728 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
729 
730 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
731 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
732 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
733 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
734 
735 #undef DO_ZPZZ_PAIR
736 #undef DO_ZPZZ_PAIR_D
737 
738 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
739 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
740                   float_status *status, uint32_t desc)                  \
741 {                                                                       \
742     intptr_t i, opr_sz = simd_oprsz(desc);                              \
743     for (i = 0; i < opr_sz; ) {                                         \
744         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
745         do {                                                            \
746             TYPE n0 = *(TYPE *)(vn + H(i));                             \
747             TYPE m0 = *(TYPE *)(vm + H(i));                             \
748             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
749             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
750             if (pg & 1) {                                               \
751                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
752             }                                                           \
753             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
754             if (pg & 1) {                                               \
755                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
756             }                                                           \
757             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
758         } while (i & 15);                                               \
759     }                                                                   \
760 }
761 
762 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
763 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
764 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
765 
766 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
768 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
769 
770 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
771 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
772 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
773 
774 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
775 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
776 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
777 
778 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
779 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
780 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
781 
782 #undef DO_ZPZZ_PAIR_FP
783 
784 /* Three-operand expander, controlled by a predicate, in which the
785  * third operand is "wide".  That is, for D = N op M, the same 64-bit
786  * value of M is used with all of the narrower values of N.
787  */
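/*
 * For example, in sve_asr_zpzw_b each group of eight byte elements of N is
 * shifted by the single 64-bit element of M that occupies the same bytes.
 */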
788 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
789 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
790 {                                                                       \
791     intptr_t i, opr_sz = simd_oprsz(desc);                              \
792     for (i = 0; i < opr_sz; ) {                                         \
793         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
794         TYPEW mm = *(TYPEW *)(vm + i);                                  \
795         do {                                                            \
796             if (pg & 1) {                                               \
797                 TYPE nn = *(TYPE *)(vn + H(i));                         \
798                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
799             }                                                           \
800             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
801         } while (i & 7);                                                \
802     }                                                                   \
803 }
804 
805 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
806 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
807 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
808 
809 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
810 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
811 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
812 
813 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
814 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
815 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
816 
817 #undef DO_ZPZW
818 
819 /* Fully general two-operand expander, controlled by a predicate.
820  */
821 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
822 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
823 {                                                               \
824     intptr_t i, opr_sz = simd_oprsz(desc);                      \
825     for (i = 0; i < opr_sz; ) {                                 \
826         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
827         do {                                                    \
828             if (pg & 1) {                                       \
829                 TYPE nn = *(TYPE *)(vn + H(i));                 \
830                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
831             }                                                   \
832             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
833         } while (i & 15);                                       \
834     }                                                           \
835 }
836 
837 /* Similarly, specialized for 64-bit operands.  */
838 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
839 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
840 {                                                               \
841     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
842     TYPE *d = vd, *n = vn;                                      \
843     uint8_t *pg = vg;                                           \
844     for (i = 0; i < opr_sz; i += 1) {                           \
845         if (pg[H1(i)] & 1) {                                    \
846             TYPE nn = n[i];                                     \
847             d[i] = OP(nn);                                      \
848         }                                                       \
849     }                                                           \
850 }
851 
852 #define DO_CLS_B(N)   (clrsb32(N) - 24)
853 #define DO_CLS_H(N)   (clrsb32(N) - 16)
854 
855 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
856 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
857 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
858 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
859 
860 #define DO_CLZ_B(N)   (clz32(N) - 24)
861 #define DO_CLZ_H(N)   (clz32(N) - 16)
862 
863 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
864 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
865 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
866 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
867 
868 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
869 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
870 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
871 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
872 
873 #define DO_CNOT(N)    (N == 0)
874 
875 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
876 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
877 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
878 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
879 
880 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
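
/*
 * ((__typeof(N))-1 >> 1) is an unsigned all-ones value shifted right once,
 * i.e. a mask of every bit except the sign bit, so DO_FABS clears the sign
 * bit and DO_FNEG below flips it.
 */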
881 
882 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
883 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
884 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
885 
886 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
887 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
888 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
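
/*
 * The _ah_ variants pass NaN operands through unchanged, the FABS/FNEG
 * behaviour used when FPCR.AH is set.
 */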
889 
890 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
891 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
892 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
893 
894 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
895 
896 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
897 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
898 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
899 
900 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
901 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
902 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
903 
904 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
905 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
906 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
907 
908 #define DO_NOT(N)    (~N)
909 
910 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
911 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
912 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
913 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
914 
915 #define DO_SXTB(N)    ((int8_t)N)
916 #define DO_SXTH(N)    ((int16_t)N)
917 #define DO_SXTS(N)    ((int32_t)N)
918 #define DO_UXTB(N)    ((uint8_t)N)
919 #define DO_UXTH(N)    ((uint16_t)N)
920 #define DO_UXTS(N)    ((uint32_t)N)
921 
922 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
923 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
924 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
925 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
926 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
927 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
928 
929 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
930 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
931 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
932 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
933 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
934 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
935 
936 #define DO_ABS(N)    (N < 0 ? -N : N)
937 
938 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
939 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
940 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
941 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
942 
943 #define DO_NEG(N)    (-N)
944 
945 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
946 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
947 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
948 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
949 
950 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
951 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
952 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
953 
954 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
955 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
956 
957 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
958 
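/*
 * REVD: reverse the two 64-bit doublewords within each 128-bit quadword,
 * predicated per quadword using the predicate bit of the even doubleword.
 */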
959 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
960 {
961     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
962     uint64_t *d = vd, *n = vn;
963     uint8_t *pg = vg;
964 
965     for (i = 0; i < opr_sz; i += 2) {
966         if (pg[H1(i)] & 1) {
967             uint64_t n0 = n[i + 0];
968             uint64_t n1 = n[i + 1];
969             d[i + 0] = n1;
970             d[i + 1] = n0;
971         }
972     }
973 }
974 
975 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
976 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
977 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
978 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
979 
980 #define DO_SQABS(X) \
981     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
982        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
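
/*
 * Saturating abs: the most negative value maps to the most positive,
 * e.g. SQABS of (int8_t)-128 yields 127 rather than wrapping back to -128.
 * DO_SQNEG below has the same special case.
 */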
983 
984 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
985 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
986 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
987 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
988 
989 #define DO_SQNEG(X) \
990     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
991        x_ == min_ ? -min_ - 1 : -x_; })
992 
993 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
994 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
995 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
996 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
997 
998 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
999 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1000 
1001 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1002  */
1003 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
1004 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1005 {                                                              \
1006     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1007     for (i = 0; i < opr_sz; ) {                                \
1008         TYPEW mm = *(TYPEW *)(vm + i);                         \
1009         do {                                                   \
1010             TYPE nn = *(TYPE *)(vn + H(i));                    \
1011             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
1012             i += sizeof(TYPE);                                 \
1013         } while (i & 7);                                       \
1014     }                                                          \
1015 }
1016 
1017 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1018 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1019 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1020 
1021 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1022 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1023 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1024 
1025 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1026 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1027 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1028 
1029 #undef DO_ZZW
1030 
1031 #undef DO_CLS_B
1032 #undef DO_CLS_H
1033 #undef DO_CLZ_B
1034 #undef DO_CLZ_H
1035 #undef DO_CNOT
1036 #undef DO_FABS
1037 #undef DO_FNEG
1038 #undef DO_ABS
1039 #undef DO_NEG
1040 #undef DO_ZPZ
1041 #undef DO_ZPZ_D
1042 
1043 /*
1044  * Three-operand expander, unpredicated, in which the two inputs are
1045  * selected from the top or bottom half of the wide column.
1046  */
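/*
 * The two desc bits select, independently for N and M, whether the even
 * (bottom) or odd (top) numbered narrow element of each wide column is
 * used, which is how the B/T forms of the SVE2 widening ops are expanded.
 */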
1047 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1048 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1049 {                                                                       \
1050     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1051     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1052     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1053     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1054         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1055         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1056         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1057     }                                                                   \
1058 }
1059 
1060 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1061 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1062 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1063 
1064 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1065 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1066 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1067 
1068 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1069 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1070 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1071 
1072 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1073 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1074 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1075 
1076 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1077 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1078 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1079 
1080 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1081 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1082 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1083 
1084 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1085 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1086 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1087 
1088 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1089 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1090 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1091 
1092 /* Note that the multiply cannot overflow, but the doubling can. */
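/*
 * E.g. for the halfword form: INT8_MIN * INT8_MIN == 16384 fits in int16_t,
 * but doubling it to 32768 does not, so DO_SQADD_H saturates it to 32767.
 */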
1093 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1094 {
1095     int16_t val = n * m;
1096     return DO_SQADD_H(val, val);
1097 }
1098 
1099 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1100 {
1101     int32_t val = n * m;
1102     return DO_SQADD_S(val, val);
1103 }
1104 
1105 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1106 {
1107     int64_t val = n * m;
1108     return do_sqadd_d(val, val);
1109 }
1110 
1111 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1112 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1113 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1114 
1115 #undef DO_ZZZ_TB
1116 
1117 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1118 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1119 {                                                              \
1120     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1121     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1122     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1123         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1124         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1125         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1126     }                                                          \
1127 }
1128 
1129 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1130 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1131 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1132 
1133 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1134 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1135 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1136 
1137 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1138 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1139 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1140 
1141 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1142 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1143 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1144 
1145 #undef DO_ZZZ_WTB
1146 
1147 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1148 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1149 {                                                                       \
1150     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1151     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1152     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1153     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1154         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1155         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1156         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1157     }                                                                   \
1158 }
1159 
1160 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1161 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1162 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1163 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1164 
1165 #undef DO_ZZZ_NTB
1166 
1167 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1168 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1169 {                                                               \
1170     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1171     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1172     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1173         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1174         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1175         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1176         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1177     }                                                           \
1178 }
1179 
1180 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1181 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1182 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1183 
1184 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1185 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1186 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1187 
1188 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1189 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1190 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1191 
1192 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1193 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1194 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1195 
1196 #define DO_NMUL(N, M)  -(N * M)
1197 
1198 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1200 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1201 
1202 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1203 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1204 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1205 
1206 #undef DO_ZZZW_ACC
1207 
1208 #define DO_XTNB(NAME, TYPE, OP) \
1209 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1210 {                                                            \
1211     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1212     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1213         TYPE nn = *(TYPE *)(vn + i);                         \
1214         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1215         *(TYPE *)(vd + i) = nn;                              \
1216     }                                                        \
1217 }
1218 
1219 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1220 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1221 {                                                                       \
1222     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1223     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1224         TYPE nn = *(TYPE *)(vn + i);                                    \
1225         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1226     }                                                                   \
1227 }
1228 
1229 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1230 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1231 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1232 
1233 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1234 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1235 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1236 
1237 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1238 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1239 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1240 
1241 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1242 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1243 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1244 
1245 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1246 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1247 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1248 
1249 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1250 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1251 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1252 
1253 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1254 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1255 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1256 
1257 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1258 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1259 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1260 
1261 #undef DO_XTNB
1262 #undef DO_XTNT
1263 
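/*
 * Add-with-carry long (ADCLB/ADCLT, and SBCLB/SBCLT via the inverted
 * operand): the carry in is taken from bit 32 of each 64-bit M element
 * (bit 64 of each 128-bit pair for the _d form), and each result holds the
 * full 33-bit (resp. 65-bit) sum, so the carry out lands in the top half.
 */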
1264 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1265 {
1266     intptr_t i, opr_sz = simd_oprsz(desc);
1267     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1268     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1269     uint32_t *a = va, *n = vn;
1270     uint64_t *d = vd, *m = vm;
1271 
1272     for (i = 0; i < opr_sz / 8; ++i) {
1273         uint32_t e1 = a[2 * i + H4(0)];
1274         uint32_t e2 = n[2 * i + sel] ^ inv;
1275         uint64_t c = extract64(m[i], 32, 1);
1276         /* Compute and store the entire 33-bit result at once. */
1277         d[i] = c + e1 + e2;
1278     }
1279 }
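/*
 * Since d[] is a uint64_t view while e1, e2 and c are 32-bit values,
 * the addition above cannot overflow: bits [31:0] of d[i] hold the
 * ADCLB/ADCLT result and bit 32 holds the carry out, which the next
 * instruction in a carry chain reads back via extract64(m[i], 32, 1).
 */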
1280 
1281 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1282 {
1283     intptr_t i, opr_sz = simd_oprsz(desc);
1284     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1285     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1286     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1287 
1288     for (i = 0; i < opr_sz / 8; i += 2) {
1289         Int128 e1 = int128_make64(a[i]);
1290         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1291         Int128 c = int128_make64(m[i + 1] & 1);
1292         Int128 r = int128_add(int128_add(e1, e2), c);
1293         d[i + 0] = int128_getlo(r);
1294         d[i + 1] = int128_gethi(r);
1295     }
1296 }
1297 
1298 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1299 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1300 {                                                                       \
1301     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1302     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1303     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1304     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1305         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1306         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1307         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1308         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1309     }                                                                   \
1310 }
1311 
1312 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1313            do_sqdmull_h, DO_SQADD_H)
1314 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1315            do_sqdmull_s, DO_SQADD_S)
1316 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1317            do_sqdmull_d, do_sqadd_d)
1318 
1319 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1320            do_sqdmull_h, DO_SQSUB_H)
1321 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1322            do_sqdmull_s, DO_SQSUB_S)
1323 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1324            do_sqdmull_d, do_sqsub_d)
1325 
1326 #undef DO_SQDMLAL
1327 
1328 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1329 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1330 {                                                               \
1331     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1332     int rot = simd_data(desc);                                  \
1333     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1334     bool sub_r = rot == 1 || rot == 2;                          \
1335     bool sub_i = rot >= 2;                                      \
1336     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1337     for (i = 0; i < opr_sz; i += 2) {                           \
1338         TYPE elt1_a = n[H(i + sel_a)];                          \
1339         TYPE elt2_a = m[H(i + sel_a)];                          \
1340         TYPE elt2_b = m[H(i + sel_b)];                          \
1341         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1342         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1343     }                                                           \
1344 }
1345 
1346 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
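/*
 * In DO_CMLA_FUNC, the low two bits of simd_data encode the rotation
 * in multiples of 90 degrees: sel_a picks the real or imaginary half
 * of the multiplicand, and sub_r/sub_i say whether the product is
 * subtracted from or added to the real and imaginary accumulators.
 * DO_CMLA itself is the plain, non-saturating multiply-accumulate.
 */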
1347 
1348 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1349 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1350 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1351 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1352 
1353 #define DO_SQRDMLAH_B(N, M, A, S) \
1354     do_sqrdmlah_b(N, M, A, S, true)
1355 #define DO_SQRDMLAH_H(N, M, A, S) \
1356     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1357 #define DO_SQRDMLAH_S(N, M, A, S) \
1358     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1359 #define DO_SQRDMLAH_D(N, M, A, S) \
1360     do_sqrdmlah_d(N, M, A, S, true)
1361 
1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1363 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1364 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1365 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1366 
1367 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1368 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1369 {                                                                           \
1370     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1371     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1372     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1373     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1374     bool sub_r = rot == 1 || rot == 2;                                      \
1375     bool sub_i = rot >= 2;                                                  \
1376     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1377     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1378         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1379         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1380         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1381             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1382             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1383             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1384         }                                                                   \
1385     }                                                                       \
1386 }
1387 
1388 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1389 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1390 
1391 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1392 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1393 
1394 #undef DO_CMLA
1395 #undef DO_CMLA_FUNC
1396 #undef DO_CMLA_IDX_FUNC
1397 #undef DO_SQRDMLAH_B
1398 #undef DO_SQRDMLAH_H
1399 #undef DO_SQRDMLAH_S
1400 #undef DO_SQRDMLAH_D
1401 
1402 /* Note N and M are 4 elements bundled into one unit. */
1403 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1404                          int sel_a, int sel_b, int sub_i)
1405 {
1406     for (int i = 0; i <= 1; i++) {
1407         int32_t elt1_r = (int8_t)(n >> (16 * i));
1408         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1409         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1410         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1411 
1412         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1413     }
1414     return a;
1415 }
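/*
 * Each 32-bit unit of N and M holds two complex numbers as interleaved
 * real/imaginary bytes.  sel_a/sel_b choose which component of M pairs
 * with the real and imaginary parts of N, and sub_i is +1 or -1, so a
 * single expression covers all four rotations of the complex dot
 * product.  do_cdot_d below does the same with halfword components.
 */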
1416 
1417 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1418                          int sel_a, int sel_b, int sub_i)
1419 {
1420     for (int i = 0; i <= 1; i++) {
1421         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1422         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1423         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1424         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1425 
1426         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1427     }
1428     return a;
1429 }
1430 
1431 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1432                               void *va, uint32_t desc)
1433 {
1434     int opr_sz = simd_oprsz(desc);
1435     int rot = simd_data(desc);
1436     int sel_a = rot & 1;
1437     int sel_b = sel_a ^ 1;
1438     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1439     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1440 
1441     for (int e = 0; e < opr_sz / 4; e++) {
1442         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1443     }
1444 }
1445 
1446 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1447                               void *va, uint32_t desc)
1448 {
1449     int opr_sz = simd_oprsz(desc);
1450     int rot = simd_data(desc);
1451     int sel_a = rot & 1;
1452     int sel_b = sel_a ^ 1;
1453     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1454     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1455 
1456     for (int e = 0; e < opr_sz / 8; e++) {
1457         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1458     }
1459 }
1460 
1461 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1462                              void *va, uint32_t desc)
1463 {
1464     int opr_sz = simd_oprsz(desc);
1465     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1466     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1467     int sel_a = rot & 1;
1468     int sel_b = sel_a ^ 1;
1469     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1470     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1471 
1472     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1473         uint32_t seg_m = m[seg + idx];
1474         for (int e = 0; e < 4; e++) {
1475             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1476                                    sel_a, sel_b, sub_i);
1477         }
1478     }
1479 }
1480 
1481 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1482                              void *va, uint32_t desc)
1483 {
1484     int seg, opr_sz = simd_oprsz(desc);
1485     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1486     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1487     int sel_a = rot & 1;
1488     int sel_b = sel_a ^ 1;
1489     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1490     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1491 
1492     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1493         uint64_t seg_m = m[seg + idx];
1494         for (int e = 0; e < 2; e++) {
1495             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1496                                    sel_a, sel_b, sub_i);
1497         }
1498     }
1499 }
1500 
1501 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1502 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1503 {                                                                       \
1504     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1505     intptr_t i, j, idx = simd_data(desc);                               \
1506     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1507     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1508         TYPE mm = m[i];                                                 \
1509         for (j = 0; j < segment; j++) {                                 \
1510             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1511         }                                                               \
1512     }                                                                   \
1513 }
1514 
1515 #define DO_SQRDMLAH_H(N, M, A) \
1516     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1517 #define DO_SQRDMLAH_S(N, M, A) \
1518     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1519 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1520 
1521 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1522 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1523 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1524 
1525 #define DO_SQRDMLSH_H(N, M, A) \
1526     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1527 #define DO_SQRDMLSH_S(N, M, A) \
1528     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1529 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1530 
1531 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1532 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1533 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1534 
1535 #undef DO_ZZXZ
1536 
1537 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1538 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1539 {                                                                         \
1540     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1541     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1542     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1543     for (i = 0; i < oprsz; i += 16) {                                     \
1544         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1545         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1546             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1547             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1548             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1549         }                                                                 \
1550     }                                                                     \
1551 }
1552 
1553 #define DO_MLA(N, M, A)  (A + N * M)
1554 
1555 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1556 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1557 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1558 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1559 
1560 #define DO_MLS(N, M, A)  (A - N * M)
1561 
1562 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1563 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1564 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1565 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1566 
1567 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1568 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1569 
1570 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1571 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1572 
1573 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1574 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1575 
1576 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1577 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1578 
1579 #undef DO_MLA
1580 #undef DO_MLS
1581 #undef DO_ZZXW
1582 
1583 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1584 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1585 {                                                                         \
1586     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1587     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1588     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1589     for (i = 0; i < oprsz; i += 16) {                                     \
1590         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1591         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1592             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1593             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1594         }                                                                 \
1595     }                                                                     \
1596 }
1597 
1598 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1599 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1600 
1601 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1602 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1603 
1604 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1605 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1606 
1607 #undef DO_ZZX
1608 
1609 #define DO_BITPERM(NAME, TYPE, OP) \
1610 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1611 {                                                              \
1612     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1613     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1614         TYPE nn = *(TYPE *)(vn + i);                           \
1615         TYPE mm = *(TYPE *)(vm + i);                           \
1616         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1617     }                                                          \
1618 }
1619 
1620 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1621 {
1622     uint64_t res = 0;
1623     int db, rb = 0;
1624 
1625     for (db = 0; db < n; ++db) {
1626         if ((mask >> db) & 1) {
1627             res |= ((data >> db) & 1) << rb;
1628             ++rb;
1629         }
1630     }
1631     return res;
1632 }
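/*
 * Example: bitextract(0b10110010, 0b01100110, 8) gathers the data bits
 * where the mask is set (bits 1, 2, 5, 6, with values 1, 0, 1, 0) and
 * packs them into the low bits of the result, giving 0b0101.
 */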
1633 
1634 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1635 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1636 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1637 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1638 
1639 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1640 {
1641     uint64_t res = 0;
1642     int rb, db = 0;
1643 
1644     for (rb = 0; rb < n; ++rb) {
1645         if ((mask >> rb) & 1) {
1646             res |= ((data >> db) & 1) << rb;
1647             ++db;
1648         }
1649     }
1650     return res;
1651 }
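/*
 * Example: bitdeposit(0b0101, 0b01100110, 8) scatters the low data
 * bits into the mask's set positions: bit 1 <- 1, bit 2 <- 0,
 * bit 5 <- 1, bit 6 <- 0, giving 0b00100010.
 */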
1652 
1653 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1654 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1655 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1656 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1657 
1658 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1659 {
1660     uint64_t resm = 0, resu = 0;
1661     int db, rbm = 0, rbu = 0;
1662 
1663     for (db = 0; db < n; ++db) {
1664         uint64_t val = (data >> db) & 1;
1665         if ((mask >> db) & 1) {
1666             resm |= val << rbm++;
1667         } else {
1668             resu |= val << rbu++;
1669         }
1670     }
1671 
1672     return resm | (resu << rbm);
1673 }
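/*
 * bitgroup packs the bits selected by the mask at the bottom of the
 * result and the remaining bits directly above them, preserving order
 * within each group.  Example: bitgroup(0b10110010, 0b01100110, 8)
 * gives 0b11000101 (selected bits 0101 below, unselected 1100 above).
 */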
1674 
1675 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1676 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1677 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1678 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1679 
1680 #undef DO_BITPERM
1681 
1682 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1683 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1684 {                                                               \
1685     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1686     int sub_r = simd_data(desc);                                \
1687     if (sub_r) {                                                \
1688         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1689             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1690             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1691             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1692             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1693             acc_r = ADD_OP(acc_r, el2_i);                       \
1694             acc_i = SUB_OP(acc_i, el2_r);                       \
1695             *(TYPE *)(vd + H(i)) = acc_r;                       \
1696             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1697         }                                                       \
1698     } else {                                                    \
1699         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1700             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1701             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1702             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1703             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1704             acc_r = SUB_OP(acc_r, el2_i);                       \
1705             acc_i = ADD_OP(acc_i, el2_r);                       \
1706             *(TYPE *)(vd + H(i)) = acc_r;                       \
1707             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1708         }                                                       \
1709     }                                                           \
1710 }
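/*
 * The two branches implement the two complex-add rotations: depending
 * on simd_data, the rotated operand contributes +imag/-real or
 * -imag/+real to the real and imaginary accumulators respectively.
 */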
1711 
1712 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1713 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1714 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1715 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1716 
1717 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1718 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1719 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1720 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1721 
1722 #undef DO_CADD
1723 
1724 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1725 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1726 {                                                              \
1727     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1728     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1729     int shift = simd_data(desc) >> 1;                          \
1730     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1731         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1732         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1733     }                                                          \
1734 }
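/*
 * Bit 0 of simd_data selects the bottom or top narrow elements of Zn
 * (the ...B vs ...T forms); the remaining bits are the left-shift
 * amount applied after widening.
 */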
1735 
1736 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1737 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1738 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1739 
1740 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1741 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1742 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1743 
1744 #undef DO_ZZI_SHLL
1745 
1746 /* Two-operand reduction expander, controlled by a predicate.
1747  * The difference between TYPERED and TYPERET has to do with
1748  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1749  * but TYPERET must be unsigned so that e.g. a 32-bit value
1750  * is not sign-extended to the ABI uint64_t return type.
1751  */
1752 /* ??? If we were to vectorize this by hand the reduction ordering
1753  * would change.  For integer operands, this is perfectly fine.
1754  */
1755 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1756 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1757 {                                                          \
1758     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1759     TYPERED ret = INIT;                                    \
1760     for (i = 0; i < opr_sz; ) {                            \
1761         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1762         do {                                               \
1763             if (pg & 1) {                                  \
1764                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1765                 ret = OP(ret, nn);                         \
1766             }                                              \
1767             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1768         } while (i & 15);                                  \
1769     }                                                      \
1770     return (TYPERET)ret;                                   \
1771 }
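/*
 * The predicate is read 16 bits at a time (one bit per vector byte);
 * i advances and pg is shifted right by sizeof(TYPEELT) each step, so
 * bit 0 of pg is always the governing bit of the current element.
 */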
1772 
1773 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1774 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1775 {                                                          \
1776     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1777     TYPEE *n = vn;                                         \
1778     uint8_t *pg = vg;                                      \
1779     TYPER ret = INIT;                                      \
1780     for (i = 0; i < opr_sz; i += 1) {                      \
1781         if (pg[H1(i)] & 1) {                               \
1782             TYPEE nn = n[i];                               \
1783             ret = OP(ret, nn);                             \
1784         }                                                  \
1785     }                                                      \
1786     return ret;                                            \
1787 }
1788 
1789 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1790 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1791 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1792 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1793 
1794 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1795 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1796 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1797 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1798 
1799 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1800 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1801 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1802 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1803 
1804 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1805 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1806 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1807 
1808 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1809 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1810 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1811 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1812 
1813 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1814 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1815 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1816 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1817 
1818 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1819 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1820 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1821 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1822 
1823 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1824 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1825 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1826 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1827 
1828 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1829 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1830 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1831 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1832 
1833 #undef DO_VPZ
1834 #undef DO_VPZ_D
1835 
1836 /* Two vector operand, one scalar operand, unpredicated.  */
1837 #define DO_ZZI(NAME, TYPE, OP)                                       \
1838 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1839 {                                                                    \
1840     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1841     TYPE s = s64, *d = vd, *n = vn;                                  \
1842     for (i = 0; i < opr_sz; ++i) {                                   \
1843         d[i] = OP(n[i], s);                                          \
1844     }                                                                \
1845 }
1846 
1847 #define DO_SUBR(X, Y)   (Y - X)
1848 
1849 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1850 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1851 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1852 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1853 
1854 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1855 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1856 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1857 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1858 
1859 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1860 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1861 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1862 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1863 
1864 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1865 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1866 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1867 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1868 
1869 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1870 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1871 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1872 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1873 
1874 #undef DO_ZZI
1875 
1876 #undef DO_AND
1877 #undef DO_ORR
1878 #undef DO_EOR
1879 #undef DO_BIC
1880 #undef DO_ADD
1881 #undef DO_SUB
1882 #undef DO_MAX
1883 #undef DO_MIN
1884 #undef DO_ABD
1885 #undef DO_MUL
1886 #undef DO_DIV
1887 #undef DO_ASR
1888 #undef DO_LSR
1889 #undef DO_LSL
1890 #undef DO_SUBR
1891 
1892 /* Similar to the ARM LastActiveElement pseudocode function, except the
1893    result is multiplied by the element size.  This includes the not found
1894    indication; e.g. not found for esz=3 is -8.  */
1895 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1896 {
1897     uint64_t mask = pred_esz_masks[esz];
1898     intptr_t i = words;
1899 
1900     do {
1901         uint64_t this_g = g[--i] & mask;
1902         if (this_g) {
1903             return i * 64 + (63 - clz64(this_g));
1904         }
1905     } while (i > 0);
1906     return (intptr_t)-1 << esz;
1907 }
1908 
1909 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1910 {
1911     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1912     uint32_t flags = PREDTEST_INIT;
1913     uint64_t *d = vd, *g = vg;
1914     intptr_t i = 0;
1915 
1916     do {
1917         uint64_t this_d = d[i];
1918         uint64_t this_g = g[i];
1919 
1920         if (this_g) {
1921             if (!(flags & 4)) {
1922                 /* Set in D the first bit of G.  */
1923                 this_d |= this_g & -this_g;
1924                 d[i] = this_d;
1925             }
1926             flags = iter_predtest_fwd(this_d, this_g, flags);
1927         }
1928     } while (++i < words);
1929 
1930     return flags;
1931 }
1932 
1933 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1934 {
1935     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1936     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1937     uint32_t flags = PREDTEST_INIT;
1938     uint64_t *d = vd, *g = vg, esz_mask;
1939     intptr_t i, next;
1940 
1941     next = last_active_element(vd, words, esz) + (1 << esz);
1942     esz_mask = pred_esz_masks[esz];
1943 
1944     /* Similar to the pseudocode for pnext, but scaled by ESZ
1945        so that we find the correct bit.  */
1946     if (next < words * 64) {
1947         uint64_t mask = -1;
1948 
1949         if (next & 63) {
1950             mask = ~((1ull << (next & 63)) - 1);
1951             next &= -64;
1952         }
1953         do {
1954             uint64_t this_g = g[next / 64] & esz_mask & mask;
1955             if (this_g != 0) {
1956                 next = (next & -64) + ctz64(this_g);
1957                 break;
1958             }
1959             next += 64;
1960             mask = -1;
1961         } while (next < words * 64);
1962     }
1963 
1964     i = 0;
1965     do {
1966         uint64_t this_d = 0;
1967         if (i == next / 64) {
1968             this_d = 1ull << (next & 63);
1969         }
1970         d[i] = this_d;
1971         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1972     } while (++i < words);
1973 
1974     return flags;
1975 }
1976 
1977 /*
1978  * Copy Zn into Zd, and store zero into inactive elements.
1979  * If inv, store zeros into the active elements.
1980  */
1981 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1982 {
1983     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985     uint64_t *d = vd, *n = vn;
1986     uint8_t *pg = vg;
1987 
1988     for (i = 0; i < opr_sz; i += 1) {
1989         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1990     }
1991 }
1992 
1993 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1994 {
1995     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1997     uint64_t *d = vd, *n = vn;
1998     uint8_t *pg = vg;
1999 
2000     for (i = 0; i < opr_sz; i += 1) {
2001         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2002     }
2003 }
2004 
2005 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2006 {
2007     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2008     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2009     uint64_t *d = vd, *n = vn;
2010     uint8_t *pg = vg;
2011 
2012     for (i = 0; i < opr_sz; i += 1) {
2013         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2014     }
2015 }
2016 
2017 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2018 {
2019     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2020     uint64_t *d = vd, *n = vn;
2021     uint8_t *pg = vg;
2022     uint8_t inv = simd_data(desc);
2023 
2024     for (i = 0; i < opr_sz; i += 1) {
2025         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2026     }
2027 }
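/*
 * The expand_pred_[bhs] helpers turn each relevant predicate bit into
 * an all-ones or all-zeros lane of a 64-bit mask, so ANDing Zn with
 * (mask ^ inv) keeps either the active or the inactive elements.
 */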
2028 
2029 /* Three-operand expander, immediate operand, controlled by a predicate.
2030  */
2031 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2032 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2033 {                                                               \
2034     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2035     TYPE imm = simd_data(desc);                                 \
2036     for (i = 0; i < opr_sz; ) {                                 \
2037         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2038         do {                                                    \
2039             if (pg & 1) {                                       \
2040                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2041                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2042             }                                                   \
2043             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2044         } while (i & 15);                                       \
2045     }                                                           \
2046 }
2047 
2048 /* Similarly, specialized for 64-bit operands.  */
2049 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2050 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2051 {                                                               \
2052     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2053     TYPE *d = vd, *n = vn;                                      \
2054     TYPE imm = simd_data(desc);                                 \
2055     uint8_t *pg = vg;                                           \
2056     for (i = 0; i < opr_sz; i += 1) {                           \
2057         if (pg[H1(i)] & 1) {                                    \
2058             TYPE nn = n[i];                                     \
2059             d[i] = OP(nn, imm);                                 \
2060         }                                                       \
2061     }                                                           \
2062 }
2063 
2064 #define DO_SHR(N, M)  (N >> M)
2065 #define DO_SHL(N, M)  (N << M)
2066 
2067 /* Arithmetic shift right for division.  This rounds negative numbers
2068    toward zero as per signed division.  Therefore before shifting,
2069    when N is negative, add 2**M-1.  */
2070 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
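/* Worked example: DO_ASRD(-7, 1) = (-7 + 1) >> 1 = -3, matching -7 / 2. */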
2071 
2072 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2073 {
2074     if (likely(sh < 64)) {
2075         return (x >> sh) + ((x >> (sh - 1)) & 1);
2076     } else if (sh == 64) {
2077         return x >> 63;
2078     } else {
2079         return 0;
2080     }
2081 }
2082 
2083 static inline int64_t do_srshr(int64_t x, unsigned sh)
2084 {
2085     if (likely(sh < 64)) {
2086         return (x >> sh) + ((x >> (sh - 1)) & 1);
2087     } else {
2088         /* Rounding the sign bit always produces 0. */
2089         return 0;
2090     }
2091 }
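/*
 * Rounding examples: do_urshr(7, 2) = (7 >> 2) + ((7 >> 1) & 1) = 2,
 * i.e. 7/4 rounded to nearest; do_srshr(-7, 2) = -2 + 0 = -2.
 */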
2092 
2093 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2094 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2095 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2096 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2097 
2098 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2099 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2100 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2101 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2102 
2103 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2104 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2105 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2106 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2107 
2108 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2109 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2110 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2111 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2112 
2113 /* SVE2 bitwise shift by immediate */
2114 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2115 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2116 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2117 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2118 
2119 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2120 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2121 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2122 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2123 
2124 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2125 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2126 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2127 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2128 
2129 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2130 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2131 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2132 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2133 
2134 #define do_suqrshl_b(n, m) \
2135    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2136 #define do_suqrshl_h(n, m) \
2137    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2138 #define do_suqrshl_s(n, m) \
2139    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2140 #define do_suqrshl_d(n, m) \
2141    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2142 
2143 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2144 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2145 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2146 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2147 
2148 #undef DO_ASRD
2149 #undef DO_ZPZI
2150 #undef DO_ZPZI_D
2151 
2152 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2153 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2154 {                                                            \
2155     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2156     int shift = simd_data(desc);                             \
2157     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2158         TYPEW nn = *(TYPEW *)(vn + i);                       \
2159         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2160     }                                                        \
2161 }
2162 
2163 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2165 {                                                                 \
2166     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2167     int shift = simd_data(desc);                                  \
2168     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2169         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2170         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2171     }                                                             \
2172 }
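/*
 * As with the XTN* helpers above, the ...NB forms leave the narrowed
 * value zero-extended within the wide lane, while the ...NT forms
 * merge it into the upper half of each wide element of Zd.
 */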
2173 
2174 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2175 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2176 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2177 
2178 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2179 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2180 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2181 
2182 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2183 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2184 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2185 
2186 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2187 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2188 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2189 
2190 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2191 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2192 #define DO_SQSHRUN_D(x, sh) \
2193     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2194 
2195 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2196 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2197 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2198 
2199 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2200 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2201 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2202 
2203 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2204 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2205 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2206 
2207 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2208 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2209 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2210 
2211 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2212 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2213 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2214 
2215 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2216 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2217 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2218 
2219 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2220 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2221 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2222 
2223 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2224 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2225 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2226 
2227 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2228 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2229 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2230 
2231 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2232 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2233 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2234 
2235 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2236 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2237 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2238 
2239 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2240 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2241 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2242 
2243 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2244 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2245 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2246 
2247 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2248 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2249 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2250 
2251 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2252 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2253 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2254 
2255 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2256 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2257 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2258 
2259 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2260 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2261 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2262 
2263 #undef DO_SHRNB
2264 #undef DO_SHRNT
2265 
2266 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2267 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2268 {                                                                           \
2269     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2270     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2271         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2272         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2273         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2274     }                                                                       \
2275 }
2276 
2277 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2278 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2279 {                                                                           \
2280     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2281     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2282         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2283         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2284         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2285     }                                                                       \
2286 }
2287 
2288 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2289 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2290 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2291 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
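/*
 * Example, halfword forms (SHIFT = 8): DO_ADDHN(0x1234, 0x0101, 8)
 * = 0x13, while DO_RADDHN(0x1234, 0x01cc, 8) = (0x1400 + 0x80) >> 8
 * = 0x14, i.e. the high half rounded to nearest.
 */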
2292 
2293 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2294 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2295 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2296 
2297 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2298 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2299 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2300 
2301 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2302 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2303 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2304 
2305 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2306 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2307 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2308 
2309 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2310 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2311 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2312 
2313 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2314 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2315 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2316 
2317 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2318 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2319 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2320 
2321 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2322 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2323 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2324 
2325 #undef DO_RSUBHN
2326 #undef DO_SUBHN
2327 #undef DO_RADDHN
2328 #undef DO_ADDHN
2329 
2330 #undef DO_BINOPNB
2331 
2332 /* Fully general four-operand expander, controlled by a predicate.
2333  */
2334 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2335 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2336                   void *vg, uint32_t desc)                    \
2337 {                                                             \
2338     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2339     for (i = 0; i < opr_sz; ) {                               \
2340         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2341         do {                                                  \
2342             if (pg & 1) {                                     \
2343                 TYPE nn = *(TYPE *)(vn + H(i));               \
2344                 TYPE mm = *(TYPE *)(vm + H(i));               \
2345                 TYPE aa = *(TYPE *)(va + H(i));               \
2346                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2347             }                                                 \
2348             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2349         } while (i & 15);                                     \
2350     }                                                         \
2351 }
2352 
2353 /* Similarly, specialized for 64-bit operands.  */
2354 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2355 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2356                   void *vg, uint32_t desc)                    \
2357 {                                                             \
2358     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2359     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2360     uint8_t *pg = vg;                                         \
2361     for (i = 0; i < opr_sz; i += 1) {                         \
2362         if (pg[H1(i)] & 1) {                                  \
2363             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2364             d[i] = OP(aa, nn, mm);                            \
2365         }                                                     \
2366     }                                                         \
2367 }
2368 
2369 #define DO_MLA(A, N, M)  (A + N * M)
2370 #define DO_MLS(A, N, M)  (A - N * M)
2371 
2372 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2373 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2374 
2375 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2376 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2377 
2378 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2379 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2380 
2381 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2382 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2383 
2384 #undef DO_MLA
2385 #undef DO_MLS
2386 #undef DO_ZPZZZ
2387 #undef DO_ZPZZZ_D
2388 
2389 void HELPER(sve_index_b)(void *vd, uint32_t start,
2390                          uint32_t incr, uint32_t desc)
2391 {
2392     intptr_t i, opr_sz = simd_oprsz(desc);
2393     uint8_t *d = vd;
2394     for (i = 0; i < opr_sz; i += 1) {
2395         d[H1(i)] = start + i * incr;
2396     }
2397 }
2398 
2399 void HELPER(sve_index_h)(void *vd, uint32_t start,
2400                          uint32_t incr, uint32_t desc)
2401 {
2402     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2403     uint16_t *d = vd;
2404     for (i = 0; i < opr_sz; i += 1) {
2405         d[H2(i)] = start + i * incr;
2406     }
2407 }
2408 
2409 void HELPER(sve_index_s)(void *vd, uint32_t start,
2410                          uint32_t incr, uint32_t desc)
2411 {
2412     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2413     uint32_t *d = vd;
2414     for (i = 0; i < opr_sz; i += 1) {
2415         d[H4(i)] = start + i * incr;
2416     }
2417 }
2418 
2419 void HELPER(sve_index_d)(void *vd, uint64_t start,
2420                          uint64_t incr, uint32_t desc)
2421 {
2422     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2423     uint64_t *d = vd;
2424     for (i = 0; i < opr_sz; i += 1) {
2425         d[i] = start + i * incr;
2426     }
2427 }
2428 
2429 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2430 {
2431     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2432     uint32_t sh = simd_data(desc);
2433     uint32_t *d = vd, *n = vn, *m = vm;
2434     for (i = 0; i < opr_sz; i += 1) {
2435         d[i] = n[i] + (m[i] << sh);
2436     }
2437 }
2438 
2439 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2440 {
2441     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2442     uint64_t sh = simd_data(desc);
2443     uint64_t *d = vd, *n = vn, *m = vm;
2444     for (i = 0; i < opr_sz; i += 1) {
2445         d[i] = n[i] + (m[i] << sh);
2446     }
2447 }
2448 
2449 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2450 {
2451     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2452     uint64_t sh = simd_data(desc);
2453     uint64_t *d = vd, *n = vn, *m = vm;
2454     for (i = 0; i < opr_sz; i += 1) {
2455         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2456     }
2457 }
2458 
2459 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2460 {
2461     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2462     uint64_t sh = simd_data(desc);
2463     uint64_t *d = vd, *n = vn, *m = vm;
2464     for (i = 0; i < opr_sz; i += 1) {
2465         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2466     }
2467 }
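/*
 * The four ADR helpers compute Zd = Zn + (Zm << sh): p32/p64 operate
 * on packed 32-bit and 64-bit elements, while s32/u32 sign- or
 * zero-extend a 32-bit index to 64 bits before shifting.
 */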
2468 
2469 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2470 {
2471     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2472     static const uint16_t coeff[] = {
2473         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2474         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2475         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2476         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2477     };
2478     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2479     uint16_t *d = vd, *n = vn;
2480 
2481     for (i = 0; i < opr_sz; i++) {
2482         uint16_t nn = n[i];
2483         intptr_t idx = extract32(nn, 0, 5);
2484         uint16_t exp = extract32(nn, 5, 5);
2485         d[i] = coeff[idx] | (exp << 10);
2486     }
2487 }
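/*
 * The table entries match the fraction bits of 2^(idx/32) here, and of
 * 2^(idx/64) in the single- and double-precision tables below, so
 * FEXPA assembles an approximation of 2^x from a table-driven mantissa
 * and the exponent field taken from the input element.
 */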
2488 
2489 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2490 {
2491     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2492     static const uint32_t coeff[] = {
2493         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2494         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2495         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2496         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2497         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2498         0x1ef532, 0x20b051, 0x227043, 0x243516,
2499         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2500         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2501         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2502         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2503         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2504         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2505         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2506         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2507         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2508         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2509     };
2510     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2511     uint32_t *d = vd, *n = vn;
2512 
2513     for (i = 0; i < opr_sz; i++) {
2514         uint32_t nn = n[i];
2515         intptr_t idx = extract32(nn, 0, 6);
2516         uint32_t exp = extract32(nn, 6, 8);
2517         d[i] = coeff[idx] | (exp << 23);
2518     }
2519 }
2520 
2521 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2522 {
2523     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2524     static const uint64_t coeff[] = {
2525         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2526         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2527         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2528         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2529         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2530         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2531         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2532         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2533         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2534         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2535         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2536         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2537         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2538         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2539         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2540         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2541         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2542         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2543         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2544         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2545         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2546         0xFA7C1819E90D8ull,
2547     };
2548     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2549     uint64_t *d = vd, *n = vn;
2550 
2551     for (i = 0; i < opr_sz; i++) {
2552         uint64_t nn = n[i];
2553         intptr_t idx = extract32(nn, 0, 6);
2554         uint64_t exp = extract32(nn, 6, 11);
2555         d[i] = coeff[idx] | (exp << 52);
2556     }
2557 }
2558 
2559 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2560 {
2561     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2562     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2563     uint16_t *d = vd, *n = vn, *m = vm;
2564     for (i = 0; i < opr_sz; i += 1) {
2565         uint16_t nn = n[i];
2566         uint16_t mm = m[i];
2567         if (mm & 1) {
2568             nn = float16_one;
2569         }
2570         if (mm & 2) {
2571             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2572         }
2573         d[i] = nn;
2574     }
2575 }
2576 
2577 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2578 {
2579     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2580     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2581     uint32_t *d = vd, *n = vn, *m = vm;
2582     for (i = 0; i < opr_sz; i += 1) {
2583         uint32_t nn = n[i];
2584         uint32_t mm = m[i];
2585         if (mm & 1) {
2586             nn = float32_one;
2587         }
2588         if (mm & 2) {
2589             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2590         }
2591         d[i] = nn;
2592     }
2593 }
2594 
2595 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2596 {
2597     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2598     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2599     uint64_t *d = vd, *n = vn, *m = vm;
2600     for (i = 0; i < opr_sz; i += 1) {
2601         uint64_t nn = n[i];
2602         uint64_t mm = m[i];
2603         if (mm & 1) {
2604             nn = float64_one;
2605         }
2606         if (mm & 2) {
2607             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2608         }
2609         d[i] = nn;
2610     }
2611 }
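
/*
 * In all three widths the low two bits of M act as control bits:
 * bit 0 replaces the element with 1.0 and bit 1 negates it (using the
 * FPCR.AH flavour of negation when requested).  For example, with
 * mm = 3 the result is -1.0 regardless of the value of nn.
 */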
2612 
2613 /*
2614  * Signed saturating addition with scalar operand.
2615  */
2616 
2617 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2618 {
2619     intptr_t i, oprsz = simd_oprsz(desc);
2620 
2621     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2622         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2623     }
2624 }
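
/*
 * A sketch of the saturation behaviour, assuming DO_SQADD_B clamps to
 * the int8_t range as its name suggests: with b = 100 and a[i] = 100
 * the mathematical sum 200 exceeds INT8_MAX, so the stored result
 * would be 127 rather than wrapping to -56.
 */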
2625 
2626 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2627 {
2628     intptr_t i, oprsz = simd_oprsz(desc);
2629 
2630     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2631         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2632     }
2633 }
2634 
2635 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2636 {
2637     intptr_t i, oprsz = simd_oprsz(desc);
2638 
2639     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2640         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2641     }
2642 }
2643 
2644 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2645 {
2646     intptr_t i, oprsz = simd_oprsz(desc);
2647 
2648     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2649         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2650     }
2651 }
2652 
2653 /*
2654  * Unsigned saturating addition with scalar operand.
2655  */
2656 
2657 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2658 {
2659     intptr_t i, oprsz = simd_oprsz(desc);
2660 
2661     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2662         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2663     }
2664 }
2665 
2666 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2667 {
2668     intptr_t i, oprsz = simd_oprsz(desc);
2669 
2670     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2671         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2672     }
2673 }
2674 
2675 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2676 {
2677     intptr_t i, oprsz = simd_oprsz(desc);
2678 
2679     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2680         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2681     }
2682 }
2683 
2684 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2685 {
2686     intptr_t i, oprsz = simd_oprsz(desc);
2687 
2688     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2689         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2690     }
2691 }
2692 
2693 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2694 {
2695     intptr_t i, oprsz = simd_oprsz(desc);
2696 
2697     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2698         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2699     }
2700 }
2701 
2702 /* Two operand predicated copy immediate with merge.  All valid immediates
2703  * can fit within 17 signed bits in the simd_data field.
2704  */
2705 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2706                          uint64_t mm, uint32_t desc)
2707 {
2708     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2709     uint64_t *d = vd, *n = vn;
2710     uint8_t *pg = vg;
2711 
2712     mm = dup_const(MO_8, mm);
2713     for (i = 0; i < opr_sz; i += 1) {
2714         uint64_t nn = n[i];
2715         uint64_t pp = expand_pred_b(pg[H1(i)]);
2716         d[i] = (mm & pp) | (nn & ~pp);
2717     }
2718 }
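
/*
 * Example of the merge, assuming expand_pred_b turns each predicate
 * bit into a byte mask: pg[0] = 0x05 expands to 0x0000_0000_00ff_00ff,
 * so bytes 0 and 2 of the result come from the immediate and the
 * remaining bytes are carried over unchanged from N.
 */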
2719 
2720 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2721                          uint64_t mm, uint32_t desc)
2722 {
2723     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2724     uint64_t *d = vd, *n = vn;
2725     uint8_t *pg = vg;
2726 
2727     mm = dup_const(MO_16, mm);
2728     for (i = 0; i < opr_sz; i += 1) {
2729         uint64_t nn = n[i];
2730         uint64_t pp = expand_pred_h(pg[H1(i)]);
2731         d[i] = (mm & pp) | (nn & ~pp);
2732     }
2733 }
2734 
2735 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2736                          uint64_t mm, uint32_t desc)
2737 {
2738     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2739     uint64_t *d = vd, *n = vn;
2740     uint8_t *pg = vg;
2741 
2742     mm = dup_const(MO_32, mm);
2743     for (i = 0; i < opr_sz; i += 1) {
2744         uint64_t nn = n[i];
2745         uint64_t pp = expand_pred_s(pg[H1(i)]);
2746         d[i] = (mm & pp) | (nn & ~pp);
2747     }
2748 }
2749 
2750 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2751                          uint64_t mm, uint32_t desc)
2752 {
2753     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754     uint64_t *d = vd, *n = vn;
2755     uint8_t *pg = vg;
2756 
2757     for (i = 0; i < opr_sz; i += 1) {
2758         uint64_t nn = n[i];
2759         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2760     }
2761 }
2762 
2763 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764 {
2765     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766     uint64_t *d = vd;
2767     uint8_t *pg = vg;
2768 
2769     val = dup_const(MO_8, val);
2770     for (i = 0; i < opr_sz; i += 1) {
2771         d[i] = val & expand_pred_b(pg[H1(i)]);
2772     }
2773 }
2774 
2775 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2776 {
2777     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2778     uint64_t *d = vd;
2779     uint8_t *pg = vg;
2780 
2781     val = dup_const(MO_16, val);
2782     for (i = 0; i < opr_sz; i += 1) {
2783         d[i] = val & expand_pred_h(pg[H1(i)]);
2784     }
2785 }
2786 
2787 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2788 {
2789     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2790     uint64_t *d = vd;
2791     uint8_t *pg = vg;
2792 
2793     val = dup_const(MO_32, val);
2794     for (i = 0; i < opr_sz; i += 1) {
2795         d[i] = val & expand_pred_s(pg[H1(i)]);
2796     }
2797 }
2798 
2799 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2800 {
2801     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2802     uint64_t *d = vd;
2803     uint8_t *pg = vg;
2804 
2805     for (i = 0; i < opr_sz; i += 1) {
2806         d[i] = (pg[H1(i)] & 1 ? val : 0);
2807     }
2808 }
2809 
2810 /* Big-endian hosts need to frob the byte indices.  If the copy
2811  * happens to be 8-byte aligned, then no frobbing is necessary.
2812  */
2813 static void swap_memmove(void *vd, void *vs, size_t n)
2814 {
2815     uintptr_t d = (uintptr_t)vd;
2816     uintptr_t s = (uintptr_t)vs;
2817     uintptr_t o = (d | s | n) & 7;
2818     size_t i;
2819 
2820 #if !HOST_BIG_ENDIAN
2821     o = 0;
2822 #endif
2823     switch (o) {
2824     case 0:
2825         memmove(vd, vs, n);
2826         break;
2827 
2828     case 4:
2829         if (d < s || d >= s + n) {
2830             for (i = 0; i < n; i += 4) {
2831                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2832             }
2833         } else {
2834             for (i = n; i > 0; ) {
2835                 i -= 4;
2836                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2837             }
2838         }
2839         break;
2840 
2841     case 2:
2842     case 6:
2843         if (d < s || d >= s + n) {
2844             for (i = 0; i < n; i += 2) {
2845                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2846             }
2847         } else {
2848             for (i = n; i > 0; ) {
2849                 i -= 2;
2850                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2851             }
2852         }
2853         break;
2854 
2855     default:
2856         if (d < s || d >= s + n) {
2857             for (i = 0; i < n; i++) {
2858                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2859             }
2860         } else {
2861             for (i = n; i > 0; ) {
2862                 i -= 1;
2863                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2864             }
2865         }
2866         break;
2867     }
2868 }
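
/*
 * Note on the dispatch above: o = (d | s | n) & 7 is the coarsest
 * alignment shared by both pointers and the length.  For example, a
 * copy whose source, destination and size are all 8-byte aligned gives
 * o == 0 and degenerates to a plain memmove even on big-endian hosts.
 */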
2869 
2870 /* Similarly for memset of 0.  */
2871 static void swap_memzero(void *vd, size_t n)
2872 {
2873     uintptr_t d = (uintptr_t)vd;
2874     uintptr_t o = (d | n) & 7;
2875     size_t i;
2876 
2877     /* Usually, the first bit of a predicate is set, so N is 0.  */
2878     if (likely(n == 0)) {
2879         return;
2880     }
2881 
2882 #if !HOST_BIG_ENDIAN
2883     o = 0;
2884 #endif
2885     switch (o) {
2886     case 0:
2887         memset(vd, 0, n);
2888         break;
2889 
2890     case 4:
2891         for (i = 0; i < n; i += 4) {
2892             *(uint32_t *)H1_4(d + i) = 0;
2893         }
2894         break;
2895 
2896     case 2:
2897     case 6:
2898         for (i = 0; i < n; i += 2) {
2899             *(uint16_t *)H1_2(d + i) = 0;
2900         }
2901         break;
2902 
2903     default:
2904         for (i = 0; i < n; i++) {
2905             *(uint8_t *)H1(d + i) = 0;
2906         }
2907         break;
2908     }
2909 }
2910 
2911 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2912 {
2913     intptr_t opr_sz = simd_oprsz(desc);
2914     size_t n_ofs = simd_data(desc);
2915     size_t n_siz = opr_sz - n_ofs;
2916 
2917     if (vd != vm) {
2918         swap_memmove(vd, vn + n_ofs, n_siz);
2919         swap_memmove(vd + n_siz, vm, n_ofs);
2920     } else if (vd != vn) {
2921         swap_memmove(vd + n_siz, vd, n_ofs);
2922         swap_memmove(vd, vn + n_ofs, n_siz);
2923     } else {
2924         /* vd == vn == vm.  Need temp space.  */
2925         ARMVectorReg tmp;
2926         swap_memmove(&tmp, vm, n_ofs);
2927         swap_memmove(vd, vd + n_ofs, n_siz);
2928         memcpy(vd + n_siz, &tmp, n_ofs);
2929     }
2930 }
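
/*
 * Illustrative view of the operation: for a 32-byte vector with
 * n_ofs = 5, the result is bytes 5..31 of N followed by bytes 0..4 of
 * M, i.e. the concatenation M:N shifted down by n_ofs bytes.
 */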
2931 
2932 #define DO_INSR(NAME, TYPE, H) \
2933 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2934 {                                                                  \
2935     intptr_t opr_sz = simd_oprsz(desc);                            \
2936     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2937     *(TYPE *)(vd + H(0)) = val;                                    \
2938 }
2939 
2940 DO_INSR(sve_insr_b, uint8_t, H1)
2941 DO_INSR(sve_insr_h, uint16_t, H1_2)
2942 DO_INSR(sve_insr_s, uint32_t, H1_4)
2943 DO_INSR(sve_insr_d, uint64_t, H1_8)
2944 
2945 #undef DO_INSR
2946 
2947 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2948 {
2949     intptr_t i, j, opr_sz = simd_oprsz(desc);
2950     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2951         uint64_t f = *(uint64_t *)(vn + i);
2952         uint64_t b = *(uint64_t *)(vn + j);
2953         *(uint64_t *)(vd + i) = bswap64(b);
2954         *(uint64_t *)(vd + j) = bswap64(f);
2955     }
2956 }
2957 
2958 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2959 {
2960     intptr_t i, j, opr_sz = simd_oprsz(desc);
2961     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2962         uint64_t f = *(uint64_t *)(vn + i);
2963         uint64_t b = *(uint64_t *)(vn + j);
2964         *(uint64_t *)(vd + i) = hswap64(b);
2965         *(uint64_t *)(vd + j) = hswap64(f);
2966     }
2967 }
2968 
2969 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2970 {
2971     intptr_t i, j, opr_sz = simd_oprsz(desc);
2972     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2973         uint64_t f = *(uint64_t *)(vn + i);
2974         uint64_t b = *(uint64_t *)(vn + j);
2975         *(uint64_t *)(vd + i) = rol64(b, 32);
2976         *(uint64_t *)(vd + j) = rol64(f, 32);
2977     }
2978 }
2979 
2980 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2981 {
2982     intptr_t i, j, opr_sz = simd_oprsz(desc);
2983     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2984         uint64_t f = *(uint64_t *)(vn + i);
2985         uint64_t b = *(uint64_t *)(vn + j);
2986         *(uint64_t *)(vd + i) = b;
2987         *(uint64_t *)(vd + j) = f;
2988     }
2989 }
2990 
2991 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2992 
2993 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2994                            bool is_tbx, tb_impl_fn *fn)
2995 {
2996     ARMVectorReg scratch;
2997     uintptr_t oprsz = simd_oprsz(desc);
2998 
2999     if (unlikely(vd == vn)) {
3000         vn = memcpy(&scratch, vn, oprsz);
3001     }
3002 
3003     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3004 }
3005 
3006 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3007                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3008 {
3009     ARMVectorReg scratch;
3010     uintptr_t oprsz = simd_oprsz(desc);
3011 
3012     if (unlikely(vd == vn0)) {
3013         vn0 = memcpy(&scratch, vn0, oprsz);
3014         if (vd == vn1) {
3015             vn1 = vn0;
3016         }
3017     } else if (unlikely(vd == vn1)) {
3018         vn1 = memcpy(&scratch, vn1, oprsz);
3019     }
3020 
3021     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3022 }
3023 
3024 #define DO_TB(SUFF, TYPE, H)                                            \
3025 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3026                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3027 {                                                                       \
3028     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3029     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3030     for (i = 0; i < nelem; ++i) {                                       \
3031         TYPE index = indexes[H1(i)], val = 0;                           \
3032         if (index < nelem) {                                            \
3033             val = tbl0[H(index)];                                       \
3034         } else {                                                        \
3035             index -= nelem;                                             \
3036             if (tbl1 && index < nelem) {                                \
3037                 val = tbl1[H(index)];                                   \
3038             } else if (is_tbx) {                                        \
3039                 continue;                                               \
3040             }                                                           \
3041         }                                                               \
3042         d[H(i)] = val;                                                  \
3043     }                                                                   \
3044 }                                                                       \
3045 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3046 {                                                                       \
3047     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3048 }                                                                       \
3049 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3050                              void *vm, uint32_t desc)                   \
3051 {                                                                       \
3052     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3053 }                                                                       \
3054 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3055 {                                                                       \
3056     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3057 }
3058 
3059 DO_TB(b, uint8_t, H1)
3060 DO_TB(h, uint16_t, H2)
3061 DO_TB(s, uint32_t, H4)
3062 DO_TB(d, uint64_t, H8)
3063 
3064 #undef DO_TB
3065 
3066 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3067 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3068 {                                                              \
3069     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3070     TYPED *d = vd;                                             \
3071     TYPES *n = vn;                                             \
3072     ARMVectorReg tmp;                                          \
3073     if (unlikely(vn - vd < opr_sz)) {                          \
3074         n = memcpy(&tmp, n, opr_sz / 2);                       \
3075     }                                                          \
3076     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3077         d[HD(i)] = n[HS(i)];                                   \
3078     }                                                          \
3079 }
3080 
3081 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3082 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3083 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3084 
3085 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3086 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3087 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3088 
3089 #undef DO_UNPK
3090 
3091 /* Mask of bits included in the even numbered predicates of width esz.
3092  * We also use this for expand_bits/compress_bits, and so extend the
3093  * same pattern out to 16-bit units.
3094  */
3095 static const uint64_t even_bit_esz_masks[5] = {
3096     0x5555555555555555ull,
3097     0x3333333333333333ull,
3098     0x0f0f0f0f0f0f0f0full,
3099     0x00ff00ff00ff00ffull,
3100     0x0000ffff0000ffffull,
3101 };
3102 
3103 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3104  * For N==0, this corresponds to the operation that in qemu/bitops.h
3105  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3106  * section 7-2 Shuffling Bits.
3107  */
3108 static uint64_t expand_bits(uint64_t x, int n)
3109 {
3110     int i;
3111 
3112     x &= 0xffffffffu;
3113     for (i = 4; i >= n; i--) {
3114         int sh = 1 << i;
3115         x = ((x << sh) | x) & even_bit_esz_masks[i];
3116     }
3117     return x;
3118 }
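
/*
 * Worked example for N == 0: expand_bits(0xff, 0) spreads the eight
 * low bits into every other bit position, giving 0x5555; bit i of the
 * input lands in bit 2*i of the result.
 */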
3119 
3120 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3121  * For N==0, this corresponds to the operation that in qemu/bitops.h
3122  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3123  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3124  */
3125 static uint64_t compress_bits(uint64_t x, int n)
3126 {
3127     int i;
3128 
3129     for (i = n; i <= 4; i++) {
3130         int sh = 1 << i;
3131         x &= even_bit_esz_masks[i];
3132         x = (x >> sh) | x;
3133     }
3134     return x & 0xffffffffu;
3135 }
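
/*
 * This is the inverse of expand_bits on suitably masked input: e.g.
 * compress_bits(0x5555, 0) gathers every other bit back together and
 * returns 0xff.
 */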
3136 
3137 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3138 {
3139     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3140     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3141     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3142     int esize = 1 << esz;
3143     uint64_t *d = vd;
3144     intptr_t i;
3145 
3146     if (oprsz <= 8) {
3147         uint64_t nn = *(uint64_t *)vn;
3148         uint64_t mm = *(uint64_t *)vm;
3149         int half = 4 * oprsz;
3150 
3151         nn = extract64(nn, high * half, half);
3152         mm = extract64(mm, high * half, half);
3153         nn = expand_bits(nn, esz);
3154         mm = expand_bits(mm, esz);
3155         d[0] = nn | (mm << esize);
3156     } else {
3157         ARMPredicateReg tmp;
3158 
3159         /* We produce output faster than we consume input.
3160            Therefore we must be mindful of possible overlap.  */
3161         if (vd == vn) {
3162             vn = memcpy(&tmp, vn, oprsz);
3163             if (vd == vm) {
3164                 vm = vn;
3165             }
3166         } else if (vd == vm) {
3167             vm = memcpy(&tmp, vm, oprsz);
3168         }
3169         if (high) {
3170             high = oprsz >> 1;
3171         }
3172 
3173         if ((oprsz & 7) == 0) {
3174             uint32_t *n = vn, *m = vm;
3175             high >>= 2;
3176 
3177             for (i = 0; i < oprsz / 8; i++) {
3178                 uint64_t nn = n[H4(high + i)];
3179                 uint64_t mm = m[H4(high + i)];
3180 
3181                 nn = expand_bits(nn, esz);
3182                 mm = expand_bits(mm, esz);
3183                 d[i] = nn | (mm << esize);
3184             }
3185         } else {
3186             uint8_t *n = vn, *m = vm;
3187             uint16_t *d16 = vd;
3188 
3189             for (i = 0; i < oprsz / 2; i++) {
3190                 uint16_t nn = n[H1(high + i)];
3191                 uint16_t mm = m[H1(high + i)];
3192 
3193                 nn = expand_bits(nn, esz);
3194                 mm = expand_bits(mm, esz);
3195                 d16[H2(i)] = nn | (mm << esize);
3196             }
3197         }
3198     }
3199 }
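
/*
 * Small worked example for the oprsz <= 8 path with esz = 0 and
 * high = 0: low bits nn = 0b1010 and mm = 0b0011 expand to 0x44 and
 * 0x05, so d[0] = 0x44 | (0x05 << 1) = 0x4e, i.e. the predicate bits
 * of N and M interleaved element by element.
 */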
3200 
3201 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3202 {
3203     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3204     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3205     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3206     uint64_t *d = vd, *n = vn, *m = vm;
3207     uint64_t l, h;
3208     intptr_t i;
3209 
3210     if (oprsz <= 8) {
3211         l = compress_bits(n[0] >> odd, esz);
3212         h = compress_bits(m[0] >> odd, esz);
3213         d[0] = l | (h << (4 * oprsz));
3214     } else {
3215         ARMPredicateReg tmp_m;
3216         intptr_t oprsz_16 = oprsz / 16;
3217 
3218         if ((vm - vd) < (uintptr_t)oprsz) {
3219             m = memcpy(&tmp_m, vm, oprsz);
3220         }
3221 
3222         for (i = 0; i < oprsz_16; i++) {
3223             l = n[2 * i + 0];
3224             h = n[2 * i + 1];
3225             l = compress_bits(l >> odd, esz);
3226             h = compress_bits(h >> odd, esz);
3227             d[i] = l | (h << 32);
3228         }
3229 
3230         /*
3231          * For VL which is not a multiple of 512, the results from M do not
3232          * align nicely with the uint64_t for D.  Put the aligned results
3233          * from M into TMP_M and then copy it into place afterward.
3234          */
3235         if (oprsz & 15) {
3236             int final_shift = (oprsz & 15) * 2;
3237 
3238             l = n[2 * i + 0];
3239             h = n[2 * i + 1];
3240             l = compress_bits(l >> odd, esz);
3241             h = compress_bits(h >> odd, esz);
3242             d[i] = l | (h << final_shift);
3243 
3244             for (i = 0; i < oprsz_16; i++) {
3245                 l = m[2 * i + 0];
3246                 h = m[2 * i + 1];
3247                 l = compress_bits(l >> odd, esz);
3248                 h = compress_bits(h >> odd, esz);
3249                 tmp_m.p[i] = l | (h << 32);
3250             }
3251             l = m[2 * i + 0];
3252             h = m[2 * i + 1];
3253             l = compress_bits(l >> odd, esz);
3254             h = compress_bits(h >> odd, esz);
3255             tmp_m.p[i] = l | (h << final_shift);
3256 
3257             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3258         } else {
3259             for (i = 0; i < oprsz_16; i++) {
3260                 l = m[2 * i + 0];
3261                 h = m[2 * i + 1];
3262                 l = compress_bits(l >> odd, esz);
3263                 h = compress_bits(h >> odd, esz);
3264                 d[oprsz_16 + i] = l | (h << 32);
3265             }
3266         }
3267     }
3268 }
3269 
3270 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3271 {
3272     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3273     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3274     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3275     uint64_t *d = vd, *n = vn, *m = vm;
3276     uint64_t mask;
3277     int shr, shl;
3278     intptr_t i;
3279 
3280     shl = 1 << esz;
3281     shr = 0;
3282     mask = even_bit_esz_masks[esz];
3283     if (odd) {
3284         mask <<= shl;
3285         shr = shl;
3286         shl = 0;
3287     }
3288 
3289     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3290         uint64_t nn = (n[i] & mask) >> shr;
3291         uint64_t mm = (m[i] & mask) << shl;
3292         d[i] = nn + mm;
3293     }
3294 }
3295 
3296 /* Reverse units of 2**N bits.  */
3297 static uint64_t reverse_bits_64(uint64_t x, int n)
3298 {
3299     int i, sh;
3300 
3301     x = bswap64(x);
3302     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3303         uint64_t mask = even_bit_esz_masks[i];
3304         x = ((x & mask) << sh) | ((x >> sh) & mask);
3305     }
3306     return x;
3307 }
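
/*
 * For n == 3 the loop body never runs and the result is just the
 * bswap64, i.e. a reversal of byte-sized units; n == 0 reverses all
 * 64 bits individually.
 */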
3308 
3309 static uint8_t reverse_bits_8(uint8_t x, int n)
3310 {
3311     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3312     int i, sh;
3313 
3314     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3315         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3316     }
3317     return x;
3318 }
3319 
3320 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3321 {
3322     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3323     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3324     intptr_t i, oprsz_2 = oprsz / 2;
3325 
3326     if (oprsz <= 8) {
3327         uint64_t l = *(uint64_t *)vn;
3328         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3329         *(uint64_t *)vd = l;
3330     } else if ((oprsz & 15) == 0) {
3331         for (i = 0; i < oprsz_2; i += 8) {
3332             intptr_t ih = oprsz - 8 - i;
3333             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3334             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3335             *(uint64_t *)(vd + i) = h;
3336             *(uint64_t *)(vd + ih) = l;
3337         }
3338     } else {
3339         for (i = 0; i < oprsz_2; i += 1) {
3340             intptr_t il = H1(i);
3341             intptr_t ih = H1(oprsz - 1 - i);
3342             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3343             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3344             *(uint8_t *)(vd + il) = h;
3345             *(uint8_t *)(vd + ih) = l;
3346         }
3347     }
3348 }
3349 
3350 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3351 {
3352     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3353     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3354     uint64_t *d = vd;
3355     intptr_t i;
3356 
3357     if (oprsz <= 8) {
3358         uint64_t nn = *(uint64_t *)vn;
3359         int half = 4 * oprsz;
3360 
3361         nn = extract64(nn, high * half, half);
3362         nn = expand_bits(nn, 0);
3363         d[0] = nn;
3364     } else {
3365         ARMPredicateReg tmp_n;
3366 
3367         /* We produce output faster than we consume input.
3368            Therefore we must be mindful of possible overlap.  */
3369         if ((vn - vd) < (uintptr_t)oprsz) {
3370             vn = memcpy(&tmp_n, vn, oprsz);
3371         }
3372         if (high) {
3373             high = oprsz >> 1;
3374         }
3375 
3376         if ((oprsz & 7) == 0) {
3377             uint32_t *n = vn;
3378             high >>= 2;
3379 
3380             for (i = 0; i < oprsz / 8; i++) {
3381                 uint64_t nn = n[H4(high + i)];
3382                 d[i] = expand_bits(nn, 0);
3383             }
3384         } else {
3385             uint16_t *d16 = vd;
3386             uint8_t *n = vn;
3387 
3388             for (i = 0; i < oprsz / 2; i++) {
3389                 uint16_t nn = n[H1(high + i)];
3390                 d16[H2(i)] = expand_bits(nn, 0);
3391             }
3392         }
3393     }
3394 }
3395 
3396 #define DO_ZIP(NAME, TYPE, H) \
3397 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3398 {                                                                    \
3399     intptr_t oprsz = simd_oprsz(desc);                               \
3400     intptr_t odd_ofs = simd_data(desc);                              \
3401     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3402     ARMVectorReg tmp_n, tmp_m;                                       \
3403     /* We produce output faster than we consume input.               \
3404        Therefore we must be mindful of possible overlap.  */         \
3405     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3406         vn = memcpy(&tmp_n, vn, oprsz);                              \
3407     }                                                                \
3408     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3409         vm = memcpy(&tmp_m, vm, oprsz);                              \
3410     }                                                                \
3411     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3412         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3413         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3414             *(TYPE *)(vm + odd_ofs + H(i));                          \
3415     }                                                                \
3416     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3417         memset(vd + oprsz - 16, 0, 16);                              \
3418     }                                                                \
3419 }
3420 
3421 DO_ZIP(sve_zip_b, uint8_t, H1)
3422 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3423 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3424 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3425 DO_ZIP(sve2_zip_q, Int128, )
3426 
3427 #define DO_UZP(NAME, TYPE, H) \
3428 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3429 {                                                                      \
3430     intptr_t oprsz = simd_oprsz(desc);                                 \
3431     intptr_t odd_ofs = simd_data(desc);                                \
3432     intptr_t i, p;                                                     \
3433     ARMVectorReg tmp_m;                                                \
3434     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3435         vm = memcpy(&tmp_m, vm, oprsz);                                \
3436     }                                                                  \
3437     i = 0, p = odd_ofs;                                                \
3438     do {                                                               \
3439         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3440         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3441     } while (p < oprsz);                                               \
3442     p -= oprsz;                                                        \
3443     do {                                                               \
3444         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3445         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3446     } while (p < oprsz);                                               \
3447     tcg_debug_assert(i == oprsz);                                      \
3448 }
3449 
3450 DO_UZP(sve_uzp_b, uint8_t, H1)
3451 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3452 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3453 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3454 DO_UZP(sve2_uzp_q, Int128, )
3455 
3456 #define DO_TRN(NAME, TYPE, H) \
3457 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3458 {                                                                      \
3459     intptr_t oprsz = simd_oprsz(desc);                                 \
3460     intptr_t odd_ofs = simd_data(desc);                                \
3461     intptr_t i;                                                        \
3462     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3463         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3464         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3465         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3466         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3467     }                                                                  \
3468     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3469         memset(vd + oprsz - 16, 0, 16);                                \
3470     }                                                                  \
3471 }
3472 
3473 DO_TRN(sve_trn_b, uint8_t, H1)
3474 DO_TRN(sve_trn_h, uint16_t, H1_2)
3475 DO_TRN(sve_trn_s, uint32_t, H1_4)
3476 DO_TRN(sve_trn_d, uint64_t, H1_8)
3477 DO_TRN(sve2_trn_q, Int128, )
3478 
3479 #undef DO_ZIP
3480 #undef DO_UZP
3481 #undef DO_TRN
3482 
3483 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3484 {
3485     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3486     uint32_t *d = vd, *n = vn;
3487     uint8_t *pg = vg;
3488 
3489     for (i = j = 0; i < opr_sz; i++) {
3490         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3491             d[H4(j)] = n[H4(i)];
3492             j++;
3493         }
3494     }
3495     for (; j < opr_sz; j++) {
3496         d[H4(j)] = 0;
3497     }
3498 }
3499 
3500 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3501 {
3502     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3503     uint64_t *d = vd, *n = vn;
3504     uint8_t *pg = vg;
3505 
3506     for (i = j = 0; i < opr_sz; i++) {
3507         if (pg[H1(i)] & 1) {
3508             d[j] = n[i];
3509             j++;
3510         }
3511     }
3512     for (; j < opr_sz; j++) {
3513         d[j] = 0;
3514     }
3515 }
3516 
3517 /* Similar to the ARM LastActiveElement pseudocode function, except the
3518  * result is multiplied by the element size.  This includes the not found
3519  * indication; e.g. not found for esz=3 is -8.
3520  */
3521 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3522 {
3523     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3524     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3525 
3526     return last_active_element(vg, words, esz);
3527 }
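
/*
 * For example, with esz = 2 and the last active element at index 3
 * this returns 12; if no element is active it returns -4, i.e. the
 * "not found" value of -1 scaled by the element size.
 */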
3528 
3529 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3530 {
3531     intptr_t opr_sz = simd_oprsz(desc) / 8;
3532     int esz = simd_data(desc);
3533     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3534     intptr_t i, first_i, last_i;
3535     ARMVectorReg tmp;
3536 
3537     first_i = last_i = 0;
3538     first_g = last_g = 0;
3539 
3540     /* Find the extent of the active elements within VG.  */
3541     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3542         pg = *(uint64_t *)(vg + i) & mask;
3543         if (pg) {
3544             if (last_g == 0) {
3545                 last_g = pg;
3546                 last_i = i;
3547             }
3548             first_g = pg;
3549             first_i = i;
3550         }
3551     }
3552 
3553     len = 0;
3554     if (first_g != 0) {
3555         first_i = first_i * 8 + ctz64(first_g);
3556         last_i = last_i * 8 + 63 - clz64(last_g);
3557         len = last_i - first_i + (1 << esz);
3558         if (vd == vm) {
3559             vm = memcpy(&tmp, vm, opr_sz * 8);
3560         }
3561         swap_memmove(vd, vn + first_i, len);
3562     }
3563     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3564 }
3565 
3566 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3567                             void *vg, uint32_t desc)
3568 {
3569     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3570     uint64_t *d = vd, *n = vn, *m = vm;
3571     uint8_t *pg = vg;
3572 
3573     for (i = 0; i < opr_sz; i += 1) {
3574         uint64_t nn = n[i], mm = m[i];
3575         uint64_t pp = expand_pred_b(pg[H1(i)]);
3576         d[i] = (nn & pp) | (mm & ~pp);
3577     }
3578 }
3579 
3580 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3581                             void *vg, uint32_t desc)
3582 {
3583     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3584     uint64_t *d = vd, *n = vn, *m = vm;
3585     uint8_t *pg = vg;
3586 
3587     for (i = 0; i < opr_sz; i += 1) {
3588         uint64_t nn = n[i], mm = m[i];
3589         uint64_t pp = expand_pred_h(pg[H1(i)]);
3590         d[i] = (nn & pp) | (mm & ~pp);
3591     }
3592 }
3593 
3594 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3595                             void *vg, uint32_t desc)
3596 {
3597     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3598     uint64_t *d = vd, *n = vn, *m = vm;
3599     uint8_t *pg = vg;
3600 
3601     for (i = 0; i < opr_sz; i += 1) {
3602         uint64_t nn = n[i], mm = m[i];
3603         uint64_t pp = expand_pred_s(pg[H1(i)]);
3604         d[i] = (nn & pp) | (mm & ~pp);
3605     }
3606 }
3607 
3608 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3609                             void *vg, uint32_t desc)
3610 {
3611     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3612     uint64_t *d = vd, *n = vn, *m = vm;
3613     uint8_t *pg = vg;
3614 
3615     for (i = 0; i < opr_sz; i += 1) {
3616         uint64_t nn = n[i], mm = m[i];
3617         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3618     }
3619 }
3620 
3621 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3622                             void *vg, uint32_t desc)
3623 {
3624     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3625     Int128 *d = vd, *n = vn, *m = vm;
3626     uint16_t *pg = vg;
3627 
3628     for (i = 0; i < opr_sz; i += 1) {
3629         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3630     }
3631 }
3632 
3633 /* Two operand comparison controlled by a predicate.
3634  * ??? It is very tempting to want to be able to expand this inline
3635  * with x86 instructions, e.g.
3636  *
3637  *    vcmpeqw    zm, zn, %ymm0
3638  *    vpmovmskb  %ymm0, %eax
3639  *    and        $0x5555, %eax
3640  *    and        pg, %eax
3641  *
3642  * or even aarch64, e.g.
3643  *
3644  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3645  *    cmeq       v0.8h, zn, zm
3646  *    and        v0.8h, v0.8h, mask
3647  *    addv       h0, v0.8h
3648  *    and        v0.8b, pg
3649  *
3650  * However, coming up with an abstraction that allows vector inputs and
3651  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3652  * scalar outputs, is tricky.
3653  */
3654 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3655 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3656 {                                                                            \
3657     intptr_t opr_sz = simd_oprsz(desc);                                      \
3658     uint32_t flags = PREDTEST_INIT;                                          \
3659     intptr_t i = opr_sz;                                                     \
3660     do {                                                                     \
3661         uint64_t out = 0, pg;                                                \
3662         do {                                                                 \
3663             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3664             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3665             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3666             out |= nn OP mm;                                                 \
3667         } while (i & 63);                                                    \
3668         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3669         out &= pg;                                                           \
3670         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3671         flags = iter_predtest_bwd(out, pg, flags);                           \
3672     } while (i > 0);                                                         \
3673     return flags;                                                            \
3674 }
3675 
3676 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3677     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3678 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3679     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3680 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3681     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3682 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3683     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
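
/*
 * The masks above reflect the SVE predicate layout: each element owns
 * esize predicate bits but only the lowest of them is significant, so
 * e.g. 16-bit elements keep every second bit (0x5555...) and 64-bit
 * elements keep one bit per byte (0x0101...).
 */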
3684 
3685 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3686 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3687 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3688 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3689 
3690 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3691 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3692 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3693 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3694 
3695 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3696 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3697 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3698 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3699 
3700 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3701 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3702 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3703 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3704 
3705 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3706 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3707 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3708 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3709 
3710 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3711 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3712 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3713 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3714 
3715 #undef DO_CMP_PPZZ_B
3716 #undef DO_CMP_PPZZ_H
3717 #undef DO_CMP_PPZZ_S
3718 #undef DO_CMP_PPZZ_D
3719 #undef DO_CMP_PPZZ
3720 
3721 /* Similar, but the second source is "wide".  */
3722 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3723 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3724 {                                                                            \
3725     intptr_t opr_sz = simd_oprsz(desc);                                      \
3726     uint32_t flags = PREDTEST_INIT;                                          \
3727     intptr_t i = opr_sz;                                                     \
3728     do {                                                                     \
3729         uint64_t out = 0, pg;                                                \
3730         do {                                                                 \
3731             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3732             do {                                                             \
3733                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3734                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3735                 out |= nn OP mm;                                             \
3736             } while (i & 7);                                                 \
3737         } while (i & 63);                                                    \
3738         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3739         out &= pg;                                                           \
3740         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3741         flags = iter_predtest_bwd(out, pg, flags);                           \
3742     } while (i > 0);                                                         \
3743     return flags;                                                            \
3744 }
3745 
3746 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3747     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3748 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3749     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3750 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3751     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3752 
3753 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3754 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3755 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3756 
3757 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3758 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3759 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3760 
3761 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3762 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3763 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3764 
3765 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3766 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3767 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3768 
3769 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3770 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3771 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3772 
3773 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3774 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3775 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3776 
3777 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3778 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3779 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3780 
3781 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3782 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3783 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3784 
3785 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3786 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3787 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3788 
3789 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3790 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3791 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3792 
3793 #undef DO_CMP_PPZW_B
3794 #undef DO_CMP_PPZW_H
3795 #undef DO_CMP_PPZW_S
3796 #undef DO_CMP_PPZW
3797 
3798 /* Similar, but the second source is immediate.  */
3799 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3800 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3801 {                                                                    \
3802     intptr_t opr_sz = simd_oprsz(desc);                              \
3803     uint32_t flags = PREDTEST_INIT;                                  \
3804     TYPE mm = simd_data(desc);                                       \
3805     intptr_t i = opr_sz;                                             \
3806     do {                                                             \
3807         uint64_t out = 0, pg;                                        \
3808         do {                                                         \
3809             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3810             TYPE nn = *(TYPE *)(vn + H(i));                          \
3811             out |= nn OP mm;                                         \
3812         } while (i & 63);                                            \
3813         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3814         out &= pg;                                                   \
3815         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3816         flags = iter_predtest_bwd(out, pg, flags);                   \
3817     } while (i > 0);                                                 \
3818     return flags;                                                    \
3819 }
3820 
3821 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3822     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3823 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3824     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3825 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3826     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3827 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3828     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3829 
3830 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3831 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3832 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3833 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3834 
3835 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3836 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3837 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3838 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3839 
3840 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3841 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3842 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3843 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3844 
3845 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3846 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3847 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3848 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3849 
3850 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3851 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3852 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3853 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3854 
3855 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3856 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3857 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3858 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3859 
3860 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3861 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3862 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3863 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3864 
3865 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3866 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3867 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3868 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3869 
3870 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3871 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3872 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3873 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3874 
3875 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3876 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3877 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3878 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3879 
3880 #undef DO_CMP_PPZI_B
3881 #undef DO_CMP_PPZI_H
3882 #undef DO_CMP_PPZI_S
3883 #undef DO_CMP_PPZI_D
3884 #undef DO_CMP_PPZI
3885 
3886 /* Similar to the ARM LastActive pseudocode function.  */
3887 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3888 {
3889     intptr_t i;
3890 
3891     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3892         uint64_t pg = *(uint64_t *)(vg + i);
3893         if (pg) {
3894             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3895         }
3896     }
3897     return 0;
3898 }
3899 
3900 /* Compute a mask into RETB that is true for all G, up to and including
3901  * (if after) or excluding (if !after) the first G & N.
3902  * Return true if BRK found.
3903  */
3904 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3905                         bool brk, bool after)
3906 {
3907     uint64_t b;
3908 
3909     if (brk) {
3910         b = 0;
3911     } else if ((g & n) == 0) {
3912         /* For all G, no N are set; break not found.  */
3913         b = g;
3914     } else {
3915         /* Break somewhere in N.  Locate it.  */
3916         b = g & n;            /* guard true, pred true */
3917         b = b & -b;           /* first such */
3918         if (after) {
3919             b = b | (b - 1);  /* break after same */
3920         } else {
3921             b = b - 1;        /* break before same */
3922         }
3923         brk = true;
3924     }
3925 
3926     *retb = b;
3927     return brk;
3928 }
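/*
 * Worked example (values chosen for this comment): with G = 0x00ff
 * (eight active lanes) and N = 0x0008 (first true element at lane 3),
 * G & N = 0x0008 and b & -b isolates that bit, so:
 *   after == true:   b = 0x0008 | 0x0007 = 0x000f  (lanes 0-3 active)
 *   after == false:  b = 0x0008 - 1      = 0x0007  (lanes 0-2 active)
 * If (G & N) == 0 instead, RETB is simply G and BRK stays false, so the
 * search continues into the next 64-bit word.
 */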
3929 
3930 /* Compute a zeroing BRK.  */
3931 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3932                           intptr_t oprsz, bool after)
3933 {
3934     bool brk = false;
3935     intptr_t i;
3936 
3937     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3938         uint64_t this_b, this_g = g[i];
3939 
3940         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3941         d[i] = this_b & this_g;
3942     }
3943 }
3944 
3945 /* Likewise, but also compute flags.  */
3946 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3947                                intptr_t oprsz, bool after)
3948 {
3949     uint32_t flags = PREDTEST_INIT;
3950     bool brk = false;
3951     intptr_t i;
3952 
3953     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3954         uint64_t this_b, this_d, this_g = g[i];
3955 
3956         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3957         d[i] = this_d = this_b & this_g;
3958         flags = iter_predtest_fwd(this_d, this_g, flags);
3959     }
3960     return flags;
3961 }
3962 
3963 /* Compute a merging BRK.  */
3964 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3965                           intptr_t oprsz, bool after)
3966 {
3967     bool brk = false;
3968     intptr_t i;
3969 
3970     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3971         uint64_t this_b, this_g = g[i];
3972 
3973         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3974         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3975     }
3976 }
3977 
3978 /* Likewise, but also compute flags.  */
3979 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3980                                intptr_t oprsz, bool after)
3981 {
3982     uint32_t flags = PREDTEST_INIT;
3983     bool brk = false;
3984     intptr_t i;
3985 
3986     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3987         uint64_t this_b, this_d = d[i], this_g = g[i];
3988 
3989         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3990         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3991         flags = iter_predtest_fwd(this_d, this_g, flags);
3992     }
3993     return flags;
3994 }
3995 
3996 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3997 {
3998     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3999      * The compiler should turn this into 4 64-bit integer stores.
4000      */
4001     memset(d, 0, sizeof(ARMPredicateReg));
4002     return PREDTEST_INIT;
4003 }
4004 
4005 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4006                        uint32_t pred_desc)
4007 {
4008     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4009     if (last_active_pred(vn, vg, oprsz)) {
4010         compute_brk_z(vd, vm, vg, oprsz, true);
4011     } else {
4012         do_zero(vd, oprsz);
4013     }
4014 }
4015 
4016 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4017                             uint32_t pred_desc)
4018 {
4019     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4020     if (last_active_pred(vn, vg, oprsz)) {
4021         return compute_brks_z(vd, vm, vg, oprsz, true);
4022     } else {
4023         return do_zero(vd, oprsz);
4024     }
4025 }
4026 
4027 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4028                        uint32_t pred_desc)
4029 {
4030     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4031     if (last_active_pred(vn, vg, oprsz)) {
4032         compute_brk_z(vd, vm, vg, oprsz, false);
4033     } else {
4034         do_zero(vd, oprsz);
4035     }
4036 }
4037 
4038 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4039                             uint32_t pred_desc)
4040 {
4041     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4042     if (last_active_pred(vn, vg, oprsz)) {
4043         return compute_brks_z(vd, vm, vg, oprsz, false);
4044     } else {
4045         return do_zero(vd, oprsz);
4046     }
4047 }
4048 
4049 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4050 {
4051     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4052     compute_brk_z(vd, vn, vg, oprsz, true);
4053 }
4054 
4055 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4056 {
4057     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4058     return compute_brks_z(vd, vn, vg, oprsz, true);
4059 }
4060 
4061 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062 {
4063     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064     compute_brk_z(vd, vn, vg, oprsz, false);
4065 }
4066 
4067 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4068 {
4069     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4070     return compute_brks_z(vd, vn, vg, oprsz, false);
4071 }
4072 
4073 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4074 {
4075     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4076     compute_brk_m(vd, vn, vg, oprsz, true);
4077 }
4078 
4079 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4080 {
4081     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4082     return compute_brks_m(vd, vn, vg, oprsz, true);
4083 }
4084 
4085 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4086 {
4087     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4088     compute_brk_m(vd, vn, vg, oprsz, false);
4089 }
4090 
4091 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4092 {
4093     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4094     return compute_brks_m(vd, vn, vg, oprsz, false);
4095 }
4096 
4097 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4098 {
4099     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4100     if (!last_active_pred(vn, vg, oprsz)) {
4101         do_zero(vd, oprsz);
4102     }
4103 }
4104 
4105 /* As if PredTest(Ones(PL), D, esz).  */
4106 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4107                               uint64_t esz_mask)
4108 {
4109     uint32_t flags = PREDTEST_INIT;
4110     intptr_t i;
4111 
4112     for (i = 0; i < oprsz / 8; i++) {
4113         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4114     }
4115     if (oprsz & 7) {
4116         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4117         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4118     }
4119     return flags;
4120 }
4121 
4122 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4123 {
4124     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4125     if (last_active_pred(vn, vg, oprsz)) {
4126         return predtest_ones(vd, oprsz, -1);
4127     } else {
4128         return do_zero(vd, oprsz);
4129     }
4130 }
4131 
4132 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4133 {
4134     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4135     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4136     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4137     intptr_t i;
4138 
4139     for (i = 0; i < words; ++i) {
4140         uint64_t t = n[i] & g[i] & mask;
4141         sum += ctpop64(t);
4142     }
4143     return sum;
4144 }
4145 
4146 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4147 {
4148     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4149     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4150     uint64_t esz_mask = pred_esz_masks[esz];
4151     ARMPredicateReg *d = vd;
4152     uint32_t flags;
4153     intptr_t i;
4154 
4155     /* Begin with a zero predicate register.  */
4156     flags = do_zero(d, oprsz);
4157     if (count == 0) {
4158         return flags;
4159     }
4160 
4161     /* Set all of the requested bits.  */
4162     for (i = 0; i < count / 64; ++i) {
4163         d->p[i] = esz_mask;
4164     }
4165     if (count & 63) {
4166         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4167     }
4168 
4169     return predtest_ones(d, oprsz, esz_mask);
4170 }
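/*
 * Illustrative example (hypothetical operand values): with esz = 1
 * (16-bit elements, so esz_mask = 0x5555555555555555) and count = 12,
 * the whole-word loop is skipped and the tail computes
 *   d->p[0] = MAKE_64BIT_MASK(0, 12) & esz_mask = 0x0555,
 * i.e. the six lowest 16-bit elements are active; predtest_ones()
 * then derives NZCV from that pattern.
 */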
4171 
4172 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4173 {
4174     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4175     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4176     uint64_t esz_mask = pred_esz_masks[esz];
4177     ARMPredicateReg *d = vd;
4178     intptr_t i, invcount, oprbits;
4179     uint64_t bits;
4180 
4181     if (count == 0) {
4182         return do_zero(d, oprsz);
4183     }
4184 
4185     oprbits = oprsz * 8;
4186     tcg_debug_assert(count <= oprbits);
4187 
4188     bits = esz_mask;
4189     if (oprbits & 63) {
4190         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4191     }
4192 
4193     invcount = oprbits - count;
4194     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4195         d->p[i] = bits;
4196         bits = esz_mask;
4197     }
4198 
4199     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4200 
4201     while (--i >= 0) {
4202         d->p[i] = 0;
4203     }
4204 
4205     return predtest_ones(d, oprsz, esz_mask);
4206 }
4207 
4208 /* Recursive reduction using a binary function;
4209  * cf. the ARM ARM function ReducePredicated.
4210  *
4211  * While it would be possible to write this without the DATA temporary,
4212  * it is much simpler to process the predicate register this way.
4213  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4214  * little to gain with a more complex non-recursive form.
4215  */
4216 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4217 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4218 {                                                                     \
4219     if (n == 1) {                                                     \
4220         return *data;                                                 \
4221     } else {                                                          \
4222         uintptr_t half = n / 2;                                       \
4223         TYPE lo = NAME##_reduce(data, status, half);                  \
4224         TYPE hi = NAME##_reduce(data + half, status, half);           \
4225         return FUNC(lo, hi, status);                                  \
4226     }                                                                 \
4227 }                                                                     \
4228 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4229 {                                                                     \
4230     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4231     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4232     for (i = 0; i < oprsz; ) {                                        \
4233         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4234         do {                                                          \
4235             TYPE nn = *(TYPE *)(vn + H(i));                           \
4236             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4237             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4238         } while (i & 15);                                             \
4239     }                                                                 \
4240     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4241         *(TYPE *)((void *)data + i) = IDENT;                          \
4242     }                                                                 \
4243     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
4244 }
4245 
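/*
 * Illustrative shape of the reduction for four elements:
 *   NAME##_reduce(data, s, 4) = FUNC(FUNC(d0, d1, s), FUNC(d2, d3, s), s)
 * Inactive lanes and the tail up to MAXSZ are filled with IDENT, and
 * MAXSZ is expected to be a power of two so the halving bottoms out at
 * n == 1.  The result is a balanced-tree combination rather than a
 * strictly ordered left-to-right accumulation (contrast FADDA below).
 */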
4246 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4247 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4248 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4249 
4250 /* Identity is floatN_default_nan, without the function call.  */
4251 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4252 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4253 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4254 
4255 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4256 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4257 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4258 
4259 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4260 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4261 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4262 
4263 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4264 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4265 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4266 
4267 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4268 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4269 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4270 
4271 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4272           float16_chs(float16_infinity))
4273 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4274           float32_chs(float32_infinity))
4275 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4276           float64_chs(float64_infinity))
4277 
4278 #undef DO_REDUCE
4279 
4280 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4281                              float_status *status, uint32_t desc)
4282 {
4283     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4284     float16 result = nn;
4285 
4286     do {
4287         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4288         do {
4289             if (pg & 1) {
4290                 float16 mm = *(float16 *)(vm + H1_2(i));
4291                 result = float16_add(result, mm, status);
4292             }
4293             i += sizeof(float16), pg >>= sizeof(float16);
4294         } while (i & 15);
4295     } while (i < opr_sz);
4296 
4297     return result;
4298 }
4299 
4300 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4301                              float_status *status, uint32_t desc)
4302 {
4303     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4304     float32 result = nn;
4305 
4306     do {
4307         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4308         do {
4309             if (pg & 1) {
4310                 float32 mm = *(float32 *)(vm + H1_2(i));
4311                 result = float32_add(result, mm, status);
4312             }
4313             i += sizeof(float32), pg >>= sizeof(float32);
4314         } while (i & 15);
4315     } while (i < opr_sz);
4316 
4317     return result;
4318 }
4319 
4320 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4321                              float_status *status, uint32_t desc)
4322 {
4323     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4324     uint64_t *m = vm;
4325     uint8_t *pg = vg;
4326 
4327     for (i = 0; i < opr_sz; i++) {
4328         if (pg[H1(i)] & 1) {
4329             nn = float64_add(nn, m[i], status);
4330         }
4331     }
4332 
4333     return nn;
4334 }
4335 
4336 /* Fully general three-operand expander, controlled by a predicate,
4337  * with the extra float_status parameter.
4338  */
4339 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4340 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4341                   float_status *status, uint32_t desc)          \
4342 {                                                               \
4343     intptr_t i = simd_oprsz(desc);                              \
4344     uint64_t *g = vg;                                           \
4345     do {                                                        \
4346         uint64_t pg = g[(i - 1) >> 6];                          \
4347         do {                                                    \
4348             i -= sizeof(TYPE);                                  \
4349             if (likely((pg >> (i & 63)) & 1)) {                 \
4350                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4351                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4352                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4353             }                                                   \
4354         } while (i & 63);                                       \
4355     } while (i != 0);                                           \
4356 }
4357 
4358 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4359 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4360 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4361 
4362 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4363 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4364 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4365 
4366 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4367 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4368 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4369 
4370 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4371 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4372 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4373 
4374 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4375 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4376 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4377 
4378 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4379 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4380 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4381 
4382 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4383 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4384 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4385 
4386 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4387 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4388 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4389 
4390 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4391 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4392 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4393 
4394 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4395 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4396 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4397 
4398 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4399 {
4400     return float16_abs(float16_sub(a, b, s));
4401 }
4402 
4403 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4404 {
4405     return float32_abs(float32_sub(a, b, s));
4406 }
4407 
4408 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4409 {
4410     return float64_abs(float64_sub(a, b, s));
4411 }
4412 
4413 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4414 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4415 {
4416     float16 r = float16_sub(op1, op2, stat);
4417     return float16_is_any_nan(r) ? r : float16_abs(r);
4418 }
4419 
4420 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4421 {
4422     float32 r = float32_sub(op1, op2, stat);
4423     return float32_is_any_nan(r) ? r : float32_abs(r);
4424 }
4425 
4426 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4427 {
4428     float64 r = float64_sub(op1, op2, stat);
4429     return float64_is_any_nan(r) ? r : float64_abs(r);
4430 }
4431 
4432 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4433 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4434 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4435 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4436 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4437 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4438 
4439 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4440 {
4441     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4442     return float64_scalbn(a, b_int, s);
4443 }
4444 
4445 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4446 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4447 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4448 
4449 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4450 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4451 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4452 
4453 #undef DO_ZPZZ_FP
4454 
4455 /* Three-operand expander, with one scalar operand, controlled by
4456  * a predicate, with the extra float_status parameter.
4457  */
4458 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4459 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4460                   float_status *status, uint32_t desc)            \
4461 {                                                                 \
4462     intptr_t i = simd_oprsz(desc);                                \
4463     uint64_t *g = vg;                                             \
4464     TYPE mm = scalar;                                             \
4465     do {                                                          \
4466         uint64_t pg = g[(i - 1) >> 6];                            \
4467         do {                                                      \
4468             i -= sizeof(TYPE);                                    \
4469             if (likely((pg >> (i & 63)) & 1)) {                   \
4470                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4471                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4472             }                                                     \
4473         } while (i & 63);                                         \
4474     } while (i != 0);                                             \
4475 }
4476 
4477 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4478 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4479 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4480 
4481 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4482 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4483 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4484 
4485 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4486 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4487 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4488 
4489 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4490 {
4491     return float16_sub(b, a, s);
4492 }
4493 
4494 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4495 {
4496     return float32_sub(b, a, s);
4497 }
4498 
4499 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4500 {
4501     return float64_sub(b, a, s);
4502 }
4503 
4504 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4505 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4506 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4507 
4508 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4509 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4510 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4511 
4512 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4513 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4514 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4515 
4516 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4517 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4518 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4519 
4520 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4521 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4522 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4523 
4524 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4525 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4526 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4527 
4528 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4529 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4530 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4531 
4532 /* Fully general two-operand expander, controlled by a predicate,
4533  * with the extra float_status parameter.
4534  */
4535 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4536 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4537                   float_status *status, uint32_t desc)                \
4538 {                                                                     \
4539     intptr_t i = simd_oprsz(desc);                                    \
4540     uint64_t *g = vg;                                                 \
4541     do {                                                              \
4542         uint64_t pg = g[(i - 1) >> 6];                                \
4543         do {                                                          \
4544             i -= sizeof(TYPE);                                        \
4545             if (likely((pg >> (i & 63)) & 1)) {                       \
4546                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4547                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4548             }                                                         \
4549         } while (i & 63);                                             \
4550     } while (i != 0);                                                 \
4551 }
4552 
4553 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4554  * FZ16.  When converting from fp16, this affects flushing input denormals;
4555  * when converting to fp16, this affects flushing output denormals.
4556  */
4557 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4558 {
4559     bool save = get_flush_inputs_to_zero(fpst);
4560     float32 ret;
4561 
4562     set_flush_inputs_to_zero(false, fpst);
4563     ret = float16_to_float32(f, true, fpst);
4564     set_flush_inputs_to_zero(save, fpst);
4565     return ret;
4566 }
4567 
4568 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4569 {
4570     bool save = get_flush_inputs_to_zero(fpst);
4571     float64 ret;
4572 
4573     set_flush_inputs_to_zero(false, fpst);
4574     ret = float16_to_float64(f, true, fpst);
4575     set_flush_inputs_to_zero(save, fpst);
4576     return ret;
4577 }
4578 
4579 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4580 {
4581     bool save = get_flush_to_zero(fpst);
4582     float16 ret;
4583 
4584     set_flush_to_zero(false, fpst);
4585     ret = float32_to_float16(f, true, fpst);
4586     set_flush_to_zero(save, fpst);
4587     return ret;
4588 }
4589 
4590 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4591 {
4592     bool save = get_flush_to_zero(fpst);
4593     float16 ret;
4594 
4595     set_flush_to_zero(false, fpst);
4596     ret = float64_to_float16(f, true, fpst);
4597     set_flush_to_zero(save, fpst);
4598     return ret;
4599 }
4600 
4601 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4602 {
4603     if (float16_is_any_nan(f)) {
4604         float_raise(float_flag_invalid, s);
4605         return 0;
4606     }
4607     return float16_to_int16_round_to_zero(f, s);
4608 }
4609 
4610 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4611 {
4612     if (float16_is_any_nan(f)) {
4613         float_raise(float_flag_invalid, s);
4614         return 0;
4615     }
4616     return float16_to_int64_round_to_zero(f, s);
4617 }
4618 
4619 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4620 {
4621     if (float32_is_any_nan(f)) {
4622         float_raise(float_flag_invalid, s);
4623         return 0;
4624     }
4625     return float32_to_int64_round_to_zero(f, s);
4626 }
4627 
4628 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4629 {
4630     if (float64_is_any_nan(f)) {
4631         float_raise(float_flag_invalid, s);
4632         return 0;
4633     }
4634     return float64_to_int64_round_to_zero(f, s);
4635 }
4636 
4637 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4638 {
4639     if (float16_is_any_nan(f)) {
4640         float_raise(float_flag_invalid, s);
4641         return 0;
4642     }
4643     return float16_to_uint16_round_to_zero(f, s);
4644 }
4645 
4646 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4647 {
4648     if (float16_is_any_nan(f)) {
4649         float_raise(float_flag_invalid, s);
4650         return 0;
4651     }
4652     return float16_to_uint64_round_to_zero(f, s);
4653 }
4654 
4655 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4656 {
4657     if (float32_is_any_nan(f)) {
4658         float_raise(float_flag_invalid, s);
4659         return 0;
4660     }
4661     return float32_to_uint64_round_to_zero(f, s);
4662 }
4663 
4664 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4665 {
4666     if (float64_is_any_nan(f)) {
4667         float_raise(float_flag_invalid, s);
4668         return 0;
4669     }
4670     return float64_to_uint64_round_to_zero(f, s);
4671 }
4672 
4673 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4674 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4675 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4676 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4677 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4678 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4679 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4680 
4681 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4682 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4683 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4684 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4685 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4686 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4687 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4688 
4689 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4690 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4691 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4692 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4693 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4694 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4695 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4696 
4697 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4698 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4699 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4700 
4701 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4702 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4703 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4704 
4705 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4706 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4707 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4708 
4709 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4710 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4711 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4712 
4713 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4714 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4715 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4716 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4717 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4718 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4719 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4720 
4721 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4722 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4723 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4724 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4725 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4726 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4727 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4728 
4729 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4730 {
4731     /* Extract frac to the top of the uint32_t. */
4732     uint32_t frac = (uint32_t)a << (16 + 6);
4733     int16_t exp = extract32(a, 10, 5);
4734 
4735     if (unlikely(exp == 0)) {
4736         if (frac != 0) {
4737             if (!get_flush_inputs_to_zero(s)) {
4738                 /* denormal: bias - fractional_zeros */
4739                 return -15 - clz32(frac);
4740             }
4741             /* flush to zero */
4742             float_raise(float_flag_input_denormal_flushed, s);
4743         }
4744     } else if (unlikely(exp == 0x1f)) {
4745         if (frac == 0) {
4746             return INT16_MAX; /* infinity */
4747         }
4748     } else {
4749         /* normal: exp - bias */
4750         return exp - 15;
4751     }
4752     /* nan or zero */
4753     float_raise(float_flag_invalid, s);
4754     return INT16_MIN;
4755 }
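/*
 * Worked fp16 examples (values chosen for this comment):
 *   a = 0x4800 (8.0):    exp = 18, frac = 0            ->  18 - 15 = 3
 *   a = 0x0001 (2^-24):  exp = 0,  frac = 1 << 22,
 *                        clz32(frac) = 9               -> -15 - 9 = -24
 *                        (denormal path, inputs not flushed to zero)
 * Infinity saturates to INT16_MAX; NaN and zero raise float_flag_invalid
 * and return INT16_MIN.  The 32-bit and 64-bit variants below follow
 * the same structure.
 */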
4756 
4757 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4758 {
4759     /* Extract frac to the top of the uint32_t. */
4760     uint32_t frac = a << 9;
4761     int32_t exp = extract32(a, 23, 8);
4762 
4763     if (unlikely(exp == 0)) {
4764         if (frac != 0) {
4765             if (!get_flush_inputs_to_zero(s)) {
4766                 /* denormal: bias - fractional_zeros */
4767                 return -127 - clz32(frac);
4768             }
4769             /* flush to zero */
4770             float_raise(float_flag_input_denormal_flushed, s);
4771         }
4772     } else if (unlikely(exp == 0xff)) {
4773         if (frac == 0) {
4774             return INT32_MAX; /* infinity */
4775         }
4776     } else {
4777         /* normal: exp - bias */
4778         return exp - 127;
4779     }
4780     /* nan or zero */
4781     float_raise(float_flag_invalid, s);
4782     return INT32_MIN;
4783 }
4784 
4785 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4786 {
4787     /* Extract frac to the top of the uint64_t. */
4788     uint64_t frac = a << 12;
4789     int64_t exp = extract64(a, 52, 11);
4790 
4791     if (unlikely(exp == 0)) {
4792         if (frac != 0) {
4793             if (!get_flush_inputs_to_zero(s)) {
4794                 /* denormal: bias - fractional_zeros */
4795                 return -1023 - clz64(frac);
4796             }
4797             /* flush to zero */
4798             float_raise(float_flag_input_denormal_flushed, s);
4799         }
4800     } else if (unlikely(exp == 0x7ff)) {
4801         if (frac == 0) {
4802             return INT64_MAX; /* infinity */
4803         }
4804     } else {
4805         /* normal: exp - bias */
4806         return exp - 1023;
4807     }
4808     /* nan or zero */
4809     float_raise(float_flag_invalid, s);
4810     return INT64_MIN;
4811 }
4812 
4813 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4814 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4815 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4816 
4817 #undef DO_ZPZ_FP
4818 
4819 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4820                             float_status *status, uint32_t desc,
4821                             uint16_t neg1, uint16_t neg3, int flags)
4822 {
4823     intptr_t i = simd_oprsz(desc);
4824     uint64_t *g = vg;
4825 
4826     do {
4827         uint64_t pg = g[(i - 1) >> 6];
4828         do {
4829             i -= 2;
4830             if (likely((pg >> (i & 63)) & 1)) {
4831                 float16 e1, e2, e3, r;
4832 
4833                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4834                 e2 = *(uint16_t *)(vm + H1_2(i));
4835                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4836                 r = float16_muladd(e1, e2, e3, flags, status);
4837                 *(uint16_t *)(vd + H1_2(i)) = r;
4838             }
4839         } while (i & 63);
4840     } while (i != 0);
4841 }
4842 
4843 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4844                               void *vg, float_status *status, uint32_t desc)
4845 {
4846     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4847 }
4848 
4849 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4850                               void *vg, float_status *status, uint32_t desc)
4851 {
4852     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4853 }
4854 
4855 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4856                                void *vg, float_status *status, uint32_t desc)
4857 {
4858     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4859 }
4860 
4861 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4862                                void *vg, float_status *status, uint32_t desc)
4863 {
4864     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4865 }
4866 
4867 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4868                               void *vg, float_status *status, uint32_t desc)
4869 {
4870     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4871                     float_muladd_negate_product);
4872 }
4873 
4874 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4875                                void *vg, float_status *status, uint32_t desc)
4876 {
4877     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4878                     float_muladd_negate_product | float_muladd_negate_c);
4879 }
4880 
4881 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4882                                void *vg, float_status *status, uint32_t desc)
4883 {
4884     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4885                     float_muladd_negate_c);
4886 }
4887 
4888 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4889                             float_status *status, uint32_t desc,
4890                             uint32_t neg1, uint32_t neg3, int flags)
4891 {
4892     intptr_t i = simd_oprsz(desc);
4893     uint64_t *g = vg;
4894 
4895     do {
4896         uint64_t pg = g[(i - 1) >> 6];
4897         do {
4898             i -= 4;
4899             if (likely((pg >> (i & 63)) & 1)) {
4900                 float32 e1, e2, e3, r;
4901 
4902                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4903                 e2 = *(uint32_t *)(vm + H1_4(i));
4904                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4905                 r = float32_muladd(e1, e2, e3, flags, status);
4906                 *(uint32_t *)(vd + H1_4(i)) = r;
4907             }
4908         } while (i & 63);
4909     } while (i != 0);
4910 }
4911 
4912 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4913                               void *vg, float_status *status, uint32_t desc)
4914 {
4915     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4916 }
4917 
4918 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4919                               void *vg, float_status *status, uint32_t desc)
4920 {
4921     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4922 }
4923 
4924 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4925                                void *vg, float_status *status, uint32_t desc)
4926 {
4927     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4928 }
4929 
4930 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4931                                void *vg, float_status *status, uint32_t desc)
4932 {
4933     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4934 }
4935 
4936 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4937                               void *vg, float_status *status, uint32_t desc)
4938 {
4939     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4940                     float_muladd_negate_product);
4941 }
4942 
4943 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4944                                void *vg, float_status *status, uint32_t desc)
4945 {
4946     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4947                     float_muladd_negate_product | float_muladd_negate_c);
4948 }
4949 
4950 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4951                                void *vg, float_status *status, uint32_t desc)
4952 {
4953     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4954                     float_muladd_negate_c);
4955 }
4956 
4957 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4958                             float_status *status, uint32_t desc,
4959                             uint64_t neg1, uint64_t neg3, int flags)
4960 {
4961     intptr_t i = simd_oprsz(desc);
4962     uint64_t *g = vg;
4963 
4964     do {
4965         uint64_t pg = g[(i - 1) >> 6];
4966         do {
4967             i -= 8;
4968             if (likely((pg >> (i & 63)) & 1)) {
4969                 float64 e1, e2, e3, r;
4970 
4971                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4972                 e2 = *(uint64_t *)(vm + i);
4973                 e3 = *(uint64_t *)(va + i) ^ neg3;
4974                 r = float64_muladd(e1, e2, e3, flags, status);
4975                 *(uint64_t *)(vd + i) = r;
4976             }
4977         } while (i & 63);
4978     } while (i != 0);
4979 }
4980 
4981 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4982                               void *vg, float_status *status, uint32_t desc)
4983 {
4984     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4985 }
4986 
4987 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4988                               void *vg, float_status *status, uint32_t desc)
4989 {
4990     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4991 }
4992 
4993 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4994                                void *vg, float_status *status, uint32_t desc)
4995 {
4996     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4997 }
4998 
4999 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5000                                void *vg, float_status *status, uint32_t desc)
5001 {
5002     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5003 }
5004 
5005 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5006                               void *vg, float_status *status, uint32_t desc)
5007 {
5008     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5009                     float_muladd_negate_product);
5010 }
5011 
5012 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5013                                void *vg, float_status *status, uint32_t desc)
5014 {
5015     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5016                     float_muladd_negate_product | float_muladd_negate_c);
5017 }
5018 
5019 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5020                                void *vg, float_status *status, uint32_t desc)
5021 {
5022     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5023                     float_muladd_negate_c);
5024 }
5025 
5026 /* Two operand floating-point comparison controlled by a predicate.
5027  * Unlike the integer version, we are not allowed to optimistically
5028  * compare operands, since the comparison may have side effects wrt
5029  * the FPSR.
5030  */
5031 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5032 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5033                   float_status *status, uint32_t desc)                  \
5034 {                                                                       \
5035     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5036     uint64_t *d = vd, *g = vg;                                          \
5037     do {                                                                \
5038         uint64_t out = 0, pg = g[j];                                    \
5039         do {                                                            \
5040             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5041             if (likely((pg >> (i & 63)) & 1)) {                         \
5042                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5043                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5044                 out |= OP(TYPE, nn, mm, status);                        \
5045             }                                                           \
5046         } while (i & 63);                                               \
5047         d[j--] = out;                                                   \
5048     } while (i > 0);                                                    \
5049 }
5050 
5051 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5052     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5053 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5054     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5055 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5056     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5057 
5058 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5059     DO_FPCMP_PPZZ_H(NAME, OP)   \
5060     DO_FPCMP_PPZZ_S(NAME, OP)   \
5061     DO_FPCMP_PPZZ_D(NAME, OP)
5062 
5063 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5064 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5065 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5066 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5067 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5068 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5069 #define DO_FCMUO(TYPE, X, Y, ST)  \
5070     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5071 #define DO_FACGE(TYPE, X, Y, ST)  \
5072     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5073 #define DO_FACGT(TYPE, X, Y, ST)  \
5074     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
5075 
5076 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5077 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5078 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5079 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5080 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5081 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5082 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5083 
5084 #undef DO_FPCMP_PPZZ_ALL
5085 #undef DO_FPCMP_PPZZ_D
5086 #undef DO_FPCMP_PPZZ_S
5087 #undef DO_FPCMP_PPZZ_H
5088 #undef DO_FPCMP_PPZZ
5089 
5090 /* One operand floating-point comparison against zero, controlled
5091  * by a predicate.
5092  */
5093 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5094 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5095                   float_status *status, uint32_t desc)     \
5096 {                                                          \
5097     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5098     uint64_t *d = vd, *g = vg;                             \
5099     do {                                                   \
5100         uint64_t out = 0, pg = g[j];                       \
5101         do {                                               \
5102             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5103             if ((pg >> (i & 63)) & 1) {                    \
5104                 TYPE nn = *(TYPE *)(vn + H(i));            \
5105                 out |= OP(TYPE, nn, 0, status);            \
5106             }                                              \
5107         } while (i & 63);                                  \
5108         d[j--] = out;                                      \
5109     } while (i > 0);                                       \
5110 }
5111 
5112 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5113     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5114 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5115     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5116 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5117     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5118 
5119 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5120     DO_FPCMP_PPZ0_H(NAME, OP)   \
5121     DO_FPCMP_PPZ0_S(NAME, OP)   \
5122     DO_FPCMP_PPZ0_D(NAME, OP)
5123 
5124 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5126 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5127 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5128 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5129 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5130 
5131 /* FP Trig Multiply-Add. */
5132 
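/*
 * Each coefficient table below holds two groups of eight constants;
 * X selects the term within a group and a negative M selects the
 * second group (XX += 8).  These appear to be the architected FPTrigMAdd
 * coefficients (terms of the sin/cos series expansions), but the helpers
 * only index the tables, so nothing below depends on that interpretation.
 */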
5133 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5134                          float_status *s, uint32_t desc)
5135 {
5136     static const float16 coeff[16] = {
5137         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5138         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5139     };
5140     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5141     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5142     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5143     float16 *d = vd, *n = vn, *m = vm;
5144 
5145     for (i = 0; i < opr_sz; i++) {
5146         float16 mm = m[i];
5147         intptr_t xx = x;
5148         int flags = 0;
5149 
5150         if (float16_is_neg(mm)) {
5151             if (fpcr_ah) {
5152                 flags = float_muladd_negate_product;
5153             } else {
5154                 mm = float16_abs(mm);
5155             }
5156             xx += 8;
5157         }
5158         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5159     }
5160 }
5161 
5162 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5163                          float_status *s, uint32_t desc)
5164 {
5165     static const float32 coeff[16] = {
5166         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5167         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5168         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5169         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5170     };
5171     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5172     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5173     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5174     float32 *d = vd, *n = vn, *m = vm;
5175 
5176     for (i = 0; i < opr_sz; i++) {
5177         float32 mm = m[i];
5178         intptr_t xx = x;
5179         int flags = 0;
5180 
5181         if (float32_is_neg(mm)) {
5182             if (fpcr_ah) {
5183                 flags = float_muladd_negate_product;
5184             } else {
5185                 mm = float32_abs(mm);
5186             }
5187             xx += 8;
5188         }
5189         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5190     }
5191 }
5192 
5193 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5194                          float_status *s, uint32_t desc)
5195 {
5196     static const float64 coeff[16] = {
5197         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5198         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5199         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5200         0x3de5d8408868552full, 0x0000000000000000ull,
5201         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5202         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5203         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5204         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5205     };
5206     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5207     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5208     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5209     float64 *d = vd, *n = vn, *m = vm;
5210 
5211     for (i = 0; i < opr_sz; i++) {
5212         float64 mm = m[i];
5213         intptr_t xx = x;
5214         int flags = 0;
5215 
5216         if (float64_is_neg(mm)) {
5217             if (fpcr_ah) {
5218                 flags = float_muladd_negate_product;
5219             } else {
5220                 mm = float64_abs(mm);
5221             }
5222             xx += 8;
5223         }
5224         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5225     }
5226 }
5227 
5228 /*
5229  * FP Complex Add
5230  */
5231 
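/*
 * Each adjacent pair of elements is treated as (real, imag); ROT picks
 * which VM element is negated before the add, so per pair:
 *   rot == 0:  d.real = n.real - m.imag;  d.imag = n.imag + m.real
 *   rot == 1:  d.real = n.real + m.imag;  d.imag = n.imag - m.real
 * i.e. VM is rotated by 90 or 270 degrees in the complex plane before
 * being added.  With FPCR.AH set, float*_maybe_ah_chs() is expected to
 * leave NaN inputs unchanged rather than flip their sign (cf. ah_abd_*
 * above).
 */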
5232 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5233                          float_status *s, uint32_t desc)
5234 {
5235     intptr_t j, i = simd_oprsz(desc);
5236     uint64_t *g = vg;
5237     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5238     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5239 
5240     do {
5241         uint64_t pg = g[(i - 1) >> 6];
5242         do {
5243             float16 e0, e1, e2, e3;
5244 
5245             /* I holds the real index; J holds the imag index.  */
5246             j = i - sizeof(float16);
5247             i -= 2 * sizeof(float16);
5248 
5249             e0 = *(float16 *)(vn + H1_2(i));
5250             e1 = *(float16 *)(vm + H1_2(j));
5251             e2 = *(float16 *)(vn + H1_2(j));
5252             e3 = *(float16 *)(vm + H1_2(i));
5253 
5254             if (rot) {
5255                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5256             } else {
5257                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5258             }
5259 
5260             if (likely((pg >> (i & 63)) & 1)) {
5261                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5262             }
5263             if (likely((pg >> (j & 63)) & 1)) {
5264                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5265             }
5266         } while (i & 63);
5267     } while (i != 0);
5268 }
5269 
5270 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5271                          float_status *s, uint32_t desc)
5272 {
5273     intptr_t j, i = simd_oprsz(desc);
5274     uint64_t *g = vg;
5275     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5276     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5277 
5278     do {
5279         uint64_t pg = g[(i - 1) >> 6];
5280         do {
5281             float32 e0, e1, e2, e3;
5282 
5283             /* I holds the real index; J holds the imag index.  */
5284             j = i - sizeof(float32);
5285             i -= 2 * sizeof(float32);
5286 
5287             e0 = *(float32 *)(vn + H1_2(i));
5288             e1 = *(float32 *)(vm + H1_2(j));
5289             e2 = *(float32 *)(vn + H1_2(j));
5290             e3 = *(float32 *)(vm + H1_2(i));
5291 
5292             if (rot) {
5293                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5294             } else {
5295                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5296             }
5297 
5298             if (likely((pg >> (i & 63)) & 1)) {
5299                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5300             }
5301             if (likely((pg >> (j & 63)) & 1)) {
5302                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5303             }
5304         } while (i & 63);
5305     } while (i != 0);
5306 }
5307 
5308 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5309                          float_status *s, uint32_t desc)
5310 {
5311     intptr_t j, i = simd_oprsz(desc);
5312     uint64_t *g = vg;
5313     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5314     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5315 
5316     do {
5317         uint64_t pg = g[(i - 1) >> 6];
5318         do {
5319             float64 e0, e1, e2, e3;
5320 
5321             /* I holds the real index; J holds the imag index.  */
5322             j = i - sizeof(float64);
5323             i -= 2 * sizeof(float64);
5324 
5325             e0 = *(float64 *)(vn + H1_2(i));
5326             e1 = *(float64 *)(vm + H1_2(j));
5327             e2 = *(float64 *)(vn + H1_2(j));
5328             e3 = *(float64 *)(vm + H1_2(i));
5329 
5330             if (rot) {
5331                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5332             } else {
5333                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5334             }
5335 
5336             if (likely((pg >> (i & 63)) & 1)) {
5337                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5338             }
5339             if (likely((pg >> (j & 63)) & 1)) {
5340                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5341             }
5342         } while (i & 63);
5343     } while (i != 0);
5344 }
5345 
5346 /*
5347  * FP Complex Multiply
5348  */
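/*
 * Rough reading of the descriptor bits used below: bit 0 (flip) selects
 * whether the multiplicand taken from the vn operand is the real or the
 * imaginary element, bit 1 (negf_imag) requests negation of the product
 * added into the imaginary lane, and negf_real = flip ^ negf_imag covers
 * the real lane; together these are the knobs that implement the four
 * FCMLA rotations.  With FPCR.AH=0 the negation is applied by XOR-ing the
 * sign bit of the vm operand (negx_*); with AH=1 it is instead folded
 * into float*_muladd as float_muladd_negate_product, so NaN operands are
 * not modified.
 */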
5349 
5350 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5351                                void *vg, float_status *status, uint32_t desc)
5352 {
5353     intptr_t j, i = simd_oprsz(desc);
5354     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5355     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5356     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5357     uint32_t negf_real = flip ^ negf_imag;
5358     float16 negx_imag, negx_real;
5359     uint64_t *g = vg;
5360 
5361     /* With AH=0, use negx; with AH=1 use negf. */
5362     negx_real = (negf_real & ~fpcr_ah) << 15;
5363     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5364     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5365     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5366 
5367     do {
5368         uint64_t pg = g[(i - 1) >> 6];
5369         do {
5370             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5371 
5372             /* I holds the real index; J holds the imag index.  */
5373             j = i - sizeof(float16);
5374             i -= 2 * sizeof(float16);
5375 
5376             nr = *(float16 *)(vn + H1_2(i));
5377             ni = *(float16 *)(vn + H1_2(j));
5378             mr = *(float16 *)(vm + H1_2(i));
5379             mi = *(float16 *)(vm + H1_2(j));
5380 
5381             e2 = (flip ? ni : nr);
5382             e1 = (flip ? mi : mr) ^ negx_real;
5383             e4 = e2;
5384             e3 = (flip ? mr : mi) ^ negx_imag;
5385 
5386             if (likely((pg >> (i & 63)) & 1)) {
5387                 d = *(float16 *)(va + H1_2(i));
5388                 d = float16_muladd(e2, e1, d, negf_real, status);
5389                 *(float16 *)(vd + H1_2(i)) = d;
5390             }
5391             if (likely((pg >> (j & 63)) & 1)) {
5392                 d = *(float16 *)(va + H1_2(j));
5393                 d = float16_muladd(e4, e3, d, negf_imag, status);
5394                 *(float16 *)(vd + H1_2(j)) = d;
5395             }
5396         } while (i & 63);
5397     } while (i != 0);
5398 }
5399 
5400 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5401                                void *vg, float_status *status, uint32_t desc)
5402 {
5403     intptr_t j, i = simd_oprsz(desc);
5404     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5405     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5406     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5407     uint32_t negf_real = flip ^ negf_imag;
5408     float32 negx_imag, negx_real;
5409     uint64_t *g = vg;
5410 
5411     /* With AH=0, use negx; with AH=1 use negf. */
5412     negx_real = (negf_real & ~fpcr_ah) << 31;
5413     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5414     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5415     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5416 
5417     do {
5418         uint64_t pg = g[(i - 1) >> 6];
5419         do {
5420             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5421 
5422             /* I holds the real index; J holds the imag index.  */
5423             j = i - sizeof(float32);
5424             i -= 2 * sizeof(float32);
5425 
5426             nr = *(float32 *)(vn + H1_2(i));
5427             ni = *(float32 *)(vn + H1_2(j));
5428             mr = *(float32 *)(vm + H1_2(i));
5429             mi = *(float32 *)(vm + H1_2(j));
5430 
5431             e2 = (flip ? ni : nr);
5432             e1 = (flip ? mi : mr) ^ negx_real;
5433             e4 = e2;
5434             e3 = (flip ? mr : mi) ^ negx_imag;
5435 
5436             if (likely((pg >> (i & 63)) & 1)) {
5437                 d = *(float32 *)(va + H1_2(i));
5438                 d = float32_muladd(e2, e1, d, negf_real, status);
5439                 *(float32 *)(vd + H1_2(i)) = d;
5440             }
5441             if (likely((pg >> (j & 63)) & 1)) {
5442                 d = *(float32 *)(va + H1_2(j));
5443                 d = float32_muladd(e4, e3, d, negf_imag, status);
5444                 *(float32 *)(vd + H1_2(j)) = d;
5445             }
5446         } while (i & 63);
5447     } while (i != 0);
5448 }
5449 
5450 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5451                                void *vg, float_status *status, uint32_t desc)
5452 {
5453     intptr_t j, i = simd_oprsz(desc);
5454     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5455     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5456     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5457     uint32_t negf_real = flip ^ negf_imag;
5458     float64 negx_imag, negx_real;
5459     uint64_t *g = vg;
5460 
5461     /* With AH=0, use negx; with AH=1 use negf. */
5462     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5463     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5464     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5465     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5466 
5467     do {
5468         uint64_t pg = g[(i - 1) >> 6];
5469         do {
5470             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5471 
5472             /* I holds the real index; J holds the imag index.  */
5473             j = i - sizeof(float64);
5474             i -= 2 * sizeof(float64);
5475 
5476             nr = *(float64 *)(vn + H1_2(i));
5477             ni = *(float64 *)(vn + H1_2(j));
5478             mr = *(float64 *)(vm + H1_2(i));
5479             mi = *(float64 *)(vm + H1_2(j));
5480 
5481             e2 = (flip ? ni : nr);
5482             e1 = (flip ? mi : mr) ^ negx_real;
5483             e4 = e2;
5484             e3 = (flip ? mr : mi) ^ negx_imag;
5485 
5486             if (likely((pg >> (i & 63)) & 1)) {
5487                 d = *(float64 *)(va + H1_2(i));
5488                 d = float64_muladd(e2, e1, d, negf_real, status);
5489                 *(float64 *)(vd + H1_2(i)) = d;
5490             }
5491             if (likely((pg >> (j & 63)) & 1)) {
5492                 d = *(float64 *)(va + H1_2(j));
5493                 d = float64_muladd(e4, e3, d, negf_imag, status);
5494                 *(float64 *)(vd + H1_2(j)) = d;
5495             }
5496         } while (i & 63);
5497     } while (i != 0);
5498 }
5499 
5500 /*
5501  * Load contiguous data, protected by a governing predicate.
5502  */
5503 
5504 /*
5505  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5506  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5507  * element >= @reg_off, or @reg_max if there were no active elements at all.
5508  */
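/*
 * For example, with esz == MO_32 only every fourth predicate bit is
 * significant; if reg_off == 0 and the predicate word is 0x10, the first
 * active 4-byte element starts at byte offset 4, which is what the
 * ctz64-based scan below returns.
 */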
5509 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5510                                  intptr_t reg_max, int esz)
5511 {
5512     uint64_t pg_mask = pred_esz_masks[esz];
5513     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5514 
5515     /* In normal usage, the first element is active.  */
5516     if (likely(pg & 1)) {
5517         return reg_off;
5518     }
5519 
5520     if (pg == 0) {
5521         reg_off &= -64;
5522         do {
5523             reg_off += 64;
5524             if (unlikely(reg_off >= reg_max)) {
5525                 /* The entire predicate was false.  */
5526                 return reg_max;
5527             }
5528             pg = vg[reg_off >> 6] & pg_mask;
5529         } while (pg == 0);
5530     }
5531     reg_off += ctz64(pg);
5532 
5533     /* We should never see an out of range predicate bit set.  */
5534     tcg_debug_assert(reg_off < reg_max);
5535     return reg_off;
5536 }
5537 
5538 /*
5539  * Resolve the guest virtual address to info->host and info->flags.
5540  * If @nofault, return false if the page is invalid, otherwise
5541  * exit via page fault exception.
5542  */
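/*
 * Note for callers: on success, and for pages with a valid host mapping,
 * info->host is adjusted so that info->host + mem_off is the host address
 * of the probed byte (i.e. info->host corresponds to @addr itself), and
 * info->flags, info->attrs and info->tagged describe the page.
 */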
5543 
5544 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5545                     target_ulong addr, int mem_off, MMUAccessType access_type,
5546                     int mmu_idx, uintptr_t retaddr)
5547 {
5548     int flags;
5549 
5550     addr += mem_off;
5551 
5552     /*
5553      * User-only currently always issues with TBI.  See the comment
5554      * above useronly_clean_ptr.  Usually we clean this top byte away
5555      * during translation, but we can't do that for e.g. vector + imm
5556      * addressing modes.
5557      *
5558      * We currently always enable TBI for user-only, and do not provide
5559      * a way to turn it off.  So clean the pointer unconditionally here,
5560      * rather than look it up here, or pass it down from above.
5561      */
5562     addr = useronly_clean_ptr(addr);
5563 
5564 #ifdef CONFIG_USER_ONLY
5565     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5566                                &info->host, retaddr);
5567 #else
5568     CPUTLBEntryFull *full;
5569     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5570                               &info->host, &full, retaddr);
5571 #endif
5572     info->flags = flags;
5573 
5574     if (flags & TLB_INVALID_MASK) {
5575         g_assert(nofault);
5576         return false;
5577     }
5578 
5579 #ifdef CONFIG_USER_ONLY
5580     memset(&info->attrs, 0, sizeof(info->attrs));
5581     /* Require both ANON and MTE; see allocation_tag_mem(). */
5582     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5583 #else
5584     info->attrs = full->attrs;
5585     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5586 #endif
5587 
5588     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5589     info->host -= mem_off;
5590     return true;
5591 }
5592 
5593 /*
5594  * Find first active element on each page, and a loose bound for the
5595  * final element on each page.  Identify any single element that spans
5596  * the page boundary.  Return true if there are any active elements.
5597  */
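/*
 * Worked example (all elements active): with esz == MO_64, msize == 8,
 * reg_max == 32 and @addr 12 bytes below a page boundary, page_split is
 * 12.  Element 0 (reg_off 0) is wholly on the first page, so
 * reg_off_last[0] == 0; element 1 (reg_off 8, memory bytes 8..15)
 * straddles the boundary, so reg_off_split == 8; elements 2 and 3 are
 * wholly on the second page, giving reg_off_first[1] == 16 and
 * reg_off_last[1] == 24.
 */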
5598 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5599                             intptr_t reg_max, int esz, int msize)
5600 {
5601     const int esize = 1 << esz;
5602     const uint64_t pg_mask = pred_esz_masks[esz];
5603     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5604     intptr_t mem_off_last, mem_off_split;
5605     intptr_t page_split, elt_split;
5606     intptr_t i;
5607 
5608     /* Set all of the element indices to -1, and the TLB data to 0. */
5609     memset(info, -1, offsetof(SVEContLdSt, page));
5610     memset(info->page, 0, sizeof(info->page));
5611 
5612     /* Gross scan over the entire predicate to find bounds. */
5613     i = 0;
5614     do {
5615         uint64_t pg = vg[i] & pg_mask;
5616         if (pg) {
5617             reg_off_last = i * 64 + 63 - clz64(pg);
5618             if (reg_off_first < 0) {
5619                 reg_off_first = i * 64 + ctz64(pg);
5620             }
5621         }
5622     } while (++i * 64 < reg_max);
5623 
5624     if (unlikely(reg_off_first < 0)) {
5625         /* No active elements, no pages touched. */
5626         return false;
5627     }
5628     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5629 
5630     info->reg_off_first[0] = reg_off_first;
5631     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5632     mem_off_last = (reg_off_last >> esz) * msize;
5633 
5634     page_split = -(addr | TARGET_PAGE_MASK);
5635     if (likely(mem_off_last + msize <= page_split)) {
5636         /* The entire operation fits within a single page. */
5637         info->reg_off_last[0] = reg_off_last;
5638         return true;
5639     }
5640 
5641     info->page_split = page_split;
5642     elt_split = page_split / msize;
5643     reg_off_split = elt_split << esz;
5644     mem_off_split = elt_split * msize;
5645 
5646     /*
5647      * This is the last full element on the first page, but it is not
5648      * necessarily active.  If there is no full element, i.e. the first
5649      * active element is the one that's split, this value remains -1.
5650      * It is useful as an iteration bound.
5651      */
5652     if (elt_split != 0) {
5653         info->reg_off_last[0] = reg_off_split - esize;
5654     }
5655 
5656     /* Determine if an unaligned element spans the pages.  */
5657     if (page_split % msize != 0) {
5658         /* It is helpful to know if the split element is active. */
5659         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5660             info->reg_off_split = reg_off_split;
5661             info->mem_off_split = mem_off_split;
5662 
5663             if (reg_off_split == reg_off_last) {
5664                 /* The page crossing element is last. */
5665                 return true;
5666             }
5667         }
5668         reg_off_split += esize;
5669         mem_off_split += msize;
5670     }
5671 
5672     /*
5673      * We do want the first active element on the second page, because
5674      * this may affect the address reported in an exception.
5675      */
5676     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5677     tcg_debug_assert(reg_off_split <= reg_off_last);
5678     info->reg_off_first[1] = reg_off_split;
5679     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5680     info->reg_off_last[1] = reg_off_last;
5681     return true;
5682 }
5683 
5684 /*
5685  * Resolve the guest virtual addresses to info->page[].
5686  * Control the generation of page faults with @fault.  Return false if
5687  * there is no work to do, which can only happen with @fault == FAULT_NO.
5688  */
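/*
 * Rough summary of the fault modes used below: FAULT_NO never allows a
 * probe to fault, so the function may return false (nothing to do);
 * FAULT_FIRST allows faults only for the page(s) containing the first
 * active element, after which the second page is probed non-faulting;
 * FAULT_ALL allows the probes to fault, so an invalid page normally
 * exits here via a page-fault exception.
 */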
5689 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5690                          CPUARMState *env, target_ulong addr,
5691                          MMUAccessType access_type, uintptr_t retaddr)
5692 {
5693     int mmu_idx = arm_env_mmu_index(env);
5694     int mem_off = info->mem_off_first[0];
5695     bool nofault = fault == FAULT_NO;
5696     bool have_work = true;
5697 
5698     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5699                         access_type, mmu_idx, retaddr)) {
5700         /* No work to be done. */
5701         return false;
5702     }
5703 
5704     if (likely(info->page_split < 0)) {
5705         /* The entire operation was on the one page. */
5706         return true;
5707     }
5708 
5709     /*
5710      * If the second page is invalid, then we want the fault address to be
5711      * the first byte on that page which is accessed.
5712      */
5713     if (info->mem_off_split >= 0) {
5714         /*
5715          * There is an element split across the pages.  The fault address
5716          * should be the first byte of the second page.
5717          */
5718         mem_off = info->page_split;
5719         /*
5720          * If the split element is also the first active element
5721          * of the vector, then:  For first-fault we should continue
5722          * to generate faults for the second page.  For no-fault,
5723          * we have work only if the second page is valid.
5724          */
5725         if (info->mem_off_first[0] < info->mem_off_split) {
5726             nofault = FAULT_FIRST;
5727             have_work = false;
5728         }
5729     } else {
5730         /*
5731          * There is no element split across the pages.  The fault address
5732          * should be the first active element on the second page.
5733          */
5734         mem_off = info->mem_off_first[1];
5735         /*
5736          * There must have been one active element on the first page,
5737          * so we're out of first-fault territory.
5738          */
5739         nofault = fault != FAULT_ALL;
5740     }
5741 
5742     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5743                                 access_type, mmu_idx, retaddr);
5744     return have_work;
5745 }
5746 
5747 #ifndef CONFIG_USER_ONLY
5748 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5749                                uint64_t *vg, target_ulong addr,
5750                                int esize, int msize, int wp_access,
5751                                uintptr_t retaddr)
5752 {
5753     intptr_t mem_off, reg_off, reg_last;
5754     int flags0 = info->page[0].flags;
5755     int flags1 = info->page[1].flags;
5756 
5757     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5758         return;
5759     }
5760 
5761     /* Indicate that watchpoints are handled. */
5762     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5763     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5764 
5765     if (flags0 & TLB_WATCHPOINT) {
5766         mem_off = info->mem_off_first[0];
5767         reg_off = info->reg_off_first[0];
5768         reg_last = info->reg_off_last[0];
5769 
5770         while (reg_off <= reg_last) {
5771             uint64_t pg = vg[reg_off >> 6];
5772             do {
5773                 if ((pg >> (reg_off & 63)) & 1) {
5774                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5775                                          msize, info->page[0].attrs,
5776                                          wp_access, retaddr);
5777                 }
5778                 reg_off += esize;
5779                 mem_off += msize;
5780             } while (reg_off <= reg_last && (reg_off & 63));
5781         }
5782     }
5783 
5784     mem_off = info->mem_off_split;
5785     if (mem_off >= 0) {
5786         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5787                              info->page[0].attrs, wp_access, retaddr);
5788     }
5789 
5790     mem_off = info->mem_off_first[1];
5791     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5792         reg_off = info->reg_off_first[1];
5793         reg_last = info->reg_off_last[1];
5794 
5795         do {
5796             uint64_t pg = vg[reg_off >> 6];
5797             do {
5798                 if ((pg >> (reg_off & 63)) & 1) {
5799                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5800                                          msize, info->page[1].attrs,
5801                                          wp_access, retaddr);
5802                 }
5803                 reg_off += esize;
5804                 mem_off += msize;
5805             } while (reg_off & 63);
5806         } while (reg_off <= reg_last);
5807     }
5808 }
5809 #endif
5810 
5811 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5812                              uint64_t *vg, target_ulong addr, int esize,
5813                              int msize, uint32_t mtedesc, uintptr_t ra)
5814 {
5815     intptr_t mem_off, reg_off, reg_last;
5816 
5817     /* Process the page only if MemAttr == Tagged. */
5818     if (info->page[0].tagged) {
5819         mem_off = info->mem_off_first[0];
5820         reg_off = info->reg_off_first[0];
5821         reg_last = info->reg_off_split;
5822         if (reg_last < 0) {
5823             reg_last = info->reg_off_last[0];
5824         }
5825 
5826         do {
5827             uint64_t pg = vg[reg_off >> 6];
5828             do {
5829                 if ((pg >> (reg_off & 63)) & 1) {
5830                     mte_check(env, mtedesc, addr, ra);
5831                 }
5832                 reg_off += esize;
5833                 mem_off += msize;
5834             } while (reg_off <= reg_last && (reg_off & 63));
5835         } while (reg_off <= reg_last);
5836     }
5837 
5838     mem_off = info->mem_off_first[1];
5839     if (mem_off >= 0 && info->page[1].tagged) {
5840         reg_off = info->reg_off_first[1];
5841         reg_last = info->reg_off_last[1];
5842 
5843         do {
5844             uint64_t pg = vg[reg_off >> 6];
5845             do {
5846                 if ((pg >> (reg_off & 63)) & 1) {
5847                     mte_check(env, mtedesc, addr, ra);
5848                 }
5849                 reg_off += esize;
5850                 mem_off += msize;
5851             } while (reg_off & 63);
5852         } while (reg_off <= reg_last);
5853     }
5854 }
5855 
5856 /*
5857  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5858  */
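/*
 * Outline of the code below: first locate the active elements and probe
 * the page(s), taking any exception now; watchpoints and MTE checks are
 * then handled up front.  If either page is MMIO (or otherwise flagged),
 * all loads go through the slow tlb_fn path into scratch registers so a
 * bus fault cannot leave the destination registers partially written.
 * Otherwise the destinations are zeroed and active elements are copied
 * directly from host memory, with only a page-crossing element and the
 * second page handled separately.
 */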
5859 static inline QEMU_ALWAYS_INLINE
5860 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5861                uint32_t desc, const uintptr_t retaddr,
5862                const int esz, const int msz, const int N, uint32_t mtedesc,
5863                sve_ldst1_host_fn *host_fn,
5864                sve_ldst1_tlb_fn *tlb_fn)
5865 {
5866     const unsigned rd = simd_data(desc);
5867     const intptr_t reg_max = simd_oprsz(desc);
5868     intptr_t reg_off, reg_last, mem_off;
5869     SVEContLdSt info;
5870     void *host;
5871     int flags, i;
5872 
5873     /* Find the active elements.  */
5874     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5875         /* The entire predicate was false; no load occurs.  */
5876         for (i = 0; i < N; ++i) {
5877             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5878         }
5879         return;
5880     }
5881 
5882     /* Probe the page(s).  Exit with exception for any invalid page. */
5883     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5884 
5885     /* Handle watchpoints for all active elements. */
5886     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5887                               BP_MEM_READ, retaddr);
5888 
5889     /*
5890      * Handle mte checks for all active elements.
5891      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5892      */
5893     if (mtedesc) {
5894         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5895                                 mtedesc, retaddr);
5896     }
5897 
5898     flags = info.page[0].flags | info.page[1].flags;
5899     if (unlikely(flags != 0)) {
5900         /*
5901          * At least one page includes MMIO.
5902          * Any bus operation can fail with cpu_transaction_failed,
5903          * which for ARM will raise SyncExternal.  Perform the load
5904          * into scratch memory to preserve register state until the end.
5905          */
5906         ARMVectorReg scratch[4] = { };
5907 
5908         mem_off = info.mem_off_first[0];
5909         reg_off = info.reg_off_first[0];
5910         reg_last = info.reg_off_last[1];
5911         if (reg_last < 0) {
5912             reg_last = info.reg_off_split;
5913             if (reg_last < 0) {
5914                 reg_last = info.reg_off_last[0];
5915             }
5916         }
5917 
5918         do {
5919             uint64_t pg = vg[reg_off >> 6];
5920             do {
5921                 if ((pg >> (reg_off & 63)) & 1) {
5922                     for (i = 0; i < N; ++i) {
5923                         tlb_fn(env, &scratch[i], reg_off,
5924                                addr + mem_off + (i << msz), retaddr);
5925                     }
5926                 }
5927                 reg_off += 1 << esz;
5928                 mem_off += N << msz;
5929             } while (reg_off & 63);
5930         } while (reg_off <= reg_last);
5931 
5932         for (i = 0; i < N; ++i) {
5933             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5934         }
5935         return;
5936     }
5937 
5938     /* The entire operation is in RAM, on valid pages. */
5939 
5940     for (i = 0; i < N; ++i) {
5941         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5942     }
5943 
5944     mem_off = info.mem_off_first[0];
5945     reg_off = info.reg_off_first[0];
5946     reg_last = info.reg_off_last[0];
5947     host = info.page[0].host;
5948 
5949     set_helper_retaddr(retaddr);
5950 
5951     while (reg_off <= reg_last) {
5952         uint64_t pg = vg[reg_off >> 6];
5953         do {
5954             if ((pg >> (reg_off & 63)) & 1) {
5955                 for (i = 0; i < N; ++i) {
5956                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5957                             host + mem_off + (i << msz));
5958                 }
5959             }
5960             reg_off += 1 << esz;
5961             mem_off += N << msz;
5962         } while (reg_off <= reg_last && (reg_off & 63));
5963     }
5964 
5965     clear_helper_retaddr();
5966 
5967     /*
5968      * Use the slow path to manage the cross-page misalignment.
5969      * But we know this is RAM and cannot trap.
5970      */
5971     mem_off = info.mem_off_split;
5972     if (unlikely(mem_off >= 0)) {
5973         reg_off = info.reg_off_split;
5974         for (i = 0; i < N; ++i) {
5975             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5976                    addr + mem_off + (i << msz), retaddr);
5977         }
5978     }
5979 
5980     mem_off = info.mem_off_first[1];
5981     if (unlikely(mem_off >= 0)) {
5982         reg_off = info.reg_off_first[1];
5983         reg_last = info.reg_off_last[1];
5984         host = info.page[1].host;
5985 
5986         set_helper_retaddr(retaddr);
5987 
5988         do {
5989             uint64_t pg = vg[reg_off >> 6];
5990             do {
5991                 if ((pg >> (reg_off & 63)) & 1) {
5992                     for (i = 0; i < N; ++i) {
5993                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5994                                 host + mem_off + (i << msz));
5995                     }
5996                 }
5997                 reg_off += 1 << esz;
5998                 mem_off += N << msz;
5999             } while (reg_off & 63);
6000         } while (reg_off <= reg_last);
6001 
6002         clear_helper_retaddr();
6003     }
6004 }
6005 
6006 static inline QEMU_ALWAYS_INLINE
6007 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6008                    uint32_t desc, const uintptr_t ra,
6009                    const int esz, const int msz, const int N,
6010                    sve_ldst1_host_fn *host_fn,
6011                    sve_ldst1_tlb_fn *tlb_fn)
6012 {
6013     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6014     int bit55 = extract64(addr, 55, 1);
6015 
6016     /* Remove mtedesc from the normal sve descriptor. */
6017     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6018 
6019     /* Perform gross MTE suppression early. */
6020     if (!tbi_check(mtedesc, bit55) ||
6021         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6022         mtedesc = 0;
6023     }
6024 
6025     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6026 }
6027 
6028 #define DO_LD1_1(NAME, ESZ)                                             \
6029 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6030                             target_ulong addr, uint32_t desc)           \
6031 {                                                                       \
6032     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6033               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6034 }                                                                       \
6035 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6036                                 target_ulong addr, uint32_t desc)       \
6037 {                                                                       \
6038     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6039                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6040 }
6041 
6042 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6043 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6044                                target_ulong addr, uint32_t desc)        \
6045 {                                                                       \
6046     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6047               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6048 }                                                                       \
6049 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6050                                target_ulong addr, uint32_t desc)        \
6051 {                                                                       \
6052     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6053               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6054 }                                                                       \
6055 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6056                                    target_ulong addr, uint32_t desc)    \
6057 {                                                                       \
6058     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6059                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6060 }                                                                       \
6061 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6062                                    target_ulong addr, uint32_t desc)    \
6063 {                                                                       \
6064     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6065                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6066 }
6067 
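/*
 * Naming note for the expansions below: in sve_ld1bhu, for example, "b"
 * is the memory element size, "h" the destination element size, and the
 * trailing "u"/"s" selects zero- or sign-extension; same-size forms such
 * as ld1hh need no suffix.  The _le/_be variants differ only in the
 * memory byte order of the host/tlb primitives they bind, and the _mte
 * variants additionally extract MTEDESC and perform tag checks.
 */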
6068 DO_LD1_1(ld1bb,  MO_8)
6069 DO_LD1_1(ld1bhu, MO_16)
6070 DO_LD1_1(ld1bhs, MO_16)
6071 DO_LD1_1(ld1bsu, MO_32)
6072 DO_LD1_1(ld1bss, MO_32)
6073 DO_LD1_1(ld1bdu, MO_64)
6074 DO_LD1_1(ld1bds, MO_64)
6075 
6076 DO_LD1_2(ld1hh,  MO_16, MO_16)
6077 DO_LD1_2(ld1hsu, MO_32, MO_16)
6078 DO_LD1_2(ld1hss, MO_32, MO_16)
6079 DO_LD1_2(ld1hdu, MO_64, MO_16)
6080 DO_LD1_2(ld1hds, MO_64, MO_16)
6081 
6082 DO_LD1_2(ld1ss,  MO_32, MO_32)
6083 DO_LD1_2(ld1sdu, MO_64, MO_32)
6084 DO_LD1_2(ld1sds, MO_64, MO_32)
6085 
6086 DO_LD1_2(ld1dd,  MO_64, MO_64)
6087 
6088 #undef DO_LD1_1
6089 #undef DO_LD1_2
6090 
6091 #define DO_LDN_1(N)                                                     \
6092 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6093                              target_ulong addr, uint32_t desc)          \
6094 {                                                                       \
6095     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6096               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6097 }                                                                       \
6098 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6099                                  target_ulong addr, uint32_t desc)      \
6100 {                                                                       \
6101     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6102                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6103 }
6104 
6105 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6106 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6107                                     target_ulong addr, uint32_t desc)   \
6108 {                                                                       \
6109     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6110               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6111 }                                                                       \
6112 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6113                                     target_ulong addr, uint32_t desc)   \
6114 {                                                                       \
6115     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6116               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6117 }                                                                       \
6118 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6119                                         target_ulong addr, uint32_t desc) \
6120 {                                                                       \
6121     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6122                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6123 }                                                                       \
6124 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6125                                         target_ulong addr, uint32_t desc) \
6126 {                                                                       \
6127     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6128                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6129 }
6130 
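/*
 * These expansions cover the 2-, 3- and 4-register structure loads:
 * sve_ldN_r reads N consecutive memory elements for each active predicate
 * element and de-interleaves them into zregs[rd] .. zregs[rd+N-1] at the
 * same element offset.
 */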
6131 DO_LDN_1(2)
6132 DO_LDN_1(3)
6133 DO_LDN_1(4)
6134 
6135 DO_LDN_2(2, hh, MO_16)
6136 DO_LDN_2(3, hh, MO_16)
6137 DO_LDN_2(4, hh, MO_16)
6138 
6139 DO_LDN_2(2, ss, MO_32)
6140 DO_LDN_2(3, ss, MO_32)
6141 DO_LDN_2(4, ss, MO_32)
6142 
6143 DO_LDN_2(2, dd, MO_64)
6144 DO_LDN_2(3, dd, MO_64)
6145 DO_LDN_2(4, dd, MO_64)
6146 
6147 #undef DO_LDN_1
6148 #undef DO_LDN_2
6149 
6150 /*
6151  * Load contiguous data, first-fault and no-fault.
6152  *
6153  * For user-only, we control the race between page_check_range and
6154  * another thread's munmap by using set/clear_helper_retaddr.  Any
6155  * SEGV that occurs between those markers is assumed to be because
6156  * the guest page vanished.  Keep that block as small as possible
6157  * so that unrelated QEMU bugs are not blamed on the guest.
6158  */
6159 
6160 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6161  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6162  * option, which leaves subsequent data unchanged.
6163  */
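/*
 * For example, record_fault(env, 10, 32) keeps FFR bits [9:0] intact and
 * clears every FFR bit from 10 up to the vector width, so the first 10
 * bytes of the result remain architecturally valid.
 */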
6164 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6165 {
6166     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6167 
6168     if (i & 63) {
6169         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6170         i = ROUND_UP(i, 64);
6171     }
6172     for (; i < oprsz; i += 64) {
6173         ffr[i / 64] = 0;
6174     }
6175 }
6176 
6177 /*
6178  * Common helper for all contiguous no-fault and first-fault loads.
6179  */
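/*
 * Sketch of the structure below: the first active element is handled
 * specially -- for first-fault it may trap (MTE check, MMIO, or a
 * page-crossing access), while for no-fault any such condition simply
 * records a fault in FFR.  Every subsequent element is treated as
 * MemSingleNF: it is loaded only from host RAM with non-trapping checks,
 * and the first element that cannot be handled that way ends the load
 * via record_fault.
 */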
6180 static inline QEMU_ALWAYS_INLINE
6181 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6182                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6183                    const int esz, const int msz, const SVEContFault fault,
6184                    sve_ldst1_host_fn *host_fn,
6185                    sve_ldst1_tlb_fn *tlb_fn)
6186 {
6187     const unsigned rd = simd_data(desc);
6188     void *vd = &env->vfp.zregs[rd];
6189     const intptr_t reg_max = simd_oprsz(desc);
6190     intptr_t reg_off, mem_off, reg_last;
6191     SVEContLdSt info;
6192     int flags;
6193     void *host;
6194 
6195     /* Find the active elements.  */
6196     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6197         /* The entire predicate was false; no load occurs.  */
6198         memset(vd, 0, reg_max);
6199         return;
6200     }
6201     reg_off = info.reg_off_first[0];
6202 
6203     /* Probe the page(s). */
6204     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6205         /* Fault on first element. */
6206         tcg_debug_assert(fault == FAULT_NO);
6207         memset(vd, 0, reg_max);
6208         goto do_fault;
6209     }
6210 
6211     mem_off = info.mem_off_first[0];
6212     flags = info.page[0].flags;
6213 
6214     /*
6215      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6216      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6217      */
6218     if (!info.page[0].tagged) {
6219         mtedesc = 0;
6220     }
6221 
6222     if (fault == FAULT_FIRST) {
6223         /* Trapping mte check for the first-fault element.  */
6224         if (mtedesc) {
6225             mte_check(env, mtedesc, addr + mem_off, retaddr);
6226         }
6227 
6228         /*
6229          * Special handling of the first active element,
6230          * if it crosses a page boundary or is MMIO.
6231          */
6232         bool is_split = mem_off == info.mem_off_split;
6233         if (unlikely(flags != 0) || unlikely(is_split)) {
6234             /*
6235              * Use the slow path for cross-page handling.
6236              * Might trap for MMIO or watchpoints.
6237              */
6238             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6239 
6240             /* After any fault, zero the other elements. */
6241             swap_memzero(vd, reg_off);
6242             reg_off += 1 << esz;
6243             mem_off += 1 << msz;
6244             swap_memzero(vd + reg_off, reg_max - reg_off);
6245 
6246             if (is_split) {
6247                 goto second_page;
6248             }
6249         } else {
6250             memset(vd, 0, reg_max);
6251         }
6252     } else {
6253         memset(vd, 0, reg_max);
6254         if (unlikely(mem_off == info.mem_off_split)) {
6255             /* The first active element crosses a page boundary. */
6256             flags |= info.page[1].flags;
6257             if (unlikely(flags & TLB_MMIO)) {
6258                 /* Some page is MMIO, see below. */
6259                 goto do_fault;
6260             }
6261             if (unlikely(flags & TLB_WATCHPOINT) &&
6262                 (cpu_watchpoint_address_matches
6263                  (env_cpu(env), addr + mem_off, 1 << msz)
6264                  & BP_MEM_READ)) {
6265                 /* Watchpoint hit, see below. */
6266                 goto do_fault;
6267             }
6268             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6269                 goto do_fault;
6270             }
6271             /*
6272              * Use the slow path for cross-page handling.
6273              * This is RAM, without a watchpoint, and will not trap.
6274              */
6275             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6276             goto second_page;
6277         }
6278     }
6279 
6280     /*
6281      * From this point on, all memory operations are MemSingleNF.
6282      *
6283      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6284      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6285      *
6286      * Unfortunately we do not have access to the memory attributes from the
6287      * PTE to tell Device memory from Normal memory.  So we make a mostly
6288      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6289      * This gives the right answer for the common cases of "Normal memory,
6290      * backed by host RAM" and "Device memory, backed by MMIO".
6291      * The architecture allows us to suppress an NF load and return
6292      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6293      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6294      * get wrong is "Device memory, backed by host RAM", for which we
6295      * should return (UNKNOWN, FAULT) for but do not.
6296      *
6297      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6298      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6299      * architectural breakpoints the same.
6300      */
6301     if (unlikely(flags & TLB_MMIO)) {
6302         goto do_fault;
6303     }
6304 
6305     reg_last = info.reg_off_last[0];
6306     host = info.page[0].host;
6307 
6308     set_helper_retaddr(retaddr);
6309 
6310     do {
6311         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6312         do {
6313             if ((pg >> (reg_off & 63)) & 1) {
6314                 if (unlikely(flags & TLB_WATCHPOINT) &&
6315                     (cpu_watchpoint_address_matches
6316                      (env_cpu(env), addr + mem_off, 1 << msz)
6317                      & BP_MEM_READ)) {
6318                     clear_helper_retaddr();
6319                     goto do_fault;
6320                 }
6321                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6322                     clear_helper_retaddr();
6323                     goto do_fault;
6324                 }
6325                 host_fn(vd, reg_off, host + mem_off);
6326             }
6327             reg_off += 1 << esz;
6328             mem_off += 1 << msz;
6329         } while (reg_off <= reg_last && (reg_off & 63));
6330     } while (reg_off <= reg_last);
6331 
6332     clear_helper_retaddr();
6333 
6334     /*
6335      * MemSingleNF is allowed to fail for any reason.  We have special
6336      * code above to handle the first element crossing a page boundary.
6337      * As an implementation choice, decline to handle a cross-page element
6338      * in any other position.
6339      */
6340     reg_off = info.reg_off_split;
6341     if (reg_off >= 0) {
6342         goto do_fault;
6343     }
6344 
6345  second_page:
6346     reg_off = info.reg_off_first[1];
6347     if (likely(reg_off < 0)) {
6348         /* No active elements on the second page.  All done. */
6349         return;
6350     }
6351 
6352     /*
6353      * MemSingleNF is allowed to fail for any reason.  As an implementation
6354      * choice, decline to handle elements on the second page.  This should
6355      * be low frequency as the guest walks through memory -- the next
6356      * iteration of the guest's loop should be aligned on the page boundary,
6357      * and then all following iterations will stay aligned.
6358      */
6359 
6360  do_fault:
6361     record_fault(env, reg_off, reg_max);
6362 }
6363 
6364 static inline QEMU_ALWAYS_INLINE
6365 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6366                        uint32_t desc, const uintptr_t retaddr,
6367                        const int esz, const int msz, const SVEContFault fault,
6368                        sve_ldst1_host_fn *host_fn,
6369                        sve_ldst1_tlb_fn *tlb_fn)
6370 {
6371     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6372     int bit55 = extract64(addr, 55, 1);
6373 
6374     /* Remove mtedesc from the normal sve descriptor. */
6375     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6376 
6377     /* Perform gross MTE suppression early. */
6378     if (!tbi_check(mtedesc, bit55) ||
6379         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6380         mtedesc = 0;
6381     }
6382 
6383     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6384                   esz, msz, fault, host_fn, tlb_fn);
6385 }
6386 
6387 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6388 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6389                                  target_ulong addr, uint32_t desc)      \
6390 {                                                                       \
6391     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6392                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6393 }                                                                       \
6394 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6395                                  target_ulong addr, uint32_t desc)      \
6396 {                                                                       \
6397     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6398                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6399 }                                                                       \
6400 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6401                                      target_ulong addr, uint32_t desc)  \
6402 {                                                                       \
6403     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6404                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6405 }                                                                       \
6406 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6407                                      target_ulong addr, uint32_t desc)  \
6408 {                                                                       \
6409     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6410                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6411 }
6412 
6413 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6414 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6415                                     target_ulong addr, uint32_t desc)   \
6416 {                                                                       \
6417     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6418                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6419 }                                                                       \
6420 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6421                                     target_ulong addr, uint32_t desc)   \
6422 {                                                                       \
6423     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6424                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6425 }                                                                       \
6426 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6427                                     target_ulong addr, uint32_t desc)   \
6428 {                                                                       \
6429     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6430                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6431 }                                                                       \
6432 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6433                                     target_ulong addr, uint32_t desc)   \
6434 {                                                                       \
6435     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6436                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6437 }                                                                       \
6438 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6439                                         target_ulong addr, uint32_t desc) \
6440 {                                                                       \
6441     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6442                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6443 }                                                                       \
6444 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6445                                         target_ulong addr, uint32_t desc) \
6446 {                                                                       \
6447     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6448                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6449 }                                                                       \
6450 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6451                                         target_ulong addr, uint32_t desc) \
6452 {                                                                       \
6453     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6454                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6455 }                                                                       \
6456 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6457                                         target_ulong addr, uint32_t desc) \
6458 {                                                                       \
6459     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6460                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6461 }
6462 
6463 DO_LDFF1_LDNF1_1(bb,  MO_8)
6464 DO_LDFF1_LDNF1_1(bhu, MO_16)
6465 DO_LDFF1_LDNF1_1(bhs, MO_16)
6466 DO_LDFF1_LDNF1_1(bsu, MO_32)
6467 DO_LDFF1_LDNF1_1(bss, MO_32)
6468 DO_LDFF1_LDNF1_1(bdu, MO_64)
6469 DO_LDFF1_LDNF1_1(bds, MO_64)
6470 
6471 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6472 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6473 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6474 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6475 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6476 
6477 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6478 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6479 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6480 
6481 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6482 
6483 #undef DO_LDFF1_LDNF1_1
6484 #undef DO_LDFF1_LDNF1_2
6485 
6486 /*
6487  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6488  */
6489 
6490 static inline QEMU_ALWAYS_INLINE
6491 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6492                uint32_t desc, const uintptr_t retaddr,
6493                const int esz, const int msz, const int N, uint32_t mtedesc,
6494                sve_ldst1_host_fn *host_fn,
6495                sve_ldst1_tlb_fn *tlb_fn)
6496 {
6497     const unsigned rd = simd_data(desc);
6498     const intptr_t reg_max = simd_oprsz(desc);
6499     intptr_t reg_off, reg_last, mem_off;
6500     SVEContLdSt info;
6501     void *host;
6502     int i, flags;
6503 
6504     /* Find the active elements.  */
6505     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6506         /* The entire predicate was false; no store occurs.  */
6507         return;
6508     }
6509 
6510     /* Probe the page(s).  Exit with exception for any invalid page. */
6511     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6512 
6513     /* Handle watchpoints for all active elements. */
6514     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6515                               BP_MEM_WRITE, retaddr);
6516 
6517     /*
6518      * Handle mte checks for all active elements.
6519      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6520      */
6521     if (mtedesc) {
6522         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6523                                 mtedesc, retaddr);
6524     }
6525 
6526     flags = info.page[0].flags | info.page[1].flags;
6527     if (unlikely(flags != 0)) {
6528         /*
6529          * At least one page includes MMIO.
6530          * Any bus operation can fail with cpu_transaction_failed,
6531          * which for ARM will raise SyncExternal.  We cannot avoid
6532          * this fault and will leave with the store incomplete.
6533          */
6534         mem_off = info.mem_off_first[0];
6535         reg_off = info.reg_off_first[0];
6536         reg_last = info.reg_off_last[1];
6537         if (reg_last < 0) {
6538             reg_last = info.reg_off_split;
6539             if (reg_last < 0) {
6540                 reg_last = info.reg_off_last[0];
6541             }
6542         }
6543 
6544         do {
6545             uint64_t pg = vg[reg_off >> 6];
6546             do {
6547                 if ((pg >> (reg_off & 63)) & 1) {
6548                     for (i = 0; i < N; ++i) {
6549                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6550                                addr + mem_off + (i << msz), retaddr);
6551                     }
6552                 }
6553                 reg_off += 1 << esz;
6554                 mem_off += N << msz;
6555             } while (reg_off & 63);
6556         } while (reg_off <= reg_last);
6557         return;
6558     }
6559 
6560     mem_off = info.mem_off_first[0];
6561     reg_off = info.reg_off_first[0];
6562     reg_last = info.reg_off_last[0];
6563     host = info.page[0].host;
6564 
6565     set_helper_retaddr(retaddr);
6566 
6567     while (reg_off <= reg_last) {
6568         uint64_t pg = vg[reg_off >> 6];
6569         do {
6570             if ((pg >> (reg_off & 63)) & 1) {
6571                 for (i = 0; i < N; ++i) {
6572                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6573                             host + mem_off + (i << msz));
6574                 }
6575             }
6576             reg_off += 1 << esz;
6577             mem_off += N << msz;
6578         } while (reg_off <= reg_last && (reg_off & 63));
6579     }
6580 
6581     clear_helper_retaddr();
6582 
6583     /*
6584      * Use the slow path to manage the cross-page misalignment.
6585      * But we know this is RAM and cannot trap.
6586      */
6587     mem_off = info.mem_off_split;
6588     if (unlikely(mem_off >= 0)) {
6589         reg_off = info.reg_off_split;
6590         for (i = 0; i < N; ++i) {
6591             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6592                    addr + mem_off + (i << msz), retaddr);
6593         }
6594     }
6595 
6596     mem_off = info.mem_off_first[1];
6597     if (unlikely(mem_off >= 0)) {
6598         reg_off = info.reg_off_first[1];
6599         reg_last = info.reg_off_last[1];
6600         host = info.page[1].host;
6601 
6602         set_helper_retaddr(retaddr);
6603 
6604         do {
6605             uint64_t pg = vg[reg_off >> 6];
6606             do {
6607                 if ((pg >> (reg_off & 63)) & 1) {
6608                     for (i = 0; i < N; ++i) {
6609                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6610                                 host + mem_off + (i << msz));
6611                     }
6612                 }
6613                 reg_off += 1 << esz;
6614                 mem_off += N << msz;
6615             } while (reg_off & 63);
6616         } while (reg_off <= reg_last);
6617 
6618         clear_helper_retaddr();
6619     }
6620 }
6621 
6622 static inline QEMU_ALWAYS_INLINE
6623 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6624                    uint32_t desc, const uintptr_t ra,
6625                    const int esz, const int msz, const int N,
6626                    sve_ldst1_host_fn *host_fn,
6627                    sve_ldst1_tlb_fn *tlb_fn)
6628 {
6629     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6630     int bit55 = extract64(addr, 55, 1);
6631 
6632     /* Remove mtedesc from the normal sve descriptor. */
6633     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6634 
6635     /* Perform gross MTE suppression early. */
6636     if (!tbi_check(mtedesc, bit55) ||
6637         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6638         mtedesc = 0;
6639     }
6640 
6641     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6642 }
6643 
6644 #define DO_STN_1(N, NAME, ESZ)                                          \
6645 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6646                                  target_ulong addr, uint32_t desc)      \
6647 {                                                                       \
6648     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6649               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6650 }                                                                       \
6651 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6652                                      target_ulong addr, uint32_t desc)  \
6653 {                                                                       \
6654     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6655                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6656 }
6657 
6658 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6659 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6660                                     target_ulong addr, uint32_t desc)   \
6661 {                                                                       \
6662     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6663               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6664 }                                                                       \
6665 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6666                                     target_ulong addr, uint32_t desc)   \
6667 {                                                                       \
6668     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6669               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6670 }                                                                       \
6671 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6672                                         target_ulong addr, uint32_t desc) \
6673 {                                                                       \
6674     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6675                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6676 }                                                                       \
6677 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6678                                         target_ulong addr, uint32_t desc) \
6679 {                                                                       \
6680     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6681                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6682 }
6683 
6684 DO_STN_1(1, bb, MO_8)
6685 DO_STN_1(1, bh, MO_16)
6686 DO_STN_1(1, bs, MO_32)
6687 DO_STN_1(1, bd, MO_64)
6688 DO_STN_1(2, bb, MO_8)
6689 DO_STN_1(3, bb, MO_8)
6690 DO_STN_1(4, bb, MO_8)
6691 
6692 DO_STN_2(1, hh, MO_16, MO_16)
6693 DO_STN_2(1, hs, MO_32, MO_16)
6694 DO_STN_2(1, hd, MO_64, MO_16)
6695 DO_STN_2(2, hh, MO_16, MO_16)
6696 DO_STN_2(3, hh, MO_16, MO_16)
6697 DO_STN_2(4, hh, MO_16, MO_16)
6698 
6699 DO_STN_2(1, ss, MO_32, MO_32)
6700 DO_STN_2(1, sd, MO_64, MO_32)
6701 DO_STN_2(2, ss, MO_32, MO_32)
6702 DO_STN_2(3, ss, MO_32, MO_32)
6703 DO_STN_2(4, ss, MO_32, MO_32)
6704 
6705 DO_STN_2(1, dd, MO_64, MO_64)
6706 DO_STN_2(2, dd, MO_64, MO_64)
6707 DO_STN_2(3, dd, MO_64, MO_64)
6708 DO_STN_2(4, dd, MO_64, MO_64)
6709 
6710 #undef DO_STN_1
6711 #undef DO_STN_2
6712 
6713 /*
6714  * Loads with a vector index.
6715  */
6716 
6717 /*
6718  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6719  */
6720 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6721 
6722 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6723 {
6724     return *(uint32_t *)(reg + H1_4(reg_ofs));
6725 }
6726 
6727 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6728 {
6729     return *(int32_t *)(reg + H1_4(reg_ofs));
6730 }
6731 
6732 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6733 {
6734     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6735 }
6736 
6737 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6738 {
6739     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6740 }
6741 
6742 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6743 {
6744     return *(uint64_t *)(reg + reg_ofs);
6745 }
6746 
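/*
 * Common helper for the gather (vector-index) loads.  Each active
 * element's address is base + (off_fn(vm, reg_off) << scale), so e.g.
 * a 32-bit unsigned offset of 0x10 with scale == 2 yields base + 0x40.
 */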
6747 static inline QEMU_ALWAYS_INLINE
6748 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6749                target_ulong base, uint32_t desc, uintptr_t retaddr,
6750                uint32_t mtedesc, int esize, int msize,
6751                zreg_off_fn *off_fn,
6752                sve_ldst1_host_fn *host_fn,
6753                sve_ldst1_tlb_fn *tlb_fn)
6754 {
6755     const int mmu_idx = arm_env_mmu_index(env);
6756     const intptr_t reg_max = simd_oprsz(desc);
6757     const int scale = simd_data(desc);
6758     ARMVectorReg scratch;
6759     intptr_t reg_off;
6760     SVEHostPage info, info2;
6761 
6762     memset(&scratch, 0, reg_max);
6763     reg_off = 0;
6764     do {
6765         uint64_t pg = vg[reg_off >> 6];
6766         do {
6767             if (likely(pg & 1)) {
6768                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6769                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6770 
6771                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6772                                mmu_idx, retaddr);
6773 
6774                 if (likely(in_page >= msize)) {
6775                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6776                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6777                                              info.attrs, BP_MEM_READ, retaddr);
6778                     }
6779                     if (mtedesc && info.tagged) {
6780                         mte_check(env, mtedesc, addr, retaddr);
6781                     }
6782                     if (unlikely(info.flags & TLB_MMIO)) {
6783                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6784                     } else {
6785                         set_helper_retaddr(retaddr);
6786                         host_fn(&scratch, reg_off, info.host);
6787                         clear_helper_retaddr();
6788                     }
6789                 } else {
6790                     /* Element crosses the page boundary. */
6791                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6792                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6793                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6794                         cpu_check_watchpoint(env_cpu(env), addr,
6795                                              msize, info.attrs,
6796                                              BP_MEM_READ, retaddr);
6797                     }
6798                     if (mtedesc && info.tagged) {
6799                         mte_check(env, mtedesc, addr, retaddr);
6800                     }
6801                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6802                 }
6803             }
6804             reg_off += esize;
6805             pg >>= esize;
6806         } while (reg_off & 63);
6807     } while (reg_off < reg_max);
6808 
6809     /* Wait until all exceptions have been raised to write back.  */
6810     memcpy(vd, &scratch, reg_max);
6811 }
6812 
6813 static inline QEMU_ALWAYS_INLINE
6814 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6815                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6816                    int esize, int msize, zreg_off_fn *off_fn,
6817                    sve_ldst1_host_fn *host_fn,
6818                    sve_ldst1_tlb_fn *tlb_fn)
6819 {
6820     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6821     /* Remove mtedesc from the normal sve descriptor. */
6822     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6823 
6824     /*
6825      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6826      * offset base entirely over the address space hole to change the
6827      * pointer tag, or change the bit55 selector.  So we could here
6828      * examine TBI + TCMA like we do for sve_ldN_r_mte().
6829      */
6830     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6831               esize, msize, off_fn, host_fn, tlb_fn);
6832 }
6833 
6834 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6835 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6836                                  void *vm, target_ulong base, uint32_t desc) \
6837 {                                                                            \
6838     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6839               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6840 }                                                                            \
6841 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6842      void *vm, target_ulong base, uint32_t desc)                             \
6843 {                                                                            \
6844     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6845                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6846 }
6847 
6848 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6849 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6850                                  void *vm, target_ulong base, uint32_t desc) \
6851 {                                                                            \
6852     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6853               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6854 }                                                                            \
6855 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6856     void *vm, target_ulong base, uint32_t desc)                              \
6857 {                                                                            \
6858     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6859                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6860 }
6861 
6862 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6863 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6864 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6865 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6866 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6867 
6868 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6869 DO_LD1_ZPZ_S(bss, zss, MO_8)
6870 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6871 DO_LD1_ZPZ_D(bds, zss, MO_8)
6872 DO_LD1_ZPZ_D(bds, zd, MO_8)
6873 
6874 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6875 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6876 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6877 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6878 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6879 
6880 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6881 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6882 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6883 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6884 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6885 
6886 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6887 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6888 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6889 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6890 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6891 
6892 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6893 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6894 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6895 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6896 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6897 
6898 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6899 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6900 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6901 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6902 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6903 
6904 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6905 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6906 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6907 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6908 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6909 
6910 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6911 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6912 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6913 
6914 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6915 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6916 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6917 
6918 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6919 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6920 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6921 
6922 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6923 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6924 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6925 
6926 #undef DO_LD1_ZPZ_S
6927 #undef DO_LD1_ZPZ_D
6928 
6929 /* First fault loads with a vector index.  */
6930 
6931 /*
6932  * Common helpers for all gather first-faulting loads.
6933  */
6934 
6935 static inline QEMU_ALWAYS_INLINE
6936 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6937                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6938                  uint32_t mtedesc, const int esz, const int msz,
6939                  zreg_off_fn *off_fn,
6940                  sve_ldst1_host_fn *host_fn,
6941                  sve_ldst1_tlb_fn *tlb_fn)
6942 {
6943     const int mmu_idx = arm_env_mmu_index(env);
6944     const intptr_t reg_max = simd_oprsz(desc);
6945     const int scale = simd_data(desc);
6946     const int esize = 1 << esz;
6947     const int msize = 1 << msz;
6948     intptr_t reg_off;
6949     SVEHostPage info;
6950     target_ulong addr, in_page;
6951     ARMVectorReg scratch;
6952 
6953     /* Skip to the first true predicate.  */
6954     reg_off = find_next_active(vg, 0, reg_max, esz);
6955     if (unlikely(reg_off >= reg_max)) {
6956         /* The entire predicate was false; no load occurs.  */
6957         memset(vd, 0, reg_max);
6958         return;
6959     }
6960 
6961     /* Protect against overlap between vd and vm. */
6962     if (unlikely(vd == vm)) {
6963         vm = memcpy(&scratch, vm, reg_max);
6964     }
6965 
6966     /*
6967      * Probe the first element, allowing faults.
6968      */
6969     addr = base + (off_fn(vm, reg_off) << scale);
6970     if (mtedesc) {
6971         mte_check(env, mtedesc, addr, retaddr);
6972     }
6973     tlb_fn(env, vd, reg_off, addr, retaddr);
6974 
6975     /* After any fault, zero the other elements. */
6976     swap_memzero(vd, reg_off);
6977     reg_off += esize;
6978     swap_memzero(vd + reg_off, reg_max - reg_off);
6979 
6980     /*
6981      * Probe the remaining elements, not allowing faults.
6982      */
6983     while (reg_off < reg_max) {
6984         uint64_t pg = vg[reg_off >> 6];
6985         do {
6986             if (likely((pg >> (reg_off & 63)) & 1)) {
6987                 addr = base + (off_fn(vm, reg_off) << scale);
6988                 in_page = -(addr | TARGET_PAGE_MASK);
6989 
6990                 if (unlikely(in_page < msize)) {
6991                     /* Stop if the element crosses a page boundary. */
6992                     goto fault;
6993                 }
6994 
6995                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6996                                mmu_idx, retaddr);
6997                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6998                     goto fault;
6999                 }
7000                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7001                     (cpu_watchpoint_address_matches
7002                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7003                     goto fault;
7004                 }
7005                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7006                     goto fault;
7007                 }
7008 
7009                 set_helper_retaddr(retaddr);
7010                 host_fn(vd, reg_off, info.host);
7011                 clear_helper_retaddr();
7012             }
7013             reg_off += esize;
7014         } while (reg_off & 63);
7015     }
7016     return;
7017 
7018  fault:
7019     record_fault(env, reg_off, reg_max);
7020 }
7021 
7022 static inline QEMU_ALWAYS_INLINE
7023 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7024                      target_ulong base, uint32_t desc, uintptr_t retaddr,
7025                      const int esz, const int msz,
7026                      zreg_off_fn *off_fn,
7027                      sve_ldst1_host_fn *host_fn,
7028                      sve_ldst1_tlb_fn *tlb_fn)
7029 {
7030     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7031     /* Remove mtedesc from the normal sve descriptor. */
7032     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7033 
7034     /*
7035      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7036      * offset base entirely over the address space hole to change the
7037      * pointer tag, or change the bit55 selector.  So we could here
7038      * examine TBI + TCMA like we do for sve_ldN_r_mte().
7039      */
7040     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7041                 esz, msz, off_fn, host_fn, tlb_fn);
7042 }
7043 
7044 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7045 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7046     (CPUARMState *env, void *vd, void *vg,                              \
7047      void *vm, target_ulong base, uint32_t desc)                        \
7048 {                                                                       \
7049     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7050                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7051 }                                                                       \
7052 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7053     (CPUARMState *env, void *vd, void *vg,                              \
7054      void *vm, target_ulong base, uint32_t desc)                        \
7055 {                                                                       \
7056     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7057                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7058 }
7059 
7060 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7061 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7062     (CPUARMState *env, void *vd, void *vg,                              \
7063      void *vm, target_ulong base, uint32_t desc)                        \
7064 {                                                                       \
7065     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7066                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7067 }                                                                       \
7068 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7069     (CPUARMState *env, void *vd, void *vg,                              \
7070      void *vm, target_ulong base, uint32_t desc)                        \
7071 {                                                                       \
7072     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7073                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7074 }
7075 
7076 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7077 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7078 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7079 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7080 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7081 
7082 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7083 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7084 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7085 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7086 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7087 
7088 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7089 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7090 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7091 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7092 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7093 
7094 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7095 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7096 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7097 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7098 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7099 
7100 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7101 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7102 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7103 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7104 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7105 
7106 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7107 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7108 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7109 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7110 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7111 
7112 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7113 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7114 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7115 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7116 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7117 
7118 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7119 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7120 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7121 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7122 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7123 
7124 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7125 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7126 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7127 
7128 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7129 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7130 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7131 
7132 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7133 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7134 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7135 
7136 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7137 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7138 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7139 
7140 /* Stores with a vector index.  */
7141 
7142 static inline QEMU_ALWAYS_INLINE
7143 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7144                target_ulong base, uint32_t desc, uintptr_t retaddr,
7145                uint32_t mtedesc, int esize, int msize,
7146                zreg_off_fn *off_fn,
7147                sve_ldst1_host_fn *host_fn,
7148                sve_ldst1_tlb_fn *tlb_fn)
7149 {
7150     const int mmu_idx = arm_env_mmu_index(env);
7151     const intptr_t reg_max = simd_oprsz(desc);
7152     const int scale = simd_data(desc);
7153     void *host[ARM_MAX_VQ * 4];
7154     intptr_t reg_off, i;
7155     SVEHostPage info, info2;
7156 
7157     /*
7158      * Probe all of the elements for host addresses and flags.
7159      */
7160     i = reg_off = 0;
7161     do {
7162         uint64_t pg = vg[reg_off >> 6];
7163         do {
7164             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7165             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7166 
7167             host[i] = NULL;
7168             if (likely((pg >> (reg_off & 63)) & 1)) {
7169                 if (likely(in_page >= msize)) {
7170                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7171                                    mmu_idx, retaddr);
7172                     if (!(info.flags & TLB_MMIO)) {
7173                         host[i] = info.host;
7174                     }
7175                 } else {
7176                     /*
7177                      * Element crosses the page boundary.
7178                      * Probe both pages, but do not record the host address,
7179                      * so that we use the slow path.
7180                      */
7181                     sve_probe_page(&info, false, env, addr, 0,
7182                                    MMU_DATA_STORE, mmu_idx, retaddr);
7183                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7184                                    MMU_DATA_STORE, mmu_idx, retaddr);
7185                     info.flags |= info2.flags;
7186                 }
7187 
7188                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7189                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7190                                          info.attrs, BP_MEM_WRITE, retaddr);
7191                 }
7192 
7193                 if (mtedesc && info.tagged) {
7194                     mte_check(env, mtedesc, addr, retaddr);
7195                 }
7196             }
7197             i += 1;
7198             reg_off += esize;
7199         } while (reg_off & 63);
7200     } while (reg_off < reg_max);
7201 
7202     /*
7203      * Now that we have recognized all exceptions except SyncExternal
7204      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7205      *
7206      * Note for the common case of an element in RAM, not crossing a page
7207      * boundary, we have stored the host address in host[].  This doubles
7208      * as a first-level check against the predicate, since only enabled
7209      * elements have non-null host addresses.
7210      */
7211     i = reg_off = 0;
7212     do {
7213         void *h = host[i];
7214         if (likely(h != NULL)) {
7215             set_helper_retaddr(retaddr);
7216             host_fn(vd, reg_off, h);
7217             clear_helper_retaddr();
7218         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7219             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7220             tlb_fn(env, vd, reg_off, addr, retaddr);
7221         }
7222         i += 1;
7223         reg_off += esize;
7224     } while (reg_off < reg_max);
7225 }
7226 
7227 static inline QEMU_ALWAYS_INLINE
7228 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7229                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7230                    int esize, int msize, zreg_off_fn *off_fn,
7231                    sve_ldst1_host_fn *host_fn,
7232                    sve_ldst1_tlb_fn *tlb_fn)
7233 {
7234     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7235     /* Remove mtedesc from the normal sve descriptor. */
7236     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7237 
7238     /*
7239      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7240      * offset base entirely over the address space hole to change the
7241      * pointer tag, or change the bit55 selector.  So we could here
7242      * examine TBI + TCMA like we do for sve_ldN_r_mte().
7243      */
7244     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7245               esize, msize, off_fn, host_fn, tlb_fn);
7246 }
7247 
7248 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7249 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7250                                  void *vm, target_ulong base, uint32_t desc) \
7251 {                                                                       \
7252     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7253               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7254 }                                                                       \
7255 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7256     void *vm, target_ulong base, uint32_t desc)                         \
7257 {                                                                       \
7258     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7259                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7260 }
7261 
7262 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7263 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7264                                  void *vm, target_ulong base, uint32_t desc) \
7265 {                                                                       \
7266     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7267               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7268 }                                                                       \
7269 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7270     void *vm, target_ulong base, uint32_t desc)                         \
7271 {                                                                       \
7272     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7273                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7274 }
7275 
7276 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7277 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7278 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7279 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7280 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7281 
7282 DO_ST1_ZPZ_S(bs, zss, MO_8)
7283 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7284 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7285 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7286 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7287 
7288 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7289 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7290 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7291 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7292 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7293 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7294 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7295 
7296 DO_ST1_ZPZ_D(bd, zss, MO_8)
7297 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7298 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7299 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7300 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7301 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7302 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7303 
7304 DO_ST1_ZPZ_D(bd, zd, MO_8)
7305 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7306 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7307 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7308 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7309 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7310 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7311 
7312 #undef DO_ST1_ZPZ_S
7313 #undef DO_ST1_ZPZ_D
7314 
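/*
 * SVE2 bitwise ternary operations.  Reading from the loops below:
 *   EOR3:  d = n ^ m ^ k
 *   BCAX:  d = n ^ (m & ~k)
 *   BSL1N: d = (~n & k) | (m & ~k)
 *   BSL2N: d = (n & k) | (~m & ~k)
 *   NBSL:  d = ~((n & k) | (m & ~k))
 * i.e. the BSL forms select bit-by-bit between (possibly inverted)
 * n and m under control of k.
 */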
7315 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7316 {
7317     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7318     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7319 
7320     for (i = 0; i < opr_sz; ++i) {
7321         d[i] = n[i] ^ m[i] ^ k[i];
7322     }
7323 }
7324 
7325 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7326 {
7327     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7328     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7329 
7330     for (i = 0; i < opr_sz; ++i) {
7331         d[i] = n[i] ^ (m[i] & ~k[i]);
7332     }
7333 }
7334 
7335 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7336 {
7337     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7338     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7339 
7340     for (i = 0; i < opr_sz; ++i) {
7341         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7342     }
7343 }
7344 
7345 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7346 {
7347     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7348     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7349 
7350     for (i = 0; i < opr_sz; ++i) {
7351         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7352     }
7353 }
7354 
7355 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7356 {
7357     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7358     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7359 
7360     for (i = 0; i < opr_sz; ++i) {
7361         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7362     }
7363 }
7364 
7365 /*
7366  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7367  * See hasless(v,1) from
7368  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7369  */
7370 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7371 {
7372     int bits = 8 << esz;
7373     uint64_t ones = dup_const(esz, 1);
7374     uint64_t signs = ones << (bits - 1);
7375     uint64_t cmp0, cmp1;
7376 
7377     cmp1 = dup_const(esz, n);
7378     cmp0 = cmp1 ^ m0;
7379     cmp1 = cmp1 ^ m1;
7380     cmp0 = (cmp0 - ones) & ~cmp0;
7381     cmp1 = (cmp1 - ones) & ~cmp1;
7382     return (cmp0 | cmp1) & signs;
7383 }
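
/*
 * Worked example of the zero-in-word trick above, for esz == MO_8:
 * with n == 0x42 and lane 2 of m0 equal to 0x42,
 *   cmp0 = dup(0x42) ^ m0   ->  lane 2 becomes 0x00
 *   cmp0 - ones             ->  lane 2 wraps to 0xff, sign bit set
 *   ... & ~cmp0 & signs     ->  lane 2 keeps its sign bit
 * When no lane of cmp0 is zero there are no inter-lane borrows, and
 * each lane either clears its sign bit in the subtraction (values
 * 0x01..0x80) or has it removed by ~cmp0 (values >= 0x81), so the
 * result is nonzero exactly when some lane of m0 or m1 equals n.
 */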
7384 
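/*
 * Common routine for MATCH/NMATCH.  pred_esz_masks[esz] keeps only the
 * predicate bits that can be active for this element size, e.g. for
 * MO_16 each 16-bit governing-predicate chunk is masked with 0x5555 so
 * that only even-numbered bits (one per 2-byte element) are tested.
 */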
7385 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7386                                 uint32_t desc, int esz, bool nmatch)
7387 {
7388     uint16_t esz_mask = pred_esz_masks[esz];
7389     intptr_t opr_sz = simd_oprsz(desc);
7390     uint32_t flags = PREDTEST_INIT;
7391     intptr_t i, j, k;
7392 
7393     for (i = 0; i < opr_sz; i += 16) {
7394         uint64_t m0 = *(uint64_t *)(vm + i);
7395         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7396         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7397         uint16_t out = 0;
7398 
7399         for (j = 0; j < 16; j += 8) {
7400             uint64_t n = *(uint64_t *)(vn + i + j);
7401 
7402             for (k = 0; k < 8; k += 1 << esz) {
7403                 if (pg & (1 << (j + k))) {
7404                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7405                     out |= (o ^ nmatch) << (j + k);
7406                 }
7407             }
7408         }
7409         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7410         flags = iter_predtest_fwd(out, pg, flags);
7411     }
7412     return flags;
7413 }
7414 
7415 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7416 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7417 {                                                                             \
7418     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7419 }
7420 
7421 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7422 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7423 
7424 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7425 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7426 
7427 #undef DO_PPZZ_MATCH
7428 
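/*
 * HISTCNT: for each active element i, d[i] is the number of active
 * elements j <= i for which m[j] == n[i]; inactive elements of d are
 * zeroed.  E.g. with all lanes active and n == m == {3, 5, 3, 3},
 * the result is {1, 1, 2, 3}.
 */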
7429 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7430                             uint32_t desc)
7431 {
7432     ARMVectorReg scratch;
7433     intptr_t i, j;
7434     intptr_t opr_sz = simd_oprsz(desc);
7435     uint32_t *d = vd, *n = vn, *m = vm;
7436     uint8_t *pg = vg;
7437 
7438     if (d == n) {
7439         n = memcpy(&scratch, n, opr_sz);
7440         if (d == m) {
7441             m = n;
7442         }
7443     } else if (d == m) {
7444         m = memcpy(&scratch, m, opr_sz);
7445     }
7446 
7447     for (i = 0; i < opr_sz; i += 4) {
7448         uint64_t count = 0;
7449         uint8_t pred;
7450 
7451         pred = pg[H1(i >> 3)] >> (i & 7);
7452         if (pred & 1) {
7453             uint32_t nn = n[H4(i >> 2)];
7454 
7455             for (j = 0; j <= i; j += 4) {
7456                 pred = pg[H1(j >> 3)] >> (j & 7);
7457                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7458                     ++count;
7459                 }
7460             }
7461         }
7462         d[H4(i >> 2)] = count;
7463     }
7464 }
7465 
7466 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7467                             uint32_t desc)
7468 {
7469     ARMVectorReg scratch;
7470     intptr_t i, j;
7471     intptr_t opr_sz = simd_oprsz(desc);
7472     uint64_t *d = vd, *n = vn, *m = vm;
7473     uint8_t *pg = vg;
7474 
7475     if (d == n) {
7476         n = memcpy(&scratch, n, opr_sz);
7477         if (d == m) {
7478             m = n;
7479         }
7480     } else if (d == m) {
7481         m = memcpy(&scratch, m, opr_sz);
7482     }
7483 
7484     for (i = 0; i < opr_sz / 8; ++i) {
7485         uint64_t count = 0;
7486         if (pg[H1(i)] & 1) {
7487             uint64_t nn = n[i];
7488             for (j = 0; j <= i; ++j) {
7489                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7490                     ++count;
7491                 }
7492             }
7493         }
7494         d[i] = count;
7495     }
7496 }
7497 
7498 /*
7499  * Returns the number of bytes in m0 and m1 that match n.
7500  * Unlike do_match2 we don't just need true/false, we need an exact count.
7501  * This requires two extra logical operations.
7502  */
7503 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7504 {
7505     const uint64_t mask = dup_const(MO_8, 0x7f);
7506     uint64_t cmp0, cmp1;
7507 
7508     cmp1 = dup_const(MO_8, n);
7509     cmp0 = cmp1 ^ m0;
7510     cmp1 = cmp1 ^ m1;
7511 
7512     /*
7513      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7514      * 2: carry in to msb if byte != 0 (+ mask)
7515      * 3: set msb if cmp has msb set (| cmp)
7516      * 4: set ~msb to ignore them (| mask)
7517      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7518      * 5: invert, resulting in 0x80 if and only if byte == 0.
7519      */
7520     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7521     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7522 
7523     /*
7524      * Combine the two compares in a way that the bits do
7525      * not overlap, and so preserves the count of set bits.
7526      * If the host has an efficient instruction for ctpop,
7527      * then ctpop(x) + ctpop(y) has the same number of
7528      * operations as ctpop(x | (y >> 1)).  If the host does
7529      * not have an efficient ctpop, then we only want to
7530      * use it once.
7531      */
7532     return ctpop64(cmp0 | (cmp1 >> 1));
7533 }
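
/*
 * For example, a lane of cmp0 equal to 0x00 goes through steps 1-5 as
 *   (0x00 & 0x7f) = 0x00 -> + 0x7f = 0x7f -> | 0x00 = 0x7f
 *   -> | 0x7f = 0x7f -> ~ = 0x80
 * while any nonzero lane ends up as 0x00.  Shifting cmp1 right by one
 * moves its 0x80 markers to bit 6 of each lane, so cmp0 | (cmp1 >> 1)
 * has one set bit per matching byte and a single ctpop64 counts both.
 */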
7534 
7535 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7536 {
7537     intptr_t i, j;
7538     intptr_t opr_sz = simd_oprsz(desc);
7539 
7540     for (i = 0; i < opr_sz; i += 16) {
7541         uint64_t n0 = *(uint64_t *)(vn + i);
7542         uint64_t m0 = *(uint64_t *)(vm + i);
7543         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7544         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7545         uint64_t out0 = 0;
7546         uint64_t out1 = 0;
7547 
7548         for (j = 0; j < 64; j += 8) {
7549             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7550             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7551             out0 |= cnt0 << j;
7552             out1 |= cnt1 << j;
7553         }
7554 
7555         *(uint64_t *)(vd + i) = out0;
7556         *(uint64_t *)(vd + i + 8) = out1;
7557     }
7558 }
7559 
7560 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7561 {
7562     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7563     int shr = simd_data(desc);
7564     int shl = 8 - shr;
7565     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7566     uint64_t *d = vd, *n = vn, *m = vm;
7567 
7568     for (i = 0; i < opr_sz; ++i) {
7569         uint64_t t = n[i] ^ m[i];
7570         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7571     }
7572 }
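
/*
 * The mask implements a per-byte rotate within each uint64_t, e.g. for
 * shr == 3: mask = dup(0x1f), so (t >> 3) & mask keeps the top five
 * bits of each byte (now in bits 4:0) and discards bits that slid in
 * from the byte above, while (t << 5) & ~mask keeps the original low
 * three bits of each byte in bits 7:5.
 */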
7573 
7574 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7575 {
7576     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7577     int shr = simd_data(desc);
7578     int shl = 16 - shr;
7579     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7580     uint64_t *d = vd, *n = vn, *m = vm;
7581 
7582     for (i = 0; i < opr_sz; ++i) {
7583         uint64_t t = n[i] ^ m[i];
7584         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7585     }
7586 }
7587 
7588 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7589 {
7590     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7591     int shr = simd_data(desc);
7592     uint32_t *d = vd, *n = vn, *m = vm;
7593 
7594     for (i = 0; i < opr_sz; ++i) {
7595         d[i] = ror32(n[i] ^ m[i], shr);
7596     }
7597 }
7598 
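/*
 * FMMLA: each 128-bit segment of the operands holds a row-major 2x2
 * matrix.  The expansions below compute d = a + n * m^T per segment,
 * e.g. d[0][0] = a[0][0] + n[0][0]*m[0][0] + n[0][1]*m[0][1].
 */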
7599 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7600                      float_status *status, uint32_t desc)
7601 {
7602     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7603 
7604     for (s = 0; s < opr_sz; ++s) {
7605         float32 *n = vn + s * sizeof(float32) * 4;
7606         float32 *m = vm + s * sizeof(float32) * 4;
7607         float32 *a = va + s * sizeof(float32) * 4;
7608         float32 *d = vd + s * sizeof(float32) * 4;
7609         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7610         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7611         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7612         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7613         float32 p0, p1;
7614 
7615         /* i = 0, j = 0 */
7616         p0 = float32_mul(n00, m00, status);
7617         p1 = float32_mul(n01, m01, status);
7618         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7619 
7620         /* i = 0, j = 1 */
7621         p0 = float32_mul(n00, m10, status);
7622         p1 = float32_mul(n01, m11, status);
7623         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7624 
7625         /* i = 1, j = 0 */
7626         p0 = float32_mul(n10, m00, status);
7627         p1 = float32_mul(n11, m01, status);
7628         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7629 
7630         /* i = 1, j = 1 */
7631         p0 = float32_mul(n10, m10, status);
7632         p1 = float32_mul(n11, m11, status);
7633         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7634     }
7635 }
7636 
7637 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7638                      float_status *status, uint32_t desc)
7639 {
7640     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7641 
7642     for (s = 0; s < opr_sz; ++s) {
7643         float64 *n = vn + s * sizeof(float64) * 4;
7644         float64 *m = vm + s * sizeof(float64) * 4;
7645         float64 *a = va + s * sizeof(float64) * 4;
7646         float64 *d = vd + s * sizeof(float64) * 4;
7647         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7648         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7649         float64 p0, p1;
7650 
7651         /* i = 0, j = 0 */
7652         p0 = float64_mul(n00, m00, status);
7653         p1 = float64_mul(n01, m01, status);
7654         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7655 
7656         /* i = 0, j = 1 */
7657         p0 = float64_mul(n00, m10, status);
7658         p1 = float64_mul(n01, m11, status);
7659         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7660 
7661         /* i = 1, j = 0 */
7662         p0 = float64_mul(n10, m00, status);
7663         p1 = float64_mul(n11, m01, status);
7664         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7665 
7666         /* i = 1, j = 1 */
7667         p0 = float64_mul(n10, m10, status);
7668         p1 = float64_mul(n11, m11, status);
7669         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7670     }
7671 }
7672 
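/*
 * FCVTNT/BFCVTNT convert each wide element and write the result into
 * the top half of the corresponding destination slot (the odd-numbered
 * narrow elements), leaving the bottom half unchanged; FCVTLT reads
 * that top half and widens it.  The loops below walk the vector from
 * the end back towards element 0.
 */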
7673 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7674 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7675                   float_status *status, uint32_t desc)                        \
7676 {                                                                             \
7677     intptr_t i = simd_oprsz(desc);                                            \
7678     uint64_t *g = vg;                                                         \
7679     do {                                                                      \
7680         uint64_t pg = g[(i - 1) >> 6];                                        \
7681         do {                                                                  \
7682             i -= sizeof(TYPEW);                                               \
7683             if (likely((pg >> (i & 63)) & 1)) {                               \
7684                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7685                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7686             }                                                                 \
7687         } while (i & 63);                                                     \
7688     } while (i != 0);                                                         \
7689 }
7690 
7691 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7692 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7693 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7694 
7695 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7696 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7697                   float_status *status, uint32_t desc)                        \
7698 {                                                                             \
7699     intptr_t i = simd_oprsz(desc);                                            \
7700     uint64_t *g = vg;                                                         \
7701     do {                                                                      \
7702         uint64_t pg = g[(i - 1) >> 6];                                        \
7703         do {                                                                  \
7704             i -= sizeof(TYPEW);                                               \
7705             if (likely((pg >> (i & 63)) & 1)) {                               \
7706                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7707                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7708             }                                                                 \
7709         } while (i & 63);                                                     \
7710     } while (i != 0);                                                         \
7711 }
7712 
7713 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7714 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7715 
7716 #undef DO_FCVTLT
7717 #undef DO_FCVTNT
7718