1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/page-protection.h"
24 #include "exec/helper-proto.h"
25 #include "exec/target_page.h"
26 #include "exec/tlb-flags.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg.h"
30 #include "vec_internal.h"
31 #include "sve_ldst_internal.h"
32 #include "accel/tcg/cpu-ldst.h"
33 #include "accel/tcg/helper-retaddr.h"
34 #include "accel/tcg/cpu-ops.h"
35 #include "accel/tcg/probe.h"
36 #ifdef CONFIG_USER_ONLY
37 #include "user/page-protection.h"
38 #endif
39
40
41 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
42 *
43 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
44 * and bit 0 set if C is set. Compare the definitions of these variables
45 * within CPUARMState.
46 */
47
48 /* For no G bits set, NZCV = C. */
49 #define PREDTEST_INIT 1
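/*
 * Worked example, assuming byte elements: for Pd = 0x01 and Pg = 0x0f,
 * the first active element of Pd is set (N = 1), some active element is
 * set (Z = 0) and the last active element of Pd is clear (C = 1), so
 * iter_predtest_fwd() below returns 0x80000007; bit 2 is internal
 * scratch recording that an active predicate bit has been seen.
 */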
50
51 /* This is an iterative function, called for each Pd and Pg word
52 * moving forward.
53 */
54 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
55 {
56 if (likely(g)) {
57 /* Compute N from first D & G.
58 Use bit 2 to signal first G bit seen. */
59 if (!(flags & 4)) {
60 flags |= ((d & (g & -g)) != 0) << 31;
61 flags |= 4;
62 }
63
64 /* Accumulate Z from each D & G. */
65 flags |= ((d & g) != 0) << 1;
66
67 /* Compute C from last !(D & G). Replace previous. */
68 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
69 }
70 return flags;
71 }
72
73 /* This is an iterative function, called for each Pd and Pg word
74 * moving backward.
75 */
76 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
77 {
78 if (likely(g)) {
79 /* Compute C from first (i.e. last) !(D & G).
80 Use bit 2 to signal first G bit seen. */
81 if (!(flags & 4)) {
82 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
83 flags |= (d & pow2floor(g)) == 0;
84 }
85
86 /* Accumulate Z from each D & G. */
87 flags |= ((d & g) != 0) << 1;
88
89 /* Compute N from last (i.e. first) D & G. Replace previous. */
90 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
91 }
92 return flags;
93 }
94
95 /* The same for a single word predicate. */
96 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
97 {
98 return iter_predtest_fwd(d, g, PREDTEST_INIT);
99 }
100
101 /* The same for a multi-word predicate. */
102 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
103 {
104 uint32_t flags = PREDTEST_INIT;
105 uint64_t *d = vd, *g = vg;
106 uintptr_t i = 0;
107
108 do {
109 flags = iter_predtest_fwd(d[i], g[i], flags);
110 } while (++i < words);
111
112 return flags;
113 }
114
115 /* Expand predicate bits 0 and 4 into masks for single word (32-bit) elements. */
116 static inline uint64_t expand_pred_s(uint8_t byte)
117 {
118 static const uint64_t word[] = {
119 [0x01] = 0x00000000ffffffffull,
120 [0x10] = 0xffffffff00000000ull,
121 [0x11] = 0xffffffffffffffffull,
122 };
123 return word[byte & 0x11];
124 }
125
126 #define LOGICAL_PPPP(NAME, FUNC) \
127 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
128 { \
129 uintptr_t opr_sz = simd_oprsz(desc); \
130 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
131 uintptr_t i; \
132 for (i = 0; i < opr_sz / 8; ++i) { \
133 d[i] = FUNC(n[i], m[i], g[i]); \
134 } \
135 }
136
137 #define DO_AND(N, M, G) (((N) & (M)) & (G))
138 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
139 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
140 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
141 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
142 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
143 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
144 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
145
146 LOGICAL_PPPP(sve_and_pppp, DO_AND)
147 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
148 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
149 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
150 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
151 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
152 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
153 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
154
155 #undef DO_AND
156 #undef DO_BIC
157 #undef DO_EOR
158 #undef DO_ORR
159 #undef DO_ORN
160 #undef DO_NOR
161 #undef DO_NAND
162 #undef DO_SEL
163 #undef LOGICAL_PPPP
164
165 /* Fully general three-operand expander, controlled by a predicate.
166 * This is complicated by the host-endian storage of the register file.
167 */
168 /* ??? I don't expect the compiler could ever vectorize this itself.
169 * With some tables we can convert bit masks to byte masks, and with
170 * extra care wrt byte/word ordering we could use gcc generic vectors
171 * and do 16 bytes at a time.
172 */
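/*
 * Each outer iteration below consumes 16 bytes of vector data and the
 * corresponding 16 predicate bits; only the least significant bit of
 * each element-sized group of predicate bits is tested.
 */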
173 #define DO_ZPZZ(NAME, TYPE, H, OP) \
174 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
175 { \
176 intptr_t i, opr_sz = simd_oprsz(desc); \
177 for (i = 0; i < opr_sz; ) { \
178 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
179 do { \
180 if (pg & 1) { \
181 TYPE nn = *(TYPE *)(vn + H(i)); \
182 TYPE mm = *(TYPE *)(vm + H(i)); \
183 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
184 } \
185 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
186 } while (i & 15); \
187 } \
188 }
189
190 /* Similarly, specialized for 64-bit operands. */
191 #define DO_ZPZZ_D(NAME, TYPE, OP) \
192 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
193 { \
194 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
195 TYPE *d = vd, *n = vn, *m = vm; \
196 uint8_t *pg = vg; \
197 for (i = 0; i < opr_sz; i += 1) { \
198 if (pg[H1(i)] & 1) { \
199 TYPE nn = n[i], mm = m[i]; \
200 d[i] = OP(nn, mm); \
201 } \
202 } \
203 }
204
205 #define DO_AND(N, M) (N & M)
206 #define DO_EOR(N, M) (N ^ M)
207 #define DO_ORR(N, M) (N | M)
208 #define DO_BIC(N, M) (N & ~M)
209 #define DO_ADD(N, M) (N + M)
210 #define DO_SUB(N, M) (N - M)
211 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
212 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
213 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
214 #define DO_MUL(N, M) (N * M)
215
216
217 /*
218 * We must avoid the C undefined behaviour cases: division by
219 * zero and signed division of INT_MIN by -1. Both of these
220 * have architecturally defined required results for Arm.
221 * We special case all signed divisions by -1 to avoid having
222 * to deduce the minimum integer for the type involved.
223 */
224 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
225 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
226
227 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
228 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
229 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
230 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
231
232 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
233 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
234 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
235 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
236
237 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
238 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
239 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
240 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
241
242 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
243 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
244 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
245 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
246
247 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
248 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
249 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
250 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
251
252 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
253 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
254 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
255 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
256
257 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
258 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
259 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
260 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
261
262 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
263 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
264 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
265 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
266
267 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
268 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
269 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
270 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
271
272 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
273 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
274 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
275 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
276
277 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
278 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
279 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
280 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
281
282 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
283 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
284 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
285 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
286
287 /* Because the computation type is at least twice as large as required,
288 these work for both signed and unsigned source types. */
289 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
290 {
291 return (n * m) >> 8;
292 }
293
294 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
295 {
296 return (n * m) >> 16;
297 }
298
299 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
300 {
301 return (n * m) >> 32;
302 }
303
304 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
305 {
306 uint64_t lo, hi;
307 muls64(&lo, &hi, n, m);
308 return hi;
309 }
310
311 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
312 {
313 uint64_t lo, hi;
314 mulu64(&lo, &hi, n, m);
315 return hi;
316 }
317
318 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
319 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
320 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
321 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
322
323 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
324 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
325 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
326 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
327
328 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
329 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
330 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
331 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
332
333 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
334 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
335
336 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
337 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
338
339 /* Note that all bits of the shift are significant
340 and not modulo the element size. */
341 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
342 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
343 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
344
345 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
348
349 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
350 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
351 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
352
353 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
354 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
355 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
356
357 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
358 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
359 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
360
361 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
362 {
363 int8_t n1 = n, n2 = n >> 8;
364 return m + n1 + n2;
365 }
366
367 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
368 {
369 int16_t n1 = n, n2 = n >> 16;
370 return m + n1 + n2;
371 }
372
373 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
374 {
375 int32_t n1 = n, n2 = n >> 32;
376 return m + n1 + n2;
377 }
378
379 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
380 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
381 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
382
383 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
384 {
385 uint8_t n1 = n, n2 = n >> 8;
386 return m + n1 + n2;
387 }
388
389 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
390 {
391 uint16_t n1 = n, n2 = n >> 16;
392 return m + n1 + n2;
393 }
394
395 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
396 {
397 uint32_t n1 = n, n2 = n >> 32;
398 return m + n1 + n2;
399 }
400
401 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
402 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
403 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
404
405 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
406 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
407 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
408 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
409
410 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
411 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
412 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
413 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
414
415 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
416 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
417 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
418 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
419
420 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
421 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
422 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
423 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
424
425 /*
426 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
427 * We pass in a pointer to a dummy saturation field to trigger
428 * the saturating arithmetic but discard the information about
429 * whether it has occurred.
430 */
431 #define do_sqshl_b(n, m) \
432 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
433 #define do_sqshl_h(n, m) \
434 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
435 #define do_sqshl_s(n, m) \
436 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
437 #define do_sqshl_d(n, m) \
438 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
439
440 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
441 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
442 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
443 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
444
445 #define do_uqshl_b(n, m) \
446 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
447 #define do_uqshl_h(n, m) \
448 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
449 #define do_uqshl_s(n, m) \
450 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
451 #define do_uqshl_d(n, m) \
452 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
453
454 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
455 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
456 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
457 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
458
459 #define do_sqrshl_b(n, m) \
460 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
461 #define do_sqrshl_h(n, m) \
462 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
463 #define do_sqrshl_s(n, m) \
464 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
465 #define do_sqrshl_d(n, m) \
466 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
467
468 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
469 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
470 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
471 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
472
473 #undef do_sqrshl_d
474
475 #define do_uqrshl_b(n, m) \
476 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
477 #define do_uqrshl_h(n, m) \
478 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
479 #define do_uqrshl_s(n, m) \
480 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
481 #define do_uqrshl_d(n, m) \
482 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
483
484 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
485 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
486 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
487 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
488
489 #undef do_uqrshl_d
490
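/*
 * Halving additions: the B/H/S forms compute in int64_t, which is wide
 * enough for both the signed and unsigned cases. There is no wider type
 * for the 64-bit form, so halve each operand first and add back the
 * carry that the two discarded low bits would have produced.
 */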
491 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
492 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
493
494 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
495 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
496 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
497 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
498
499 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
500 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
501 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
502 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
503
504 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
505 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
506
507 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
508 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
509 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
510 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
511
512 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
513 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
514 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
515 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
516
517 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
518 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
519
520 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
521 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
522 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
523 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
524
525 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
526 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
527 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
528 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
529
530 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
531 {
532 return val >= max ? max : val <= min ? min : val;
533 }
534
535 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
536 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
537 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
538
539 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
540 {
541 int64_t r = n + m;
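/* Overflow occurred iff n and m have the same sign and r differs from them in sign. */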
542 if (((r ^ n) & ~(n ^ m)) < 0) {
543 /* Signed overflow. */
544 return r < 0 ? INT64_MAX : INT64_MIN;
545 }
546 return r;
547 }
548
549 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
550 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
551 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
552 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
553
554 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
555 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
556 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
557
558 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
559 {
560 uint64_t r = n + m;
561 return r < n ? UINT64_MAX : r;
562 }
563
564 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
565 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
566 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
567 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
568
569 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
570 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
571 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
572
573 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
574 {
575 int64_t r = n - m;
576 if (((r ^ n) & (n ^ m)) < 0) {
577 /* Signed overflow. */
578 return r < 0 ? INT64_MAX : INT64_MIN;
579 }
580 return r;
581 }
582
583 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
584 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
585 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
586 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
587
588 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
589 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
590 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
591
592 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
593 {
594 return n > m ? n - m : 0;
595 }
596
597 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
598 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
599 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
600 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
601
602 #define DO_SUQADD_B(n, m) \
603 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
604 #define DO_SUQADD_H(n, m) \
605 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
606 #define DO_SUQADD_S(n, m) \
607 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
608
609 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
610 {
611 uint64_t r = n + m;
612
613 if (n < 0) {
614 /* Note that m - abs(n) cannot underflow. */
615 if (r > INT64_MAX) {
616 /* Result is either very large positive or negative. */
617 if (m > -n) {
618 /* m > abs(n), so r is a very large positive. */
619 return INT64_MAX;
620 }
621 /* Result is negative. */
622 }
623 } else {
624 /* Both inputs are positive: check for overflow. */
625 if (r < m || r > INT64_MAX) {
626 return INT64_MAX;
627 }
628 }
629 return r;
630 }
631
632 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
633 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
634 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
635 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
636
637 #define DO_USQADD_B(n, m) \
638 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
639 #define DO_USQADD_H(n, m) \
640 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
641 #define DO_USQADD_S(n, m) \
642 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
643
644 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
645 {
646 uint64_t r = n + m;
647
648 if (m < 0) {
649 return n < -m ? 0 : r;
650 }
651 return r < n ? UINT64_MAX : r;
652 }
653
654 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
655 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
656 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
657 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
658
659 #undef DO_ZPZZ
660 #undef DO_ZPZZ_D
661
662 /*
663 * Three operand expander, operating on element pairs.
664 * If the slot I is even, the elements come from VN {I, I+1}.
665 * If the slot I is odd, the elements come from VM {I-1, I}.
666 * Load all of the input elements in each pair before overwriting output.
667 */
668 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
669 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
670 { \
671 intptr_t i, opr_sz = simd_oprsz(desc); \
672 for (i = 0; i < opr_sz; ) { \
673 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
674 do { \
675 TYPE n0 = *(TYPE *)(vn + H(i)); \
676 TYPE m0 = *(TYPE *)(vm + H(i)); \
677 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
678 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
679 if (pg & 1) { \
680 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
681 } \
682 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
683 if (pg & 1) { \
684 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
685 } \
686 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
687 } while (i & 15); \
688 } \
689 }
690
691 /* Similarly, specialized for 64-bit operands. */
692 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
693 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
694 { \
695 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
696 TYPE *d = vd, *n = vn, *m = vm; \
697 uint8_t *pg = vg; \
698 for (i = 0; i < opr_sz; i += 2) { \
699 TYPE n0 = n[i], n1 = n[i + 1]; \
700 TYPE m0 = m[i], m1 = m[i + 1]; \
701 if (pg[H1(i)] & 1) { \
702 d[i] = OP(n0, n1); \
703 } \
704 if (pg[H1(i + 1)] & 1) { \
705 d[i + 1] = OP(m0, m1); \
706 } \
707 } \
708 }
709
710 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
711 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
712 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
713 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
714
715 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
716 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
717 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
718 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
719
720 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
721 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
722 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
723 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
724
725 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
726 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
727 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
728 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
729
730 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
731 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
732 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
733 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
734
735 #undef DO_ZPZZ_PAIR
736 #undef DO_ZPZZ_PAIR_D
737
738 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
739 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
740 float_status *status, uint32_t desc) \
741 { \
742 intptr_t i, opr_sz = simd_oprsz(desc); \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 TYPE n0 = *(TYPE *)(vn + H(i)); \
747 TYPE m0 = *(TYPE *)(vm + H(i)); \
748 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
749 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
750 if (pg & 1) { \
751 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
752 } \
753 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
754 if (pg & 1) { \
755 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
756 } \
757 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
758 } while (i & 15); \
759 } \
760 }
761
762 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
763 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
764 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
765
766 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
768 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
769
770 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
771 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
772 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
773
774 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
775 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
776 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
777
778 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
779 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
780 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
781
782 #undef DO_ZPZZ_PAIR_FP
783
784 /* Three-operand expander, controlled by a predicate, in which the
785 * third operand is "wide". That is, for D = N op M, the same 64-bit
786 * value of M is used with all of the narrower values of N.
787 */
788 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
789 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
790 { \
791 intptr_t i, opr_sz = simd_oprsz(desc); \
792 for (i = 0; i < opr_sz; ) { \
793 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
794 TYPEW mm = *(TYPEW *)(vm + i); \
795 do { \
796 if (pg & 1) { \
797 TYPE nn = *(TYPE *)(vn + H(i)); \
798 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
799 } \
800 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
801 } while (i & 7); \
802 } \
803 }
804
805 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
806 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
807 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
808
809 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
810 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
811 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
812
813 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
814 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
815 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
816
817 #undef DO_ZPZW
818
819 /* Fully general two-operand expander, controlled by a predicate.
820 */
821 #define DO_ZPZ(NAME, TYPE, H, OP) \
822 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
823 { \
824 intptr_t i, opr_sz = simd_oprsz(desc); \
825 for (i = 0; i < opr_sz; ) { \
826 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
827 do { \
828 if (pg & 1) { \
829 TYPE nn = *(TYPE *)(vn + H(i)); \
830 *(TYPE *)(vd + H(i)) = OP(nn); \
831 } \
832 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
833 } while (i & 15); \
834 } \
835 }
836
837 /* Similarly, specialized for 64-bit operands. */
838 #define DO_ZPZ_D(NAME, TYPE, OP) \
839 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
840 { \
841 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
842 TYPE *d = vd, *n = vn; \
843 uint8_t *pg = vg; \
844 for (i = 0; i < opr_sz; i += 1) { \
845 if (pg[H1(i)] & 1) { \
846 TYPE nn = n[i]; \
847 d[i] = OP(nn); \
848 } \
849 } \
850 }
851
852 #define DO_CLS_B(N) (clrsb32(N) - 24)
853 #define DO_CLS_H(N) (clrsb32(N) - 16)
854
855 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
856 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
857 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
858 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
859
860 #define DO_CLZ_B(N) (clz32(N) - 24)
861 #define DO_CLZ_H(N) (clz32(N) - 16)
862
863 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
864 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
865 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
866 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
867
868 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
869 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
870 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
871 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
872
873 #define DO_CNOT(N) (N == 0)
874
875 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
876 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
877 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
878 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
879
880 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
881
882 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
883 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
884 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
885
886 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
887 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
888 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
889
890 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
891 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
892 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
893
894 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
895
896 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
897 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
898 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
899
900 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
901 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
902 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
903
904 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
905 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
906 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
907
908 #define DO_NOT(N) (~N)
909
910 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
911 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
912 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
913 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
914
915 #define DO_SXTB(N) ((int8_t)N)
916 #define DO_SXTH(N) ((int16_t)N)
917 #define DO_SXTS(N) ((int32_t)N)
918 #define DO_UXTB(N) ((uint8_t)N)
919 #define DO_UXTH(N) ((uint16_t)N)
920 #define DO_UXTS(N) ((uint32_t)N)
921
922 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
923 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
924 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
925 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
926 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
927 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
928
929 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
930 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
931 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
932 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
933 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
934 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
935
936 #define DO_ABS(N) (N < 0 ? -N : N)
937
938 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
939 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
940 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
941 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
942
943 #define DO_NEG(N) (-N)
944
945 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
946 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
947 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
948 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
949
950 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
951 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
952 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
953
954 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
955 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
956
957 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
958
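/*
 * Swap the two 64-bit halves within each 128-bit quadword (REVD); the
 * predicate bit of the low doubleword governs the whole quadword.
 */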
959 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
960 {
961 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
962 uint64_t *d = vd, *n = vn;
963 uint8_t *pg = vg;
964
965 for (i = 0; i < opr_sz; i += 2) {
966 if (pg[H1(i)] & 1) {
967 uint64_t n0 = n[i + 0];
968 uint64_t n1 = n[i + 1];
969 d[i + 0] = n1;
970 d[i + 1] = n0;
971 }
972 }
973 }
974
975 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
976 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
977 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
978 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
979
980 #define DO_SQABS(X) \
981 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
982 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
983
984 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
985 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
986 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
987 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
988
989 #define DO_SQNEG(X) \
990 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
991 x_ == min_ ? -min_ - 1 : -x_; })
992
993 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
994 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
995 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
996 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
997
998 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
999 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1000
1001 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1002 */
1003 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1004 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1005 { \
1006 intptr_t i, opr_sz = simd_oprsz(desc); \
1007 for (i = 0; i < opr_sz; ) { \
1008 TYPEW mm = *(TYPEW *)(vm + i); \
1009 do { \
1010 TYPE nn = *(TYPE *)(vn + H(i)); \
1011 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1012 i += sizeof(TYPE); \
1013 } while (i & 7); \
1014 } \
1015 }
1016
1017 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1018 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1019 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1020
1021 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1022 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1023 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1024
1025 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1026 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1027 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1028
1029 #undef DO_ZZW
1030
1031 #undef DO_CLS_B
1032 #undef DO_CLS_H
1033 #undef DO_CLZ_B
1034 #undef DO_CLZ_H
1035 #undef DO_CNOT
1036 #undef DO_FABS
1037 #undef DO_FNEG
1038 #undef DO_ABS
1039 #undef DO_NEG
1040 #undef DO_ZPZ
1041 #undef DO_ZPZ_D
1042
1043 /*
1044 * Three-operand expander, unpredicated, in which the two inputs are
1045 * selected from the top or bottom half of the wide column.
1046 */
1047 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1048 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1049 { \
1050 intptr_t i, opr_sz = simd_oprsz(desc); \
1051 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1052 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1053 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1054 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1055 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1056 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1057 } \
1058 }
1059
1060 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1061 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1062 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1063
1064 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1065 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1066 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1067
1068 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1069 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1070 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1071
1072 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1073 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1074 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1075
1076 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1077 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1078 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1079
1080 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1081 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1082 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1083
1084 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1085 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1086 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1087
1088 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1089 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1090 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1091
1092 /* Note that the multiply cannot overflow, but the doubling can. */
1093 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1094 {
1095 int16_t val = n * m;
1096 return DO_SQADD_H(val, val);
1097 }
1098
1099 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1100 {
1101 int32_t val = n * m;
1102 return DO_SQADD_S(val, val);
1103 }
1104
1105 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1106 {
1107 int64_t val = n * m;
1108 return do_sqadd_d(val, val);
1109 }
1110
1111 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1112 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1113 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1114
1115 #undef DO_ZZZ_TB
1116
1117 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1118 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1119 { \
1120 intptr_t i, opr_sz = simd_oprsz(desc); \
1121 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1122 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1123 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1124 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1125 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1126 } \
1127 }
1128
1129 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1130 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1131 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1132
1133 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1134 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1135 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1136
1137 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1138 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1139 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1140
1141 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1142 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1143 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1144
1145 #undef DO_ZZZ_WTB
1146
1147 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1148 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1149 { \
1150 intptr_t i, opr_sz = simd_oprsz(desc); \
1151 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1152 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1153 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1154 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1155 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1156 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1157 } \
1158 }
1159
1160 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1161 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1162 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1163 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1164
1165 #undef DO_ZZZ_NTB
1166
1167 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1168 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1169 { \
1170 intptr_t i, opr_sz = simd_oprsz(desc); \
1171 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1172 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1173 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1174 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1175 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1176 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1177 } \
1178 }
1179
1180 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1181 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1182 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1183
1184 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1185 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1186 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1187
1188 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1189 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1190 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1191
1192 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1193 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1194 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1195
1196 #define DO_NMUL(N, M) -(N * M)
1197
1198 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1200 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1201
1202 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1203 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1204 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1205
1206 #undef DO_ZZZW_ACC
1207
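/*
 * Narrowing to the bottom half writes the whole wide element with the
 * top half cleared; narrowing to the top half writes only the odd
 * narrow slots, leaving the even slots of the destination intact.
 */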
1208 #define DO_XTNB(NAME, TYPE, OP) \
1209 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1210 { \
1211 intptr_t i, opr_sz = simd_oprsz(desc); \
1212 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1213 TYPE nn = *(TYPE *)(vn + i); \
1214 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1215 *(TYPE *)(vd + i) = nn; \
1216 } \
1217 }
1218
1219 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1220 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1221 { \
1222 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1223 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1224 TYPE nn = *(TYPE *)(vn + i); \
1225 *(TYPEN *)(vd + i + odd) = OP(nn); \
1226 } \
1227 }
1228
1229 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1230 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1231 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1232
1233 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1234 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1235 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1236
1237 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1238 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1239 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1240
1241 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1242 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1243 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1244
1245 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1246 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1247 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1248
1249 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1250 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1251 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1252
1253 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1254 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1255 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1256
1257 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1258 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1259 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1260
1261 #undef DO_XTNB
1262 #undef DO_XTNT
1263
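/*
 * Add/subtract with carry long: the low bit of the simd data field
 * selects the bottom or top 32-bit element of each 64-bit column, and
 * the next bit is set for the subtract forms, which add the one's
 * complement of the selected element.
 */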
1264 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1265 {
1266 intptr_t i, opr_sz = simd_oprsz(desc);
1267 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1268 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1269 uint32_t *a = va, *n = vn;
1270 uint64_t *d = vd, *m = vm;
1271
1272 for (i = 0; i < opr_sz / 8; ++i) {
1273 uint32_t e1 = a[2 * i + H4(0)];
1274 uint32_t e2 = n[2 * i + sel] ^ inv;
1275 uint64_t c = extract64(m[i], 32, 1);
1276 /* Compute and store the entire 33-bit result at once. */
1277 d[i] = c + e1 + e2;
1278 }
1279 }
1280
1281 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1282 {
1283 intptr_t i, opr_sz = simd_oprsz(desc);
1284 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1285 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1286 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1287
1288 for (i = 0; i < opr_sz / 8; i += 2) {
1289 Int128 e1 = int128_make64(a[i]);
1290 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1291 Int128 c = int128_make64(m[i + 1] & 1);
1292 Int128 r = int128_add(int128_add(e1, e2), c);
1293 d[i + 0] = int128_getlo(r);
1294 d[i + 1] = int128_gethi(r);
1295 }
1296 }
1297
1298 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1299 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1300 { \
1301 intptr_t i, opr_sz = simd_oprsz(desc); \
1302 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1303 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1304 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1305 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1306 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1307 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1308 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1309 } \
1310 }
1311
1312 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1313 do_sqdmull_h, DO_SQADD_H)
1314 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1315 do_sqdmull_s, DO_SQADD_S)
1316 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1317 do_sqdmull_d, do_sqadd_d)
1318
1319 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1320 do_sqdmull_h, DO_SQSUB_H)
1321 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1322 do_sqdmull_s, DO_SQSUB_S)
1323 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1324 do_sqdmull_d, do_sqsub_d)
1325
1326 #undef DO_SQDMLAL
1327
1328 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1329 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1330 { \
1331 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1332 int rot = simd_data(desc); \
1333 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1334 bool sub_r = rot == 1 || rot == 2; \
1335 bool sub_i = rot >= 2; \
1336 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1337 for (i = 0; i < opr_sz; i += 2) { \
1338 TYPE elt1_a = n[H(i + sel_a)]; \
1339 TYPE elt2_a = m[H(i + sel_a)]; \
1340 TYPE elt2_b = m[H(i + sel_b)]; \
1341 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1342 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1343 } \
1344 }
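/*
 * The two-bit rot field encodes rotations of 0, 90, 180 and 270 degrees:
 * sel_a/sel_b choose which half of each complex pair of M is used, and
 * sub_r/sub_i select whether the product added to the real and imaginary
 * parts respectively is negated.
 */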
1345
1346 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1347
1348 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1349 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1350 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1351 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1352
1353 #define DO_SQRDMLAH_B(N, M, A, S) \
1354 do_sqrdmlah_b(N, M, A, S, true)
1355 #define DO_SQRDMLAH_H(N, M, A, S) \
1356 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1357 #define DO_SQRDMLAH_S(N, M, A, S) \
1358 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1359 #define DO_SQRDMLAH_D(N, M, A, S) \
1360 do_sqrdmlah_d(N, M, A, S, true)
1361
1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1363 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1364 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1365 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1366
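/*
 * Indexed form: within each 128-bit segment the same complex pair of M,
 * chosen by the index, is applied to every complex pair of N.
 */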
1367 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1368 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1369 { \
1370 intptr_t i, j, oprsz = simd_oprsz(desc); \
1371 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1372 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1373 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1374 bool sub_r = rot == 1 || rot == 2; \
1375 bool sub_i = rot >= 2; \
1376 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1377 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1378 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1379 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1380 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1381 TYPE elt1_a = n[H(i + j + sel_a)]; \
1382 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1383 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1384 } \
1385 } \
1386 }
1387
1388 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1389 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1390
1391 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1392 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1393
1394 #undef DO_CMLA
1395 #undef DO_CMLA_FUNC
1396 #undef DO_CMLA_IDX_FUNC
1397 #undef DO_SQRDMLAH_B
1398 #undef DO_SQRDMLAH_H
1399 #undef DO_SQRDMLAH_S
1400 #undef DO_SQRDMLAH_D
1401
1402 /* Note N and M are 4 elements bundled into one unit. */
1403 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1404 int sel_a, int sel_b, int sub_i)
1405 {
1406 for (int i = 0; i <= 1; i++) {
1407 int32_t elt1_r = (int8_t)(n >> (16 * i));
1408 int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1409 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1410 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1411
1412 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1413 }
1414 return a;
1415 }
1416
1417 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1418 int sel_a, int sel_b, int sub_i)
1419 {
1420 for (int i = 0; i <= 1; i++) {
1421 int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1422 int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1423 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1424 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1425
1426 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1427 }
1428 return a;
1429 }
1430
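/*
 * For the complex dot product, rot selects the rotation applied to M:
 * sel_a/sel_b pick the real or imaginary half of each complex pair of M
 * and sub_i gives the sign of the imaginary product term.
 */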
1431 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1432 void *va, uint32_t desc)
1433 {
1434 int opr_sz = simd_oprsz(desc);
1435 int rot = simd_data(desc);
1436 int sel_a = rot & 1;
1437 int sel_b = sel_a ^ 1;
1438 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1439 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1440
1441 for (int e = 0; e < opr_sz / 4; e++) {
1442 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1443 }
1444 }
1445
1446 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1447 void *va, uint32_t desc)
1448 {
1449 int opr_sz = simd_oprsz(desc);
1450 int rot = simd_data(desc);
1451 int sel_a = rot & 1;
1452 int sel_b = sel_a ^ 1;
1453 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1454 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1455
1456 for (int e = 0; e < opr_sz / 8; e++) {
1457 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1458 }
1459 }
1460
1461 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1462 void *va, uint32_t desc)
1463 {
1464 int opr_sz = simd_oprsz(desc);
1465 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1466 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1467 int sel_a = rot & 1;
1468 int sel_b = sel_a ^ 1;
1469 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1470 uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1471
1472 for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1473 uint32_t seg_m = m[seg + idx];
1474 for (int e = 0; e < 4; e++) {
1475 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1476 sel_a, sel_b, sub_i);
1477 }
1478 }
1479 }
1480
1481 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1482 void *va, uint32_t desc)
1483 {
1484 int seg, opr_sz = simd_oprsz(desc);
1485 int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1486 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1487 int sel_a = rot & 1;
1488 int sel_b = sel_a ^ 1;
1489 int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1490 uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1491
1492 for (seg = 0; seg < opr_sz / 8; seg += 2) {
1493 uint64_t seg_m = m[seg + idx];
1494 for (int e = 0; e < 2; e++) {
1495 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1496 sel_a, sel_b, sub_i);
1497 }
1498 }
1499 }
1500
1501 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1502 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1503 { \
1504 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1505 intptr_t i, j, idx = simd_data(desc); \
1506 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1507 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1508 TYPE mm = m[i]; \
1509 for (j = 0; j < segment; j++) { \
1510 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1511 } \
1512 } \
1513 }
1514
1515 #define DO_SQRDMLAH_H(N, M, A) \
1516 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1517 #define DO_SQRDMLAH_S(N, M, A) \
1518 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1519 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1520
1521 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1522 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1523 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1524
1525 #define DO_SQRDMLSH_H(N, M, A) \
1526 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1527 #define DO_SQRDMLSH_S(N, M, A) \
1528 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1529 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1530
1531 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1532 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1533 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1534
1535 #undef DO_ZZXZ
1536
1537 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1538 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1539 { \
1540 intptr_t i, j, oprsz = simd_oprsz(desc); \
1541 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1542 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1543 for (i = 0; i < oprsz; i += 16) { \
1544 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1545 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1546 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1547 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1548 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1549 } \
1550 } \
1551 }
1552
1553 #define DO_MLA(N, M, A) (A + N * M)
1554
1555 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1556 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1557 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1558 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1559
1560 #define DO_MLS(N, M, A) (A - N * M)
1561
1562 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1563 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1564 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1565 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1566
1567 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1568 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1569
1570 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1571 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1572
1573 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1574 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1575
1576 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1577 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1578
1579 #undef DO_MLA
1580 #undef DO_MLS
1581 #undef DO_ZZXW
1582
1583 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1584 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1585 { \
1586 intptr_t i, j, oprsz = simd_oprsz(desc); \
1587 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1588 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1589 for (i = 0; i < oprsz; i += 16) { \
1590 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1591 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1592 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1593 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1594 } \
1595 } \
1596 }
1597
1598 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1599 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1600
1601 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1602 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1603
1604 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1605 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1606
1607 #undef DO_ZZX
1608
1609 #define DO_BITPERM(NAME, TYPE, OP) \
1610 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1611 { \
1612 intptr_t i, opr_sz = simd_oprsz(desc); \
1613 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1614 TYPE nn = *(TYPE *)(vn + i); \
1615 TYPE mm = *(TYPE *)(vm + i); \
1616 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1617 } \
1618 }
1619
1620 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1621 {
1622 uint64_t res = 0;
1623 int db, rb = 0;
1624
1625 for (db = 0; db < n; ++db) {
1626 if ((mask >> db) & 1) {
1627 res |= ((data >> db) & 1) << rb;
1628 ++rb;
1629 }
1630 }
1631 return res;
1632 }
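
/*
 * Illustrative sketch only, not used by the helpers: a hypothetical
 * self-check showing what bitextract() computes for concrete 8-bit
 * inputs (the bits of DATA selected by MASK, packed at the bottom).
 */
static inline void example_bitextract_check(void)
{
    /* 0xb5 = 1011 0101: mask 0x0f selects bits 0..3 (0101) -> 0x05. */
    assert(bitextract(0xb5, 0x0f, 8) == 0x05);
    /* mask 0xf0 selects bits 4..7 (1011), packed low -> 0x0b. */
    assert(bitextract(0xb5, 0xf0, 8) == 0x0b);
}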
1633
1634 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1635 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1636 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1637 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1638
1639 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1640 {
1641 uint64_t res = 0;
1642 int rb, db = 0;
1643
1644 for (rb = 0; rb < n; ++rb) {
1645 if ((mask >> rb) & 1) {
1646 res |= ((data >> db) & 1) << rb;
1647 ++db;
1648 }
1649 }
1650 return res;
1651 }
1652
1653 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1654 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1655 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1656 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1657
1658 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1659 {
1660 uint64_t resm = 0, resu = 0;
1661 int db, rbm = 0, rbu = 0;
1662
1663 for (db = 0; db < n; ++db) {
1664 uint64_t val = (data >> db) & 1;
1665 if ((mask >> db) & 1) {
1666 resm |= val << rbm++;
1667 } else {
1668 resu |= val << rbu++;
1669 }
1670 }
1671
1672 return resm | (resu << rbm);
1673 }
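
/*
 * Worked examples for the two permutations above, with 8-bit lanes:
 * bitdeposit(0x05, 0xf0, 8) scatters the low bits of DATA into the set
 * positions of MASK, giving 0x50; bitgroup(0xb5, 0xaa, 8) packs the
 * masked (odd-numbered) bits of DATA into the low half of the result
 * and the unmasked (even-numbered) bits above them, giving 0x7c.
 */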
1674
1675 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1676 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1677 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1678 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1679
1680 #undef DO_BITPERM
1681
1682 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1683 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1684 { \
1685 intptr_t i, opr_sz = simd_oprsz(desc); \
1686 int sub_r = simd_data(desc); \
1687 if (sub_r) { \
1688 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1689 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1690 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1691 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1692 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1693 acc_r = ADD_OP(acc_r, el2_i); \
1694 acc_i = SUB_OP(acc_i, el2_r); \
1695 *(TYPE *)(vd + H(i)) = acc_r; \
1696 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1697 } \
1698 } else { \
1699 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1700 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1701 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1702 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1703 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1704 acc_r = SUB_OP(acc_r, el2_i); \
1705 acc_i = ADD_OP(acc_i, el2_r); \
1706 *(TYPE *)(vd + H(i)) = acc_r; \
1707 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1708 } \
1709 } \
1710 }
1711
1712 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1713 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1714 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1715 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1716
1717 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1718 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1719 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1720 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1721
1722 #undef DO_CADD
1723
1724 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1725 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1726 { \
1727 intptr_t i, opr_sz = simd_oprsz(desc); \
1728 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1729 int shift = simd_data(desc) >> 1; \
1730 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1731 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1732 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1733 } \
1734 }
1735
1736 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1737 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1738 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1739
1740 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1741 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1742 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1743
1744 #undef DO_ZZI_SHLL
1745
1746 /* Two-operand reduction expander, controlled by a predicate.
1747 * The difference between TYPERED and TYPERET has to do with
1748 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1749 * but TYPERET must be unsigned so that e.g. a 32-bit value
1750 * is not sign-extended to the ABI uint64_t return type.
1751 */
1752 /* ??? If we were to vectorize this by hand the reduction ordering
1753 * would change. For integer operands, this is perfectly fine.
1754 */
1755 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1756 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1757 { \
1758 intptr_t i, opr_sz = simd_oprsz(desc); \
1759 TYPERED ret = INIT; \
1760 for (i = 0; i < opr_sz; ) { \
1761 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1762 do { \
1763 if (pg & 1) { \
1764 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1765 ret = OP(ret, nn); \
1766 } \
1767 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1768 } while (i & 15); \
1769 } \
1770 return (TYPERET)ret; \
1771 }
1772
1773 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1774 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1775 { \
1776 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1777 TYPEE *n = vn; \
1778 uint8_t *pg = vg; \
1779 TYPER ret = INIT; \
1780 for (i = 0; i < opr_sz; i += 1) { \
1781 if (pg[H1(i)] & 1) { \
1782 TYPEE nn = n[i]; \
1783 ret = OP(ret, nn); \
1784 } \
1785 } \
1786 return ret; \
1787 }
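
/*
 * As an example of the TYPERED/TYPERET distinction described above,
 * sve_smaxv_s reduces in int32_t so that DO_MAX compares signed values,
 * but returns through uint32_t so that a result such as 0x80000000 is
 * zero-extended rather than sign-extended into the uint64_t ABI slot.
 */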
1788
1789 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1790 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1791 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1792 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1793
1794 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1795 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1796 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1797 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1798
1799 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1800 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1801 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1802 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1803
1804 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1805 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1806 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1807
1808 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1809 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1810 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1811 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1812
1813 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1814 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1815 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1816 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1817
1818 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1819 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1820 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1821 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1822
1823 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1824 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1825 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1826 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1827
1828 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1829 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1830 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1831 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1832
1833 #undef DO_VPZ
1834 #undef DO_VPZ_D
1835
1836 /* Two vector operand, one scalar operand, unpredicated. */
1837 #define DO_ZZI(NAME, TYPE, OP) \
1838 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1839 { \
1840 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1841 TYPE s = s64, *d = vd, *n = vn; \
1842 for (i = 0; i < opr_sz; ++i) { \
1843 d[i] = OP(n[i], s); \
1844 } \
1845 }
1846
1847 #define DO_SUBR(X, Y) (Y - X)
1848
1849 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1850 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1851 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1852 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1853
1854 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1855 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1856 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1857 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1858
1859 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1860 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1861 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1862 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1863
1864 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1865 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1866 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1867 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1868
1869 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1870 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1871 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1872 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1873
1874 #undef DO_ZZI
1875
1876 #undef DO_AND
1877 #undef DO_ORR
1878 #undef DO_EOR
1879 #undef DO_BIC
1880 #undef DO_ADD
1881 #undef DO_SUB
1882 #undef DO_MAX
1883 #undef DO_MIN
1884 #undef DO_ABD
1885 #undef DO_MUL
1886 #undef DO_DIV
1887 #undef DO_ASR
1888 #undef DO_LSR
1889 #undef DO_LSL
1890 #undef DO_SUBR
1891
1892 /* Similar to the ARM LastActiveElement pseudocode function, except the
1893 result is multiplied by the element size. This includes the not found
1894 indication; e.g. not found for esz=3 is -8. */
1895 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1896 {
1897 uint64_t mask = pred_esz_masks[esz];
1898 intptr_t i = words;
1899
1900 do {
1901 uint64_t this_g = g[--i] & mask;
1902 if (this_g) {
1903 return i * 64 + (63 - clz64(this_g));
1904 }
1905 } while (i > 0);
1906 return (intptr_t)-1 << esz;
1907 }
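
/*
 * Example, assuming pred_esz_masks[2] selects every fourth bit: with
 * esz=2 (32-bit elements), words=1 and g[0] = 0x11, the last active
 * element is element 1, so the return value is 1 * 4 = 4; with no bits
 * set under the mask the "not found" result is -1 << 2 = -4.
 */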
1908
1909 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1910 {
1911 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1912 uint32_t flags = PREDTEST_INIT;
1913 uint64_t *d = vd, *g = vg;
1914 intptr_t i = 0;
1915
1916 do {
1917 uint64_t this_d = d[i];
1918 uint64_t this_g = g[i];
1919
1920 if (this_g) {
1921 if (!(flags & 4)) {
1922 /* Set in D the first bit of G. */
1923 this_d |= this_g & -this_g;
1924 d[i] = this_d;
1925 }
1926 flags = iter_predtest_fwd(this_d, this_g, flags);
1927 }
1928 } while (++i < words);
1929
1930 return flags;
1931 }
1932
1933 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1934 {
1935 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1936 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1937 uint32_t flags = PREDTEST_INIT;
1938 uint64_t *d = vd, *g = vg, esz_mask;
1939 intptr_t i, next;
1940
1941 next = last_active_element(vd, words, esz) + (1 << esz);
1942 esz_mask = pred_esz_masks[esz];
1943
1944 /* Similar to the pseudocode for pnext, but scaled by ESZ
1945 so that we find the correct bit. */
1946 if (next < words * 64) {
1947 uint64_t mask = -1;
1948
1949 if (next & 63) {
1950 mask = ~((1ull << (next & 63)) - 1);
1951 next &= -64;
1952 }
1953 do {
1954 uint64_t this_g = g[next / 64] & esz_mask & mask;
1955 if (this_g != 0) {
1956 next = (next & -64) + ctz64(this_g);
1957 break;
1958 }
1959 next += 64;
1960 mask = -1;
1961 } while (next < words * 64);
1962 }
1963
1964 i = 0;
1965 do {
1966 uint64_t this_d = 0;
1967 if (i == next / 64) {
1968 this_d = 1ull << (next & 63);
1969 }
1970 d[i] = this_d;
1971 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1972 } while (++i < words);
1973
1974 return flags;
1975 }
1976
1977 /*
1978 * Copy Zn into Zd, and store zero into inactive elements.
1979 * If inv, store zeros into the active elements.
1980 */
1981 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1982 {
1983 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985 uint64_t *d = vd, *n = vn;
1986 uint8_t *pg = vg;
1987
1988 for (i = 0; i < opr_sz; i += 1) {
1989 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1990 }
1991 }
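
/*
 * For example, a predicate byte of 0x05 (elements 0 and 2 active)
 * expands to the mask 0x0000000000ff00ff, so with inv = 0 only bytes
 * 0 and 2 of this group survive, and with inv = -1 the mask is
 * complemented and exactly those bytes are zeroed instead.
 */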
1992
1993 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1994 {
1995 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1997 uint64_t *d = vd, *n = vn;
1998 uint8_t *pg = vg;
1999
2000 for (i = 0; i < opr_sz; i += 1) {
2001 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2002 }
2003 }
2004
2005 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2006 {
2007 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2008 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2009 uint64_t *d = vd, *n = vn;
2010 uint8_t *pg = vg;
2011
2012 for (i = 0; i < opr_sz; i += 1) {
2013 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2014 }
2015 }
2016
2017 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2018 {
2019 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2020 uint64_t *d = vd, *n = vn;
2021 uint8_t *pg = vg;
2022 uint8_t inv = simd_data(desc);
2023
2024 for (i = 0; i < opr_sz; i += 1) {
2025 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2026 }
2027 }
2028
2029 /* Three-operand expander, immediate operand, controlled by a predicate.
2030 */
2031 #define DO_ZPZI(NAME, TYPE, H, OP) \
2032 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2033 { \
2034 intptr_t i, opr_sz = simd_oprsz(desc); \
2035 TYPE imm = simd_data(desc); \
2036 for (i = 0; i < opr_sz; ) { \
2037 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2038 do { \
2039 if (pg & 1) { \
2040 TYPE nn = *(TYPE *)(vn + H(i)); \
2041 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
2042 } \
2043 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2044 } while (i & 15); \
2045 } \
2046 }
2047
2048 /* Similarly, specialized for 64-bit operands. */
2049 #define DO_ZPZI_D(NAME, TYPE, OP) \
2050 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2051 { \
2052 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2053 TYPE *d = vd, *n = vn; \
2054 TYPE imm = simd_data(desc); \
2055 uint8_t *pg = vg; \
2056 for (i = 0; i < opr_sz; i += 1) { \
2057 if (pg[H1(i)] & 1) { \
2058 TYPE nn = n[i]; \
2059 d[i] = OP(nn, imm); \
2060 } \
2061 } \
2062 }
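
/*
 * Minimal standalone sketch (hypothetical, little-endian layout only,
 * ignoring the H() index frobbing) of the predicate walk generated by
 * DO_ZPZI for 32-bit elements: 16 predicate bits cover one 16-byte
 * segment, and shifting the predicate right by sizeof(TYPE) moves to
 * the bit that governs the next element.
 */
static inline void example_zpzi_walk_s(uint32_t *d, const uint32_t *n,
                                       const uint16_t *pred,
                                       intptr_t opr_sz, unsigned imm)
{
    for (intptr_t i = 0; i < opr_sz; ) {
        uint16_t pg = pred[i >> 4];       /* 16 predicate bits per segment */
        do {
            if (pg & 1) {                 /* low bit governs this element */
                d[i / 4] = n[i / 4] >> imm;   /* stand-in for OP(nn, imm) */
            }
            i += 4, pg >>= 4;             /* advance one 32-bit element */
        } while (i & 15);
    }
}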
2063
2064 #define DO_SHR(N, M) (N >> M)
2065 #define DO_SHL(N, M) (N << M)
2066
2067 /* Arithmetic shift right for division. This rounds negative numbers
2068 toward zero as per signed division. Therefore before shifting,
2069 when N is negative, add 2**M-1. */
2070 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
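
/*
 * Example: DO_ASRD(-5, 2) first adds 2**2 - 1 = 3, giving -2, then
 * shifts to -1, which is -5 / 4 truncated toward zero; a plain
 * arithmetic shift of -5 by 2 would have produced -2 (rounding toward
 * minus infinity).
 */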
2071
2072 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2073 {
2074 if (likely(sh < 64)) {
2075 return (x >> sh) + ((x >> (sh - 1)) & 1);
2076 } else if (sh == 64) {
2077 return x >> 63;
2078 } else {
2079 return 0;
2080 }
2081 }
2082
2083 static inline int64_t do_srshr(int64_t x, unsigned sh)
2084 {
2085 if (likely(sh < 64)) {
2086 return (x >> sh) + ((x >> (sh - 1)) & 1);
2087 } else {
2088 /* Rounding the sign bit always produces 0. */
2089 return 0;
2090 }
2091 }
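
/*
 * Example of the rounding shifts above: do_urshr(7, 2) computes
 * (7 >> 2) + ((7 >> 1) & 1) = 1 + 1 = 2, i.e. 7/4 rounded to nearest
 * with the halfway case rounded up; do_srshr(-7, 2) likewise yields
 * -2 + 0 = -2.
 */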
2092
2093 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2094 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2095 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2096 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2097
2098 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2099 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2100 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2101 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2102
2103 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2104 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2105 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2106 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2107
2108 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2109 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2110 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2111 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2112
2113 /* SVE2 bitwise shift by immediate */
2114 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2115 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2116 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2117 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2118
2119 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2120 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2121 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2122 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2123
2124 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2125 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2126 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2127 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2128
2129 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2130 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2131 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2132 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2133
2134 #define do_suqrshl_b(n, m) \
2135 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2136 #define do_suqrshl_h(n, m) \
2137 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2138 #define do_suqrshl_s(n, m) \
2139 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2140 #define do_suqrshl_d(n, m) \
2141 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2142
2143 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2144 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2145 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2146 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2147
2148 #undef DO_ASRD
2149 #undef DO_ZPZI
2150 #undef DO_ZPZI_D
2151
2152 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2153 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2154 { \
2155 intptr_t i, opr_sz = simd_oprsz(desc); \
2156 int shift = simd_data(desc); \
2157 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2158 TYPEW nn = *(TYPEW *)(vn + i); \
2159 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2160 } \
2161 }
2162
2163 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2165 { \
2166 intptr_t i, opr_sz = simd_oprsz(desc); \
2167 int shift = simd_data(desc); \
2168 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2169 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2170 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2171 } \
2172 }
2173
2174 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2175 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2176 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2177
2178 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2179 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2180 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2181
2182 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2183 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2184 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2185
2186 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2187 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2188 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2189
2190 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2191 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2192 #define DO_SQSHRUN_D(x, sh) \
2193 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2194
2195 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2196 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2197 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2198
2199 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2200 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2201 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2202
2203 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2204 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2205 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2206
2207 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2208 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2209 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2210
2211 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2212 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2213 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2214
2215 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2216 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2217 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2218
2219 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2220 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2221 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2222
2223 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2224 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2225 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2226
2227 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2228 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2229 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2230
2231 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2232 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2233 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2234
2235 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2236 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2237 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2238
2239 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2240 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2241 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2242
2243 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2244 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2245 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2246
2247 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2248 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2249 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2250
2251 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2252 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2253 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2254
2255 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2256 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2257 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2258
2259 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2260 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2261 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2262
2263 #undef DO_SHRNB
2264 #undef DO_SHRNT
2265
2266 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2267 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2268 { \
2269 intptr_t i, opr_sz = simd_oprsz(desc); \
2270 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2271 TYPEW nn = *(TYPEW *)(vn + i); \
2272 TYPEW mm = *(TYPEW *)(vm + i); \
2273 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2274 } \
2275 }
2276
2277 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2278 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2279 { \
2280 intptr_t i, opr_sz = simd_oprsz(desc); \
2281 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2282 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2283 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2284 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2285 } \
2286 }
2287
2288 #define DO_ADDHN(N, M, SH) ((N + M) >> SH)
2289 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2290 #define DO_SUBHN(N, M, SH) ((N - M) >> SH)
2291 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
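
/*
 * Example with 16-bit inputs narrowing to 8 bits (SHIFT = 8):
 * DO_ADDHN(0x1280, 0x0100, 8) = 0x1380 >> 8 = 0x13, whereas
 * DO_RADDHN on the same inputs adds the rounding constant 0x80 first
 * and returns (0x1380 + 0x80) >> 8 = 0x14.
 */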
2292
2293 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2294 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2295 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2296
2297 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2298 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2299 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2300
2301 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2302 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2303 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2304
2305 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2306 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2307 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2308
2309 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2310 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2311 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2312
2313 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2314 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2315 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2316
2317 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2318 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2319 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2320
2321 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2322 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2323 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2324
2325 #undef DO_RSUBHN
2326 #undef DO_SUBHN
2327 #undef DO_RADDHN
2328 #undef DO_ADDHN
2329
2330 #undef DO_BINOPNB
2331
2332 /* Fully general four-operand expander, controlled by a predicate.
2333 */
2334 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2335 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2336 void *vg, uint32_t desc) \
2337 { \
2338 intptr_t i, opr_sz = simd_oprsz(desc); \
2339 for (i = 0; i < opr_sz; ) { \
2340 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2341 do { \
2342 if (pg & 1) { \
2343 TYPE nn = *(TYPE *)(vn + H(i)); \
2344 TYPE mm = *(TYPE *)(vm + H(i)); \
2345 TYPE aa = *(TYPE *)(va + H(i)); \
2346 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2347 } \
2348 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2349 } while (i & 15); \
2350 } \
2351 }
2352
2353 /* Similarly, specialized for 64-bit operands. */
2354 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2355 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2356 void *vg, uint32_t desc) \
2357 { \
2358 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2359 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2360 uint8_t *pg = vg; \
2361 for (i = 0; i < opr_sz; i += 1) { \
2362 if (pg[H1(i)] & 1) { \
2363 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2364 d[i] = OP(aa, nn, mm); \
2365 } \
2366 } \
2367 }
2368
2369 #define DO_MLA(A, N, M) (A + N * M)
2370 #define DO_MLS(A, N, M) (A - N * M)
2371
2372 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2373 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2374
2375 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2376 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2377
2378 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2379 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2380
2381 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2382 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2383
2384 #undef DO_MLA
2385 #undef DO_MLS
2386 #undef DO_ZPZZZ
2387 #undef DO_ZPZZZ_D
2388
2389 void HELPER(sve_index_b)(void *vd, uint32_t start,
2390 uint32_t incr, uint32_t desc)
2391 {
2392 intptr_t i, opr_sz = simd_oprsz(desc);
2393 uint8_t *d = vd;
2394 for (i = 0; i < opr_sz; i += 1) {
2395 d[H1(i)] = start + i * incr;
2396 }
2397 }
2398
2399 void HELPER(sve_index_h)(void *vd, uint32_t start,
2400 uint32_t incr, uint32_t desc)
2401 {
2402 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2403 uint16_t *d = vd;
2404 for (i = 0; i < opr_sz; i += 1) {
2405 d[H2(i)] = start + i * incr;
2406 }
2407 }
2408
2409 void HELPER(sve_index_s)(void *vd, uint32_t start,
2410 uint32_t incr, uint32_t desc)
2411 {
2412 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2413 uint32_t *d = vd;
2414 for (i = 0; i < opr_sz; i += 1) {
2415 d[H4(i)] = start + i * incr;
2416 }
2417 }
2418
2419 void HELPER(sve_index_d)(void *vd, uint64_t start,
2420 uint64_t incr, uint32_t desc)
2421 {
2422 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2423 uint64_t *d = vd;
2424 for (i = 0; i < opr_sz; i += 1) {
2425 d[i] = start + i * incr;
2426 }
2427 }
2428
2429 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2430 {
2431 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2432 uint32_t sh = simd_data(desc);
2433 uint32_t *d = vd, *n = vn, *m = vm;
2434 for (i = 0; i < opr_sz; i += 1) {
2435 d[i] = n[i] + (m[i] << sh);
2436 }
2437 }
2438
2439 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2440 {
2441 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2442 uint64_t sh = simd_data(desc);
2443 uint64_t *d = vd, *n = vn, *m = vm;
2444 for (i = 0; i < opr_sz; i += 1) {
2445 d[i] = n[i] + (m[i] << sh);
2446 }
2447 }
2448
2449 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2450 {
2451 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2452 uint64_t sh = simd_data(desc);
2453 uint64_t *d = vd, *n = vn, *m = vm;
2454 for (i = 0; i < opr_sz; i += 1) {
2455 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2456 }
2457 }
2458
2459 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2460 {
2461 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2462 uint64_t sh = simd_data(desc);
2463 uint64_t *d = vd, *n = vn, *m = vm;
2464 for (i = 0; i < opr_sz; i += 1) {
2465 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2466 }
2467 }
2468
2469 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2470 {
2471 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2472 static const uint16_t coeff[] = {
2473 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2474 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2475 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2476 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2477 };
2478 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2479 uint16_t *d = vd, *n = vn;
2480
2481 for (i = 0; i < opr_sz; i++) {
2482 uint16_t nn = n[i];
2483 intptr_t idx = extract32(nn, 0, 5);
2484 uint16_t exp = extract32(nn, 5, 5);
2485 d[i] = coeff[idx] | (exp << 10);
2486 }
2487 }
2488
2489 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2490 {
2491 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2492 static const uint32_t coeff[] = {
2493 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2494 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2495 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2496 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2497 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2498 0x1ef532, 0x20b051, 0x227043, 0x243516,
2499 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2500 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2501 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2502 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2503 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2504 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2505 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2506 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2507 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2508 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2509 };
2510 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2511 uint32_t *d = vd, *n = vn;
2512
2513 for (i = 0; i < opr_sz; i++) {
2514 uint32_t nn = n[i];
2515 intptr_t idx = extract32(nn, 0, 6);
2516 uint32_t exp = extract32(nn, 6, 8);
2517 d[i] = coeff[idx] | (exp << 23);
2518 }
2519 }
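
/*
 * Example for the single-precision form above: an input with index 0
 * and exponent field 127 yields coeff[0] | (127 << 23) = 0x3f800000,
 * i.e. 1.0f, while index 32 yields 0x3f800000 + 0x3504f3, the
 * single-precision encoding of 2**0.5, so the 64 table entries step
 * through the fractional powers 2**(idx/64) within one binade.
 */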
2520
2521 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2522 {
2523 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2524 static const uint64_t coeff[] = {
2525 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2526 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2527 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2528 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2529 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2530 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2531 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2532 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2533 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2534 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2535 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2536 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2537 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2538 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2539 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2540 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2541 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2542 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2543 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2544 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2545 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2546 0xFA7C1819E90D8ull,
2547 };
2548 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2549 uint64_t *d = vd, *n = vn;
2550
2551 for (i = 0; i < opr_sz; i++) {
2552 uint64_t nn = n[i];
2553 intptr_t idx = extract32(nn, 0, 6);
2554 uint64_t exp = extract32(nn, 6, 11);
2555 d[i] = coeff[idx] | (exp << 52);
2556 }
2557 }
2558
2559 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2560 {
2561 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2562 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2563 uint16_t *d = vd, *n = vn, *m = vm;
2564 for (i = 0; i < opr_sz; i += 1) {
2565 uint16_t nn = n[i];
2566 uint16_t mm = m[i];
2567 if (mm & 1) {
2568 nn = float16_one;
2569 }
2570 if (mm & 2) {
2571 nn = float16_maybe_ah_chs(nn, fpcr_ah);
2572 }
2573 d[i] = nn;
2574 }
2575 }
2576
2577 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2578 {
2579 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2580 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2581 uint32_t *d = vd, *n = vn, *m = vm;
2582 for (i = 0; i < opr_sz; i += 1) {
2583 uint32_t nn = n[i];
2584 uint32_t mm = m[i];
2585 if (mm & 1) {
2586 nn = float32_one;
2587 }
2588 if (mm & 2) {
2589 nn = float32_maybe_ah_chs(nn, fpcr_ah);
2590 }
2591 d[i] = nn;
2592 }
2593 }
2594
2595 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2596 {
2597 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2598 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2599 uint64_t *d = vd, *n = vn, *m = vm;
2600 for (i = 0; i < opr_sz; i += 1) {
2601 uint64_t nn = n[i];
2602 uint64_t mm = m[i];
2603 if (mm & 1) {
2604 nn = float64_one;
2605 }
2606 if (mm & 2) {
2607 nn = float64_maybe_ah_chs(nn, fpcr_ah);
2608 }
2609 d[i] = nn;
2610 }
2611 }
2612
2613 /*
2614 * Signed saturating addition with scalar operand.
2615 */
2616
2617 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2618 {
2619 intptr_t i, oprsz = simd_oprsz(desc);
2620
2621 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2622 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2623 }
2624 }
2625
2626 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2627 {
2628 intptr_t i, oprsz = simd_oprsz(desc);
2629
2630 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2631 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2632 }
2633 }
2634
2635 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2636 {
2637 intptr_t i, oprsz = simd_oprsz(desc);
2638
2639 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2640 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2641 }
2642 }
2643
2644 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2645 {
2646 intptr_t i, oprsz = simd_oprsz(desc);
2647
2648 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2649 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2650 }
2651 }
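
/*
 * Hypothetical standalone sketch of the 64-bit signed saturating add
 * that the helpers above rely on (the real do_sqadd_d used above is
 * defined with the other saturating-arithmetic helpers): detect
 * overflow from the operand signs versus the sign of the wrapped
 * result and clamp to the int64_t range.
 */
static inline int64_t example_sqadd_d(int64_t a, int64_t b)
{
    int64_t r = (int64_t)((uint64_t)a + (uint64_t)b);   /* wrapping add */
    if (((a ^ r) & (b ^ r)) < 0) {
        /* Both operands disagree in sign with the result: overflow. */
        r = (r < 0 ? INT64_MAX : INT64_MIN);
    }
    return r;
}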
2652
2653 /*
2654 * Unsigned saturating addition with scalar operand.
2655 */
2656
2657 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2658 {
2659 intptr_t i, oprsz = simd_oprsz(desc);
2660
2661 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2662 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2663 }
2664 }
2665
2666 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2667 {
2668 intptr_t i, oprsz = simd_oprsz(desc);
2669
2670 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2671 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2672 }
2673 }
2674
2675 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2676 {
2677 intptr_t i, oprsz = simd_oprsz(desc);
2678
2679 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2680 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2681 }
2682 }
2683
2684 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2685 {
2686 intptr_t i, oprsz = simd_oprsz(desc);
2687
2688 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2689 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2690 }
2691 }
2692
2693 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2694 {
2695 intptr_t i, oprsz = simd_oprsz(desc);
2696
2697 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2698 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2699 }
2700 }
2701
2702 /* Two operand predicated copy immediate with merge. All valid immediates
2703 * can fit within 17 signed bits in the simd_data field.
2704 */
2705 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2706 uint64_t mm, uint32_t desc)
2707 {
2708 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2709 uint64_t *d = vd, *n = vn;
2710 uint8_t *pg = vg;
2711
2712 mm = dup_const(MO_8, mm);
2713 for (i = 0; i < opr_sz; i += 1) {
2714 uint64_t nn = n[i];
2715 uint64_t pp = expand_pred_b(pg[H1(i)]);
2716 d[i] = (mm & pp) | (nn & ~pp);
2717 }
2718 }
2719
2720 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2721 uint64_t mm, uint32_t desc)
2722 {
2723 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2724 uint64_t *d = vd, *n = vn;
2725 uint8_t *pg = vg;
2726
2727 mm = dup_const(MO_16, mm);
2728 for (i = 0; i < opr_sz; i += 1) {
2729 uint64_t nn = n[i];
2730 uint64_t pp = expand_pred_h(pg[H1(i)]);
2731 d[i] = (mm & pp) | (nn & ~pp);
2732 }
2733 }
2734
2735 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2736 uint64_t mm, uint32_t desc)
2737 {
2738 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2739 uint64_t *d = vd, *n = vn;
2740 uint8_t *pg = vg;
2741
2742 mm = dup_const(MO_32, mm);
2743 for (i = 0; i < opr_sz; i += 1) {
2744 uint64_t nn = n[i];
2745 uint64_t pp = expand_pred_s(pg[H1(i)]);
2746 d[i] = (mm & pp) | (nn & ~pp);
2747 }
2748 }
2749
2750 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2751 uint64_t mm, uint32_t desc)
2752 {
2753 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754 uint64_t *d = vd, *n = vn;
2755 uint8_t *pg = vg;
2756
2757 for (i = 0; i < opr_sz; i += 1) {
2758 uint64_t nn = n[i];
2759 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2760 }
2761 }
2762
2763 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764 {
2765 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766 uint64_t *d = vd;
2767 uint8_t *pg = vg;
2768
2769 val = dup_const(MO_8, val);
2770 for (i = 0; i < opr_sz; i += 1) {
2771 d[i] = val & expand_pred_b(pg[H1(i)]);
2772 }
2773 }
2774
2775 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2776 {
2777 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2778 uint64_t *d = vd;
2779 uint8_t *pg = vg;
2780
2781 val = dup_const(MO_16, val);
2782 for (i = 0; i < opr_sz; i += 1) {
2783 d[i] = val & expand_pred_h(pg[H1(i)]);
2784 }
2785 }
2786
2787 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2788 {
2789 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2790 uint64_t *d = vd;
2791 uint8_t *pg = vg;
2792
2793 val = dup_const(MO_32, val);
2794 for (i = 0; i < opr_sz; i += 1) {
2795 d[i] = val & expand_pred_s(pg[H1(i)]);
2796 }
2797 }
2798
2799 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2800 {
2801 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2802 uint64_t *d = vd;
2803 uint8_t *pg = vg;
2804
2805 for (i = 0; i < opr_sz; i += 1) {
2806 d[i] = (pg[H1(i)] & 1 ? val : 0);
2807 }
2808 }
2809
2810 /* Big-endian hosts need to frob the byte indices. If the copy
2811 * happens to be 8-byte aligned, then no frobbing necessary.
2812 */
2813 static void swap_memmove(void *vd, void *vs, size_t n)
2814 {
2815 uintptr_t d = (uintptr_t)vd;
2816 uintptr_t s = (uintptr_t)vs;
2817 uintptr_t o = (d | s | n) & 7;
2818 size_t i;
2819
2820 #if !HOST_BIG_ENDIAN
2821 o = 0;
2822 #endif
2823 switch (o) {
2824 case 0:
2825 memmove(vd, vs, n);
2826 break;
2827
2828 case 4:
2829 if (d < s || d >= s + n) {
2830 for (i = 0; i < n; i += 4) {
2831 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2832 }
2833 } else {
2834 for (i = n; i > 0; ) {
2835 i -= 4;
2836 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2837 }
2838 }
2839 break;
2840
2841 case 2:
2842 case 6:
2843 if (d < s || d >= s + n) {
2844 for (i = 0; i < n; i += 2) {
2845 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2846 }
2847 } else {
2848 for (i = n; i > 0; ) {
2849 i -= 2;
2850 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2851 }
2852 }
2853 break;
2854
2855 default:
2856 if (d < s || d >= s + n) {
2857 for (i = 0; i < n; i++) {
2858 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2859 }
2860 } else {
2861 for (i = n; i > 0; ) {
2862 i -= 1;
2863 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2864 }
2865 }
2866 break;
2867 }
2868 }
2869
2870 /* Similarly for memset of 0. */
2871 static void swap_memzero(void *vd, size_t n)
2872 {
2873 uintptr_t d = (uintptr_t)vd;
2874 uintptr_t o = (d | n) & 7;
2875 size_t i;
2876
2877 /* Usually, the first bit of a predicate is set, so N is 0. */
2878 if (likely(n == 0)) {
2879 return;
2880 }
2881
2882 #if !HOST_BIG_ENDIAN
2883 o = 0;
2884 #endif
2885 switch (o) {
2886 case 0:
2887 memset(vd, 0, n);
2888 break;
2889
2890 case 4:
2891 for (i = 0; i < n; i += 4) {
2892 *(uint32_t *)H1_4(d + i) = 0;
2893 }
2894 break;
2895
2896 case 2:
2897 case 6:
2898 for (i = 0; i < n; i += 2) {
2899 *(uint16_t *)H1_2(d + i) = 0;
2900 }
2901 break;
2902
2903 default:
2904 for (i = 0; i < n; i++) {
2905 *(uint8_t *)H1(d + i) = 0;
2906 }
2907 break;
2908 }
2909 }
2910
2911 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2912 {
2913 intptr_t opr_sz = simd_oprsz(desc);
2914 size_t n_ofs = simd_data(desc);
2915 size_t n_siz = opr_sz - n_ofs;
2916
2917 if (vd != vm) {
2918 swap_memmove(vd, vn + n_ofs, n_siz);
2919 swap_memmove(vd + n_siz, vm, n_ofs);
2920 } else if (vd != vn) {
2921 swap_memmove(vd + n_siz, vd, n_ofs);
2922 swap_memmove(vd, vn + n_ofs, n_siz);
2923 } else {
2924 /* vd == vn == vm. Need temp space. */
2925 ARMVectorReg tmp;
2926 swap_memmove(&tmp, vm, n_ofs);
2927 swap_memmove(vd, vd + n_ofs, n_siz);
2928 memcpy(vd + n_siz, &tmp, n_ofs);
2929 }
2930 }
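
/*
 * Example: with a 16-byte vector and n_ofs = 3, the result is bytes
 * 3..15 of Zn followed by bytes 0..2 of Zm, i.e. the EXT
 * concatenate-then-extract behaviour starting at the immediate offset.
 */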
2931
2932 #define DO_INSR(NAME, TYPE, H) \
2933 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2934 { \
2935 intptr_t opr_sz = simd_oprsz(desc); \
2936 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2937 *(TYPE *)(vd + H(0)) = val; \
2938 }
2939
2940 DO_INSR(sve_insr_b, uint8_t, H1)
2941 DO_INSR(sve_insr_h, uint16_t, H1_2)
2942 DO_INSR(sve_insr_s, uint32_t, H1_4)
2943 DO_INSR(sve_insr_d, uint64_t, H1_8)
2944
2945 #undef DO_INSR
2946
2947 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2948 {
2949 intptr_t i, j, opr_sz = simd_oprsz(desc);
2950 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2951 uint64_t f = *(uint64_t *)(vn + i);
2952 uint64_t b = *(uint64_t *)(vn + j);
2953 *(uint64_t *)(vd + i) = bswap64(b);
2954 *(uint64_t *)(vd + j) = bswap64(f);
2955 }
2956 }
2957
2958 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2959 {
2960 intptr_t i, j, opr_sz = simd_oprsz(desc);
2961 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2962 uint64_t f = *(uint64_t *)(vn + i);
2963 uint64_t b = *(uint64_t *)(vn + j);
2964 *(uint64_t *)(vd + i) = hswap64(b);
2965 *(uint64_t *)(vd + j) = hswap64(f);
2966 }
2967 }
2968
2969 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2970 {
2971 intptr_t i, j, opr_sz = simd_oprsz(desc);
2972 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2973 uint64_t f = *(uint64_t *)(vn + i);
2974 uint64_t b = *(uint64_t *)(vn + j);
2975 *(uint64_t *)(vd + i) = rol64(b, 32);
2976 *(uint64_t *)(vd + j) = rol64(f, 32);
2977 }
2978 }
2979
2980 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2981 {
2982 intptr_t i, j, opr_sz = simd_oprsz(desc);
2983 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2984 uint64_t f = *(uint64_t *)(vn + i);
2985 uint64_t b = *(uint64_t *)(vn + j);
2986 *(uint64_t *)(vd + i) = b;
2987 *(uint64_t *)(vd + j) = f;
2988 }
2989 }
2990
2991 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2992
2993 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2994 bool is_tbx, tb_impl_fn *fn)
2995 {
2996 ARMVectorReg scratch;
2997 uintptr_t oprsz = simd_oprsz(desc);
2998
2999 if (unlikely(vd == vn)) {
3000 vn = memcpy(&scratch, vn, oprsz);
3001 }
3002
3003 fn(vd, vn, NULL, vm, oprsz, is_tbx);
3004 }
3005
3006 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3007 uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3008 {
3009 ARMVectorReg scratch;
3010 uintptr_t oprsz = simd_oprsz(desc);
3011
3012 if (unlikely(vd == vn0)) {
3013 vn0 = memcpy(&scratch, vn0, oprsz);
3014 if (vd == vn1) {
3015 vn1 = vn0;
3016 }
3017 } else if (unlikely(vd == vn1)) {
3018 vn1 = memcpy(&scratch, vn1, oprsz);
3019 }
3020
3021 fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3022 }
3023
3024 #define DO_TB(SUFF, TYPE, H) \
3025 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3026 void *vm, uintptr_t oprsz, bool is_tbx) \
3027 { \
3028 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3029 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3030 for (i = 0; i < nelem; ++i) { \
3031 TYPE index = indexes[H1(i)], val = 0; \
3032 if (index < nelem) { \
3033 val = tbl0[H(index)]; \
3034 } else { \
3035 index -= nelem; \
3036 if (tbl1 && index < nelem) { \
3037 val = tbl1[H(index)]; \
3038 } else if (is_tbx) { \
3039 continue; \
3040 } \
3041 } \
3042 d[H(i)] = val; \
3043 } \
3044 } \
3045 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3046 { \
3047 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3048 } \
3049 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3050 void *vm, uint32_t desc) \
3051 { \
3052 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3053 } \
3054 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3055 { \
3056 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3057 }
3058
3059 DO_TB(b, uint8_t, H1)
3060 DO_TB(h, uint16_t, H2)
3061 DO_TB(s, uint32_t, H4)
3062 DO_TB(d, uint64_t, H8)
3063
3064 #undef DO_TB
3065
3066 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3067 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3068 { \
3069 intptr_t i, opr_sz = simd_oprsz(desc); \
3070 TYPED *d = vd; \
3071 TYPES *n = vn; \
3072 ARMVectorReg tmp; \
3073 if (unlikely(vn - vd < opr_sz)) { \
3074 n = memcpy(&tmp, n, opr_sz / 2); \
3075 } \
3076 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3077 d[HD(i)] = n[HS(i)]; \
3078 } \
3079 }
3080
3081 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3082 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3083 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3084
3085 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3086 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3087 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3088
3089 #undef DO_UNPK
3090
3091 /* Mask of bits included in the even numbered predicates of width esz.
3092 * We also use this for expand_bits/compress_bits, and so extend the
3093 * same pattern out to 16-bit units.
3094 */
3095 static const uint64_t even_bit_esz_masks[5] = {
3096 0x5555555555555555ull,
3097 0x3333333333333333ull,
3098 0x0f0f0f0f0f0f0f0full,
3099 0x00ff00ff00ff00ffull,
3100 0x0000ffff0000ffffull,
3101 };
3102
3103 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3104 * For N==0, this corresponds to the operation that in qemu/bitops.h
3105 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3106 * section 7-2 Shuffling Bits.
3107 */
3108 static uint64_t expand_bits(uint64_t x, int n)
3109 {
3110 int i;
3111
3112 x &= 0xffffffffu;
3113 for (i = 4; i >= n; i--) {
3114 int sh = 1 << i;
3115 x = ((x << sh) | x) & even_bit_esz_masks[i];
3116 }
3117 return x;
3118 }
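
/*
 * Example for N = 0: expand_bits(0b1011, 0) places input bit k at
 * output bit 2k, giving 0b01000101; sve_zip_p below relies on this
 * zero interleave so that the second operand's expanded bits can be
 * merged into the odd positions.
 */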
3119
3120 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3121 * For N==0, this corresponds to the operation that in qemu/bitops.h
3122 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3123 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3124 */
3125 static uint64_t compress_bits(uint64_t x, int n)
3126 {
3127 int i;
3128
3129 for (i = n; i <= 4; i++) {
3130 int sh = 1 << i;
3131 x &= even_bit_esz_masks[i];
3132 x = (x >> sh) | x;
3133 }
3134 return x & 0xffffffffu;
3135 }
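
/*
 * compress_bits() is the inverse for N = 0: compress_bits(0b01000101, 0)
 * gathers the even-numbered bits back together and returns 0b1011,
 * which is what sve_uzp_p uses to peel one predicate out of an
 * interleaved pair.
 */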
3136
3137 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3138 {
3139 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3140 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3141 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3142 int esize = 1 << esz;
3143 uint64_t *d = vd;
3144 intptr_t i;
3145
3146 if (oprsz <= 8) {
3147 uint64_t nn = *(uint64_t *)vn;
3148 uint64_t mm = *(uint64_t *)vm;
3149 int half = 4 * oprsz;
3150
3151 nn = extract64(nn, high * half, half);
3152 mm = extract64(mm, high * half, half);
3153 nn = expand_bits(nn, esz);
3154 mm = expand_bits(mm, esz);
3155 d[0] = nn | (mm << esize);
3156 } else {
3157 ARMPredicateReg tmp;
3158
3159 /* We produce output faster than we consume input.
3160 Therefore we must be mindful of possible overlap. */
3161 if (vd == vn) {
3162 vn = memcpy(&tmp, vn, oprsz);
3163 if (vd == vm) {
3164 vm = vn;
3165 }
3166 } else if (vd == vm) {
3167 vm = memcpy(&tmp, vm, oprsz);
3168 }
3169 if (high) {
3170 high = oprsz >> 1;
3171 }
3172
3173 if ((oprsz & 7) == 0) {
3174 uint32_t *n = vn, *m = vm;
3175 high >>= 2;
3176
3177 for (i = 0; i < oprsz / 8; i++) {
3178 uint64_t nn = n[H4(high + i)];
3179 uint64_t mm = m[H4(high + i)];
3180
3181 nn = expand_bits(nn, esz);
3182 mm = expand_bits(mm, esz);
3183 d[i] = nn | (mm << esize);
3184 }
3185 } else {
3186 uint8_t *n = vn, *m = vm;
3187 uint16_t *d16 = vd;
3188
3189 for (i = 0; i < oprsz / 2; i++) {
3190 uint16_t nn = n[H1(high + i)];
3191 uint16_t mm = m[H1(high + i)];
3192
3193 nn = expand_bits(nn, esz);
3194 mm = expand_bits(mm, esz);
3195 d16[H2(i)] = nn | (mm << esize);
3196 }
3197 }
3198 }
3199 }
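/*
 * Worked example (for illustration): for byte elements (esz = 0) with
 * oprsz = 2 and high = 0, Pn bits 0b00001111 and Pm bits 0b00000011
 * expand to 0x55 and 0x05, so d[0] = 0x55 | (0x05 << 1) = 0x5f; result
 * bit 2k comes from Pn bit k and bit 2k+1 from Pm bit k.
 */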
3200
3201 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3202 {
3203 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3204 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3205 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3206 uint64_t *d = vd, *n = vn, *m = vm;
3207 uint64_t l, h;
3208 intptr_t i;
3209
3210 if (oprsz <= 8) {
3211 l = compress_bits(n[0] >> odd, esz);
3212 h = compress_bits(m[0] >> odd, esz);
3213 d[0] = l | (h << (4 * oprsz));
3214 } else {
3215 ARMPredicateReg tmp_m;
3216 intptr_t oprsz_16 = oprsz / 16;
3217
3218 if ((vm - vd) < (uintptr_t)oprsz) {
3219 m = memcpy(&tmp_m, vm, oprsz);
3220 }
3221
3222 for (i = 0; i < oprsz_16; i++) {
3223 l = n[2 * i + 0];
3224 h = n[2 * i + 1];
3225 l = compress_bits(l >> odd, esz);
3226 h = compress_bits(h >> odd, esz);
3227 d[i] = l | (h << 32);
3228 }
3229
3230 /*
3231 * For VL which is not a multiple of 512, the results from M do not
3232 * align nicely with the uint64_t for D. Put the aligned results
3233 * from M into TMP_M and then copy it into place afterward.
3234 */
3235 if (oprsz & 15) {
3236 int final_shift = (oprsz & 15) * 2;
3237
3238 l = n[2 * i + 0];
3239 h = n[2 * i + 1];
3240 l = compress_bits(l >> odd, esz);
3241 h = compress_bits(h >> odd, esz);
3242 d[i] = l | (h << final_shift);
3243
3244 for (i = 0; i < oprsz_16; i++) {
3245 l = m[2 * i + 0];
3246 h = m[2 * i + 1];
3247 l = compress_bits(l >> odd, esz);
3248 h = compress_bits(h >> odd, esz);
3249 tmp_m.p[i] = l | (h << 32);
3250 }
3251 l = m[2 * i + 0];
3252 h = m[2 * i + 1];
3253 l = compress_bits(l >> odd, esz);
3254 h = compress_bits(h >> odd, esz);
3255 tmp_m.p[i] = l | (h << final_shift);
3256
3257 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3258 } else {
3259 for (i = 0; i < oprsz_16; i++) {
3260 l = m[2 * i + 0];
3261 h = m[2 * i + 1];
3262 l = compress_bits(l >> odd, esz);
3263 h = compress_bits(h >> odd, esz);
3264 d[oprsz_16 + i] = l | (h << 32);
3265 }
3266 }
3267 }
3268 }
3269
3270 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3271 {
3272 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3273 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3274 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3275 uint64_t *d = vd, *n = vn, *m = vm;
3276 uint64_t mask;
3277 int shr, shl;
3278 intptr_t i;
3279
3280 shl = 1 << esz;
3281 shr = 0;
3282 mask = even_bit_esz_masks[esz];
3283 if (odd) {
3284 mask <<= shl;
3285 shr = shl;
3286 shl = 0;
3287 }
3288
3289 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3290 uint64_t nn = (n[i] & mask) >> shr;
3291 uint64_t mm = (m[i] & mask) << shl;
3292 d[i] = nn + mm;
3293 }
3294 }
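/*
 * Worked example (for illustration): for esz = 0 and odd = 0 this
 * reduces to d = (n & 0x5555...) | ((m & 0x5555...) << 1), i.e. even
 * result bits come from the even bits of Pn and odd result bits from
 * the even bits of Pm.
 */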
3295
3296 /* Reverse units of 2**N bits. */
3297 static uint64_t reverse_bits_64(uint64_t x, int n)
3298 {
3299 int i, sh;
3300
3301 x = bswap64(x);
3302 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3303 uint64_t mask = even_bit_esz_masks[i];
3304 x = ((x & mask) << sh) | ((x >> sh) & mask);
3305 }
3306 return x;
3307 }
3308
3309 static uint8_t reverse_bits_8(uint8_t x, int n)
3310 {
3311 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3312 int i, sh;
3313
3314 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3315 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3316 }
3317 return x;
3318 }
3319
3320 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3321 {
3322 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3323 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3324 intptr_t i, oprsz_2 = oprsz / 2;
3325
3326 if (oprsz <= 8) {
3327 uint64_t l = *(uint64_t *)vn;
3328 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3329 *(uint64_t *)vd = l;
3330 } else if ((oprsz & 15) == 0) {
3331 for (i = 0; i < oprsz_2; i += 8) {
3332 intptr_t ih = oprsz - 8 - i;
3333 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3334 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3335 *(uint64_t *)(vd + i) = h;
3336 *(uint64_t *)(vd + ih) = l;
3337 }
3338 } else {
3339 for (i = 0; i < oprsz_2; i += 1) {
3340 intptr_t il = H1(i);
3341 intptr_t ih = H1(oprsz - 1 - i);
3342 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3343 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3344 *(uint8_t *)(vd + il) = h;
3345 *(uint8_t *)(vd + ih) = l;
3346 }
3347 }
3348 }
3349
3350 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3351 {
3352 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3353 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3354 uint64_t *d = vd;
3355 intptr_t i;
3356
3357 if (oprsz <= 8) {
3358 uint64_t nn = *(uint64_t *)vn;
3359 int half = 4 * oprsz;
3360
3361 nn = extract64(nn, high * half, half);
3362 nn = expand_bits(nn, 0);
3363 d[0] = nn;
3364 } else {
3365 ARMPredicateReg tmp_n;
3366
3367 /* We produce output faster than we consume input.
3368 Therefore we must be mindful of possible overlap. */
3369 if ((vn - vd) < (uintptr_t)oprsz) {
3370 vn = memcpy(&tmp_n, vn, oprsz);
3371 }
3372 if (high) {
3373 high = oprsz >> 1;
3374 }
3375
3376 if ((oprsz & 7) == 0) {
3377 uint32_t *n = vn;
3378 high >>= 2;
3379
3380 for (i = 0; i < oprsz / 8; i++) {
3381 uint64_t nn = n[H4(high + i)];
3382 d[i] = expand_bits(nn, 0);
3383 }
3384 } else {
3385 uint16_t *d16 = vd;
3386 uint8_t *n = vn;
3387
3388 for (i = 0; i < oprsz / 2; i++) {
3389 uint16_t nn = n[H1(high + i)];
3390 d16[H2(i)] = expand_bits(nn, 0);
3391 }
3392 }
3393 }
3394 }
3395
3396 #define DO_ZIP(NAME, TYPE, H) \
3397 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3398 { \
3399 intptr_t oprsz = simd_oprsz(desc); \
3400 intptr_t odd_ofs = simd_data(desc); \
3401 intptr_t i, oprsz_2 = oprsz / 2; \
3402 ARMVectorReg tmp_n, tmp_m; \
3403 /* We produce output faster than we consume input. \
3404 Therefore we must be mindful of possible overlap. */ \
3405 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3406 vn = memcpy(&tmp_n, vn, oprsz); \
3407 } \
3408 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3409 vm = memcpy(&tmp_m, vm, oprsz); \
3410 } \
3411 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3412 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3413 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
3414 *(TYPE *)(vm + odd_ofs + H(i)); \
3415 } \
3416 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3417 memset(vd + oprsz - 16, 0, 16); \
3418 } \
3419 }
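/*
 * Note (for illustration): DO_ZIP interleaves elements from the
 * selected halves of Zn and Zm, so for bytes with odd_ofs = 0 the
 * result is { n[0], m[0], n[1], m[1], ... }.  The trailing memset can
 * only trigger for the 128-bit element forms, where sizeof(TYPE) == 16.
 */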
3420
3421 DO_ZIP(sve_zip_b, uint8_t, H1)
3422 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3423 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3424 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3425 DO_ZIP(sve2_zip_q, Int128, )
3426
3427 #define DO_UZP(NAME, TYPE, H) \
3428 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3429 { \
3430 intptr_t oprsz = simd_oprsz(desc); \
3431 intptr_t odd_ofs = simd_data(desc); \
3432 intptr_t i, p; \
3433 ARMVectorReg tmp_m; \
3434 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3435 vm = memcpy(&tmp_m, vm, oprsz); \
3436 } \
3437 i = 0, p = odd_ofs; \
3438 do { \
3439 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
3440 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3441 } while (p < oprsz); \
3442 p -= oprsz; \
3443 do { \
3444 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
3445 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
3446 } while (p < oprsz); \
3447 tcg_debug_assert(i == oprsz); \
3448 }
3449
3450 DO_UZP(sve_uzp_b, uint8_t, H1)
3451 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3452 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3453 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3454 DO_UZP(sve2_uzp_q, Int128, )
3455
3456 #define DO_TRN(NAME, TYPE, H) \
3457 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3458 { \
3459 intptr_t oprsz = simd_oprsz(desc); \
3460 intptr_t odd_ofs = simd_data(desc); \
3461 intptr_t i; \
3462 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3463 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3464 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3465 *(TYPE *)(vd + H(i + 0)) = ae; \
3466 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3467 } \
3468 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
3469 memset(vd + oprsz - 16, 0, 16); \
3470 } \
3471 }
3472
3473 DO_TRN(sve_trn_b, uint8_t, H1)
3474 DO_TRN(sve_trn_h, uint16_t, H1_2)
3475 DO_TRN(sve_trn_s, uint32_t, H1_4)
3476 DO_TRN(sve_trn_d, uint64_t, H1_8)
3477 DO_TRN(sve2_trn_q, Int128, )
3478
3479 #undef DO_ZIP
3480 #undef DO_UZP
3481 #undef DO_TRN
3482
3483 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3484 {
3485 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3486 uint32_t *d = vd, *n = vn;
3487 uint8_t *pg = vg;
3488
3489 for (i = j = 0; i < opr_sz; i++) {
3490 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3491 d[H4(j)] = n[H4(i)];
3492 j++;
3493 }
3494 }
3495 for (; j < opr_sz; j++) {
3496 d[H4(j)] = 0;
3497 }
3498 }
3499
3500 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3501 {
3502 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3503 uint64_t *d = vd, *n = vn;
3504 uint8_t *pg = vg;
3505
3506 for (i = j = 0; i < opr_sz; i++) {
3507 if (pg[H1(i)] & 1) {
3508 d[j] = n[i];
3509 j++;
3510 }
3511 }
3512 for (; j < opr_sz; j++) {
3513 d[j] = 0;
3514 }
3515 }
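/*
 * Worked example (for illustration): COMPACT packs the active elements
 * of Zn into the low end of Zd and zeroes the remainder, so with four
 * doublewords and predicate { 0, 1, 0, 1 } the result is
 * { n[1], n[3], 0, 0 }.
 */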
3516
3517 /* Similar to the ARM LastActiveElement pseudocode function, except the
3518 * result is multiplied by the element size. This includes the not found
3519 * indication; e.g. not found for esz=3 is -8.
3520 */
3521 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3522 {
3523 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3524 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3525
3526 return last_active_element(vg, words, esz);
3527 }
3528
3529 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3530 {
3531 intptr_t opr_sz = simd_oprsz(desc) / 8;
3532 int esz = simd_data(desc);
3533 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3534 intptr_t i, first_i, last_i;
3535 ARMVectorReg tmp;
3536
3537 first_i = last_i = 0;
3538 first_g = last_g = 0;
3539
3540 /* Find the extent of the active elements within VG. */
3541 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3542 pg = *(uint64_t *)(vg + i) & mask;
3543 if (pg) {
3544 if (last_g == 0) {
3545 last_g = pg;
3546 last_i = i;
3547 }
3548 first_g = pg;
3549 first_i = i;
3550 }
3551 }
3552
3553 len = 0;
3554 if (first_g != 0) {
3555 first_i = first_i * 8 + ctz64(first_g);
3556 last_i = last_i * 8 + 63 - clz64(last_g);
3557 len = last_i - first_i + (1 << esz);
3558 if (vd == vm) {
3559 vm = memcpy(&tmp, vm, opr_sz * 8);
3560 }
3561 swap_memmove(vd, vn + first_i, len);
3562 }
3563 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3564 }
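/*
 * Worked example (for illustration): first_i and last_i end up as
 * predicate bit indexes, which equal byte offsets into the vector.
 * For esz = 2 with only the elements at bytes 8 and 16 active,
 * len = 16 - 8 + 4 = 12, so bytes 8..19 of Zn are copied to the start
 * of Zd and the rest is filled from Zm.
 */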
3565
3566 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3567 void *vg, uint32_t desc)
3568 {
3569 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3570 uint64_t *d = vd, *n = vn, *m = vm;
3571 uint8_t *pg = vg;
3572
3573 for (i = 0; i < opr_sz; i += 1) {
3574 uint64_t nn = n[i], mm = m[i];
3575 uint64_t pp = expand_pred_b(pg[H1(i)]);
3576 d[i] = (nn & pp) | (mm & ~pp);
3577 }
3578 }
3579
3580 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3581 void *vg, uint32_t desc)
3582 {
3583 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3584 uint64_t *d = vd, *n = vn, *m = vm;
3585 uint8_t *pg = vg;
3586
3587 for (i = 0; i < opr_sz; i += 1) {
3588 uint64_t nn = n[i], mm = m[i];
3589 uint64_t pp = expand_pred_h(pg[H1(i)]);
3590 d[i] = (nn & pp) | (mm & ~pp);
3591 }
3592 }
3593
3594 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3595 void *vg, uint32_t desc)
3596 {
3597 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3598 uint64_t *d = vd, *n = vn, *m = vm;
3599 uint8_t *pg = vg;
3600
3601 for (i = 0; i < opr_sz; i += 1) {
3602 uint64_t nn = n[i], mm = m[i];
3603 uint64_t pp = expand_pred_s(pg[H1(i)]);
3604 d[i] = (nn & pp) | (mm & ~pp);
3605 }
3606 }
3607
3608 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3609 void *vg, uint32_t desc)
3610 {
3611 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3612 uint64_t *d = vd, *n = vn, *m = vm;
3613 uint8_t *pg = vg;
3614
3615 for (i = 0; i < opr_sz; i += 1) {
3616 uint64_t nn = n[i], mm = m[i];
3617 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3618 }
3619 }
3620
3621 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3622 void *vg, uint32_t desc)
3623 {
3624 intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3625 Int128 *d = vd, *n = vn, *m = vm;
3626 uint16_t *pg = vg;
3627
3628 for (i = 0; i < opr_sz; i += 1) {
3629 d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3630 }
3631 }
3632
3633 /* Two operand comparison controlled by a predicate.
3634 * ??? It is very tempting to want to be able to expand this inline
3635 * with x86 instructions, e.g.
3636 *
3637 * vcmpeqw zm, zn, %ymm0
3638 * vpmovmskb %ymm0, %eax
3639 * and $0x5555, %eax
3640 * and pg, %eax
3641 *
3642 * or even aarch64, e.g.
3643 *
3644 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3645 * cmeq v0.8h, zn, zm
3646 * and v0.8h, v0.8h, mask
3647 * addv h0, v0.8h
3648 * and v0.8b, pg
3649 *
3650 * However, coming up with an abstraction that allows vector inputs and
3651 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3652 * scalar outputs, is tricky.
3653 */
3654 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3655 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3656 { \
3657 intptr_t opr_sz = simd_oprsz(desc); \
3658 uint32_t flags = PREDTEST_INIT; \
3659 intptr_t i = opr_sz; \
3660 do { \
3661 uint64_t out = 0, pg; \
3662 do { \
3663 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3664 TYPE nn = *(TYPE *)(vn + H(i)); \
3665 TYPE mm = *(TYPE *)(vm + H(i)); \
3666 out |= nn OP mm; \
3667 } while (i & 63); \
3668 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3669 out &= pg; \
3670 *(uint64_t *)(vd + (i >> 3)) = out; \
3671 flags = iter_predtest_bwd(out, pg, flags); \
3672 } while (i > 0); \
3673 return flags; \
3674 }
3675
3676 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3677 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3678 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3679 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3680 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3681 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3682 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3683 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3684
3685 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3686 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3687 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3688 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3689
3690 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3691 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3692 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3693 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3694
3695 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3696 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3697 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3698 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3699
3700 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3701 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3702 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3703 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3704
3705 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3706 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3707 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3708 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3709
3710 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3711 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3712 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3713 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3714
3715 #undef DO_CMP_PPZZ_B
3716 #undef DO_CMP_PPZZ_H
3717 #undef DO_CMP_PPZZ_S
3718 #undef DO_CMP_PPZZ_D
3719 #undef DO_CMP_PPZZ
3720
3721 /* Similar, but the second source is "wide". */
3722 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3723 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3724 { \
3725 intptr_t opr_sz = simd_oprsz(desc); \
3726 uint32_t flags = PREDTEST_INIT; \
3727 intptr_t i = opr_sz; \
3728 do { \
3729 uint64_t out = 0, pg; \
3730 do { \
3731 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3732 do { \
3733 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3734 TYPE nn = *(TYPE *)(vn + H(i)); \
3735 out |= nn OP mm; \
3736 } while (i & 7); \
3737 } while (i & 63); \
3738 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3739 out &= pg; \
3740 *(uint64_t *)(vd + (i >> 3)) = out; \
3741 flags = iter_predtest_bwd(out, pg, flags); \
3742 } while (i > 0); \
3743 return flags; \
3744 }
3745
3746 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3747 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3748 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3749 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3750 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3751 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3752
3753 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3754 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3755 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3756
3757 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3758 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3759 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3760
3761 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3762 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3763 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3764
3765 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3766 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3767 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3768
3769 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3770 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3771 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3772
3773 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3774 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3775 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3776
3777 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3778 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3779 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3780
3781 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3782 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3783 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3784
3785 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3786 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3787 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3788
3789 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3790 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3791 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3792
3793 #undef DO_CMP_PPZW_B
3794 #undef DO_CMP_PPZW_H
3795 #undef DO_CMP_PPZW_S
3796 #undef DO_CMP_PPZW
3797
3798 /* Similar, but the second source is immediate. */
3799 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3800 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3801 { \
3802 intptr_t opr_sz = simd_oprsz(desc); \
3803 uint32_t flags = PREDTEST_INIT; \
3804 TYPE mm = simd_data(desc); \
3805 intptr_t i = opr_sz; \
3806 do { \
3807 uint64_t out = 0, pg; \
3808 do { \
3809 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3810 TYPE nn = *(TYPE *)(vn + H(i)); \
3811 out |= nn OP mm; \
3812 } while (i & 63); \
3813 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3814 out &= pg; \
3815 *(uint64_t *)(vd + (i >> 3)) = out; \
3816 flags = iter_predtest_bwd(out, pg, flags); \
3817 } while (i > 0); \
3818 return flags; \
3819 }
3820
3821 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3822 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3823 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3824 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3825 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3826 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3827 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3828 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3829
3830 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3831 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3832 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3833 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3834
3835 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3836 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3837 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3838 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3839
3840 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3841 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3842 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3843 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3844
3845 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3846 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3847 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3848 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3849
3850 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3851 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3852 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3853 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3854
3855 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3856 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3857 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3858 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3859
3860 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3861 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3862 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3863 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3864
3865 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3866 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3867 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3868 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3869
3870 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3871 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3872 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3873 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3874
3875 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3876 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3877 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3878 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3879
3880 #undef DO_CMP_PPZI_B
3881 #undef DO_CMP_PPZI_H
3882 #undef DO_CMP_PPZI_S
3883 #undef DO_CMP_PPZI_D
3884 #undef DO_CMP_PPZI
3885
3886 /* Similar to the ARM LastActive pseudocode function. */
3887 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3888 {
3889 intptr_t i;
3890
3891 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3892 uint64_t pg = *(uint64_t *)(vg + i);
3893 if (pg) {
3894 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3895 }
3896 }
3897 return 0;
3898 }
3899
3900 /* Compute a mask into RETB that is true for all G, up to and including
3901 * (if after) or excluding (if !after) the first G & N.
3902 * Return true if BRK found.
3903 */
3904 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3905 bool brk, bool after)
3906 {
3907 uint64_t b;
3908
3909 if (brk) {
3910 b = 0;
3911 } else if ((g & n) == 0) {
3912 /* For all G, no N are set; break not found. */
3913 b = g;
3914 } else {
3915 /* Break somewhere in N. Locate it. */
3916 b = g & n; /* guard true, pred true */
3917 b = b & -b; /* first such */
3918 if (after) {
3919 b = b | (b - 1); /* break after same */
3920 } else {
3921 b = b - 1; /* break before same */
3922 }
3923 brk = true;
3924 }
3925
3926 *retb = b;
3927 return brk;
3928 }
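/*
 * Worked example (for illustration): with g = 0xff, n = 0x10 and
 * brk = false, the first active N bit is bit 4; "break after" returns
 * RETB = 0x1f (elements 0..4 stay true) and "break before" returns
 * RETB = 0x0f (elements 0..3).  Either way BRK becomes true.
 */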
3929
3930 /* Compute a zeroing BRK. */
3931 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3932 intptr_t oprsz, bool after)
3933 {
3934 bool brk = false;
3935 intptr_t i;
3936
3937 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3938 uint64_t this_b, this_g = g[i];
3939
3940 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3941 d[i] = this_b & this_g;
3942 }
3943 }
3944
3945 /* Likewise, but also compute flags. */
3946 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3947 intptr_t oprsz, bool after)
3948 {
3949 uint32_t flags = PREDTEST_INIT;
3950 bool brk = false;
3951 intptr_t i;
3952
3953 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3954 uint64_t this_b, this_d, this_g = g[i];
3955
3956 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3957 d[i] = this_d = this_b & this_g;
3958 flags = iter_predtest_fwd(this_d, this_g, flags);
3959 }
3960 return flags;
3961 }
3962
3963 /* Compute a merging BRK. */
3964 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3965 intptr_t oprsz, bool after)
3966 {
3967 bool brk = false;
3968 intptr_t i;
3969
3970 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3971 uint64_t this_b, this_g = g[i];
3972
3973 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3974 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3975 }
3976 }
3977
3978 /* Likewise, but also compute flags. */
3979 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3980 intptr_t oprsz, bool after)
3981 {
3982 uint32_t flags = PREDTEST_INIT;
3983 bool brk = false;
3984 intptr_t i;
3985
3986 for (i = 0; i < oprsz / 8; ++i) {
3987 uint64_t this_b, this_d = d[i], this_g = g[i];
3988
3989 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3990 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3991 flags = iter_predtest_fwd(this_d, this_g, flags);
3992 }
3993 return flags;
3994 }
3995
3996 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3997 {
3998 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3999 * The compiler should turn this into 4 64-bit integer stores.
4000 */
4001 memset(d, 0, sizeof(ARMPredicateReg));
4002 return PREDTEST_INIT;
4003 }
4004
4005 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4006 uint32_t pred_desc)
4007 {
4008 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4009 if (last_active_pred(vn, vg, oprsz)) {
4010 compute_brk_z(vd, vm, vg, oprsz, true);
4011 } else {
4012 do_zero(vd, oprsz);
4013 }
4014 }
4015
4016 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4017 uint32_t pred_desc)
4018 {
4019 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4020 if (last_active_pred(vn, vg, oprsz)) {
4021 return compute_brks_z(vd, vm, vg, oprsz, true);
4022 } else {
4023 return do_zero(vd, oprsz);
4024 }
4025 }
4026
4027 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4028 uint32_t pred_desc)
4029 {
4030 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4031 if (last_active_pred(vn, vg, oprsz)) {
4032 compute_brk_z(vd, vm, vg, oprsz, false);
4033 } else {
4034 do_zero(vd, oprsz);
4035 }
4036 }
4037
4038 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4039 uint32_t pred_desc)
4040 {
4041 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4042 if (last_active_pred(vn, vg, oprsz)) {
4043 return compute_brks_z(vd, vm, vg, oprsz, false);
4044 } else {
4045 return do_zero(vd, oprsz);
4046 }
4047 }
4048
4049 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4050 {
4051 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4052 compute_brk_z(vd, vn, vg, oprsz, true);
4053 }
4054
4055 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4056 {
4057 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4058 return compute_brks_z(vd, vn, vg, oprsz, true);
4059 }
4060
4061 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062 {
4063 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064 compute_brk_z(vd, vn, vg, oprsz, false);
4065 }
4066
4067 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4068 {
4069 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4070 return compute_brks_z(vd, vn, vg, oprsz, false);
4071 }
4072
4073 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4074 {
4075 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4076 compute_brk_m(vd, vn, vg, oprsz, true);
4077 }
4078
4079 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4080 {
4081 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4082 return compute_brks_m(vd, vn, vg, oprsz, true);
4083 }
4084
4085 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4086 {
4087 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4088 compute_brk_m(vd, vn, vg, oprsz, false);
4089 }
4090
4091 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4092 {
4093 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4094 return compute_brks_m(vd, vn, vg, oprsz, false);
4095 }
4096
4097 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4098 {
4099 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4100 if (!last_active_pred(vn, vg, oprsz)) {
4101 do_zero(vd, oprsz);
4102 }
4103 }
4104
4105 /* As if PredTest(Ones(PL), D, esz). */
4106 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4107 uint64_t esz_mask)
4108 {
4109 uint32_t flags = PREDTEST_INIT;
4110 intptr_t i;
4111
4112 for (i = 0; i < oprsz / 8; i++) {
4113 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4114 }
4115 if (oprsz & 7) {
4116 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4117 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4118 }
4119 return flags;
4120 }
4121
4122 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4123 {
4124 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4125 if (last_active_pred(vn, vg, oprsz)) {
4126 return predtest_ones(vd, oprsz, -1);
4127 } else {
4128 return do_zero(vd, oprsz);
4129 }
4130 }
4131
4132 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4133 {
4134 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4135 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4136 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4137 intptr_t i;
4138
4139 for (i = 0; i < words; ++i) {
4140 uint64_t t = n[i] & g[i] & mask;
4141 sum += ctpop64(t);
4142 }
4143 return sum;
4144 }
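/*
 * Note (for illustration): pred_esz_masks[esz] keeps one predicate bit
 * per element, so for 32-bit elements (esz = 2) only bits 0, 4, 8, ...
 * contribute to the population count.
 */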
4145
4146 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4147 {
4148 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4149 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4150 uint64_t esz_mask = pred_esz_masks[esz];
4151 ARMPredicateReg *d = vd;
4152 uint32_t flags;
4153 intptr_t i;
4154
4155 /* Begin with a zero predicate register. */
4156 flags = do_zero(d, oprsz);
4157 if (count == 0) {
4158 return flags;
4159 }
4160
4161 /* Set all of the requested bits. */
4162 for (i = 0; i < count / 64; ++i) {
4163 d->p[i] = esz_mask;
4164 }
4165 if (count & 63) {
4166 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4167 }
4168
4169 return predtest_ones(d, oprsz, esz_mask);
4170 }
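/*
 * Worked example (for illustration, assuming COUNT is scaled to
 * predicate bits, one per vector byte, as the masking against esz_mask
 * implies): a WHILE producing three active 32-bit elements passes
 * count = 12, and with esz_mask = 0x1111... this sets predicate
 * bits 0, 4 and 8.
 */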
4171
4172 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4173 {
4174 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4175 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4176 uint64_t esz_mask = pred_esz_masks[esz];
4177 ARMPredicateReg *d = vd;
4178 intptr_t i, invcount, oprbits;
4179 uint64_t bits;
4180
4181 if (count == 0) {
4182 return do_zero(d, oprsz);
4183 }
4184
4185 oprbits = oprsz * 8;
4186 tcg_debug_assert(count <= oprbits);
4187
4188 bits = esz_mask;
4189 if (oprbits & 63) {
4190 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4191 }
4192
4193 invcount = oprbits - count;
4194 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4195 d->p[i] = bits;
4196 bits = esz_mask;
4197 }
4198
4199 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4200
4201 while (--i >= 0) {
4202 d->p[i] = 0;
4203 }
4204
4205 return predtest_ones(d, oprsz, esz_mask);
4206 }
4207
4208 /* Recursive reduction on a function;
4209 * C.f. the ARM ARM function ReducePredicated.
4210 *
4211 * While it would be possible to write this without the DATA temporary,
4212 * it is much simpler to process the predicate register this way.
4213 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4214 * little to gain with a more complex non-recursive form.
4215 */
4216 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
4217 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4218 { \
4219 if (n == 1) { \
4220 return *data; \
4221 } else { \
4222 uintptr_t half = n / 2; \
4223 TYPE lo = NAME##_reduce(data, status, half); \
4224 TYPE hi = NAME##_reduce(data + half, status, half); \
4225 return FUNC(lo, hi, status); \
4226 } \
4227 } \
4228 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4229 { \
4230 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
4231 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
4232 for (i = 0; i < oprsz; ) { \
4233 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4234 do { \
4235 TYPE nn = *(TYPE *)(vn + H(i)); \
4236 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
4237 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
4238 } while (i & 15); \
4239 } \
4240 for (; i < maxsz; i += sizeof(TYPE)) { \
4241 *(TYPE *)((void *)data + i) = IDENT; \
4242 } \
4243 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \
4244 }
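/*
 * Worked example (for illustration): inactive and trailing lanes are
 * filled with IDENT before reducing, so FADDV over { 1.0, 2.0, 3.0, 4.0 }
 * with only the first two elements active computes
 * (1.0 + 2.0) + (0.0 + 0.0) as a balanced tree rather than a linear sum.
 */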
4245
4246 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4247 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4248 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4249
4250 /* Identity is floatN_default_nan, without the function call. */
4251 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4252 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4253 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4254
4255 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4256 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4257 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4258
4259 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4260 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4261 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4262
4263 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4264 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4265 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4266
4267 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4268 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4269 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4270
4271 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4272 float16_chs(float16_infinity))
4273 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4274 float32_chs(float32_infinity))
4275 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4276 float64_chs(float64_infinity))
4277
4278 #undef DO_REDUCE
4279
4280 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4281 float_status *status, uint32_t desc)
4282 {
4283 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4284 float16 result = nn;
4285
4286 do {
4287 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4288 do {
4289 if (pg & 1) {
4290 float16 mm = *(float16 *)(vm + H1_2(i));
4291 result = float16_add(result, mm, status);
4292 }
4293 i += sizeof(float16), pg >>= sizeof(float16);
4294 } while (i & 15);
4295 } while (i < opr_sz);
4296
4297 return result;
4298 }
4299
4300 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4301 float_status *status, uint32_t desc)
4302 {
4303 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4304 float32 result = nn;
4305
4306 do {
4307 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4308 do {
4309 if (pg & 1) {
4310 float32 mm = *(float32 *)(vm + H1_2(i));
4311 result = float32_add(result, mm, status);
4312 }
4313 i += sizeof(float32), pg >>= sizeof(float32);
4314 } while (i & 15);
4315 } while (i < opr_sz);
4316
4317 return result;
4318 }
4319
4320 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4321 float_status *status, uint32_t desc)
4322 {
4323 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4324 uint64_t *m = vm;
4325 uint8_t *pg = vg;
4326
4327 for (i = 0; i < opr_sz; i++) {
4328 if (pg[H1(i)] & 1) {
4329 nn = float64_add(nn, m[i], status);
4330 }
4331 }
4332
4333 return nn;
4334 }
4335
4336 /* Fully general three-operand expander, controlled by a predicate,
4337 * with the extra float_status parameter.
4338 */
4339 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4340 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4341 float_status *status, uint32_t desc) \
4342 { \
4343 intptr_t i = simd_oprsz(desc); \
4344 uint64_t *g = vg; \
4345 do { \
4346 uint64_t pg = g[(i - 1) >> 6]; \
4347 do { \
4348 i -= sizeof(TYPE); \
4349 if (likely((pg >> (i & 63)) & 1)) { \
4350 TYPE nn = *(TYPE *)(vn + H(i)); \
4351 TYPE mm = *(TYPE *)(vm + H(i)); \
4352 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4353 } \
4354 } while (i & 63); \
4355 } while (i != 0); \
4356 }
4357
4358 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4359 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4360 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4361
4362 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4363 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4364 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4365
4366 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4367 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4368 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4369
4370 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4371 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4372 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4373
4374 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4375 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4376 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4377
4378 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4379 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4380 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4381
4382 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4383 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4384 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4385
4386 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4387 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4388 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4389
4390 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4391 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4392 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4393
4394 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4395 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4396 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4397
4398 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4399 {
4400 return float16_abs(float16_sub(a, b, s));
4401 }
4402
4403 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4404 {
4405 return float32_abs(float32_sub(a, b, s));
4406 }
4407
4408 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4409 {
4410 return float64_abs(float64_sub(a, b, s));
4411 }
4412
4413 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4414 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4415 {
4416 float16 r = float16_sub(op1, op2, stat);
4417 return float16_is_any_nan(r) ? r : float16_abs(r);
4418 }
4419
4420 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4421 {
4422 float32 r = float32_sub(op1, op2, stat);
4423 return float32_is_any_nan(r) ? r : float32_abs(r);
4424 }
4425
4426 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4427 {
4428 float64 r = float64_sub(op1, op2, stat);
4429 return float64_is_any_nan(r) ? r : float64_abs(r);
4430 }
4431
4432 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4433 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4434 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4435 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4436 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4437 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4438
4439 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4440 {
4441 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4442 return float64_scalbn(a, b_int, s);
4443 }
4444
4445 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4446 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4447 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4448
4449 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4450 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4451 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4452
4453 #undef DO_ZPZZ_FP
4454
4455 /* Three-operand expander, with one scalar operand, controlled by
4456 * a predicate, with the extra float_status parameter.
4457 */
4458 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4459 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4460 float_status *status, uint32_t desc) \
4461 { \
4462 intptr_t i = simd_oprsz(desc); \
4463 uint64_t *g = vg; \
4464 TYPE mm = scalar; \
4465 do { \
4466 uint64_t pg = g[(i - 1) >> 6]; \
4467 do { \
4468 i -= sizeof(TYPE); \
4469 if (likely((pg >> (i & 63)) & 1)) { \
4470 TYPE nn = *(TYPE *)(vn + H(i)); \
4471 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4472 } \
4473 } while (i & 63); \
4474 } while (i != 0); \
4475 }
4476
4477 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4478 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4479 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4480
4481 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4482 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4483 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4484
4485 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4486 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4487 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4488
4489 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4490 {
4491 return float16_sub(b, a, s);
4492 }
4493
4494 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4495 {
4496 return float32_sub(b, a, s);
4497 }
4498
4499 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4500 {
4501 return float64_sub(b, a, s);
4502 }
4503
4504 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4505 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4506 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4507
4508 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4509 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4510 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4511
4512 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4513 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4514 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4515
4516 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4517 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4518 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4519
4520 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4521 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4522 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4523
4524 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4525 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4526 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4527
4528 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4529 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4530 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4531
4532 /* Fully general two-operand expander, controlled by a predicate,
4533 * with the extra float_status parameter.
4534 */
4535 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4536 void HELPER(NAME)(void *vd, void *vn, void *vg, \
4537 float_status *status, uint32_t desc) \
4538 { \
4539 intptr_t i = simd_oprsz(desc); \
4540 uint64_t *g = vg; \
4541 do { \
4542 uint64_t pg = g[(i - 1) >> 6]; \
4543 do { \
4544 i -= sizeof(TYPE); \
4545 if (likely((pg >> (i & 63)) & 1)) { \
4546 TYPE nn = *(TYPE *)(vn + H(i)); \
4547 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4548 } \
4549 } while (i & 63); \
4550 } while (i != 0); \
4551 }
4552
4553 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4554 * FZ16. When converting from fp16, this affects flushing input denormals;
4555 * when converting to fp16, this affects flushing output denormals.
4556 */
4557 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4558 {
4559 bool save = get_flush_inputs_to_zero(fpst);
4560 float32 ret;
4561
4562 set_flush_inputs_to_zero(false, fpst);
4563 ret = float16_to_float32(f, true, fpst);
4564 set_flush_inputs_to_zero(save, fpst);
4565 return ret;
4566 }
4567
4568 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4569 {
4570 bool save = get_flush_inputs_to_zero(fpst);
4571 float64 ret;
4572
4573 set_flush_inputs_to_zero(false, fpst);
4574 ret = float16_to_float64(f, true, fpst);
4575 set_flush_inputs_to_zero(save, fpst);
4576 return ret;
4577 }
4578
4579 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4580 {
4581 bool save = get_flush_to_zero(fpst);
4582 float16 ret;
4583
4584 set_flush_to_zero(false, fpst);
4585 ret = float32_to_float16(f, true, fpst);
4586 set_flush_to_zero(save, fpst);
4587 return ret;
4588 }
4589
4590 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4591 {
4592 bool save = get_flush_to_zero(fpst);
4593 float16 ret;
4594
4595 set_flush_to_zero(false, fpst);
4596 ret = float64_to_float16(f, true, fpst);
4597 set_flush_to_zero(save, fpst);
4598 return ret;
4599 }
4600
4601 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4602 {
4603 if (float16_is_any_nan(f)) {
4604 float_raise(float_flag_invalid, s);
4605 return 0;
4606 }
4607 return float16_to_int16_round_to_zero(f, s);
4608 }
4609
4610 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4611 {
4612 if (float16_is_any_nan(f)) {
4613 float_raise(float_flag_invalid, s);
4614 return 0;
4615 }
4616 return float16_to_int64_round_to_zero(f, s);
4617 }
4618
4619 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4620 {
4621 if (float32_is_any_nan(f)) {
4622 float_raise(float_flag_invalid, s);
4623 return 0;
4624 }
4625 return float32_to_int64_round_to_zero(f, s);
4626 }
4627
4628 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4629 {
4630 if (float64_is_any_nan(f)) {
4631 float_raise(float_flag_invalid, s);
4632 return 0;
4633 }
4634 return float64_to_int64_round_to_zero(f, s);
4635 }
4636
4637 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4638 {
4639 if (float16_is_any_nan(f)) {
4640 float_raise(float_flag_invalid, s);
4641 return 0;
4642 }
4643 return float16_to_uint16_round_to_zero(f, s);
4644 }
4645
4646 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4647 {
4648 if (float16_is_any_nan(f)) {
4649 float_raise(float_flag_invalid, s);
4650 return 0;
4651 }
4652 return float16_to_uint64_round_to_zero(f, s);
4653 }
4654
4655 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4656 {
4657 if (float32_is_any_nan(f)) {
4658 float_raise(float_flag_invalid, s);
4659 return 0;
4660 }
4661 return float32_to_uint64_round_to_zero(f, s);
4662 }
4663
4664 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4665 {
4666 if (float64_is_any_nan(f)) {
4667 float_raise(float_flag_invalid, s);
4668 return 0;
4669 }
4670 return float64_to_uint64_round_to_zero(f, s);
4671 }
4672
4673 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4674 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4675 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
4676 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4677 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4678 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4679 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4680
4681 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4682 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4683 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4684 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4685 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4686 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4687 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4688
4689 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4690 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4691 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4692 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4693 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4694 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4695 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4696
4697 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4698 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4699 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4700
4701 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4702 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4703 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4704
4705 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4706 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4707 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4708
4709 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4710 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4711 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4712
4713 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4714 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4715 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4716 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4717 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4718 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4719 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4720
4721 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4722 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4723 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4724 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4725 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4726 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4727 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4728
4729 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4730 {
4731 /* Extract frac to the top of the uint32_t. */
4732 uint32_t frac = (uint32_t)a << (16 + 6);
4733 int16_t exp = extract32(a, 10, 5);
4734
4735 if (unlikely(exp == 0)) {
4736 if (frac != 0) {
4737 if (!get_flush_inputs_to_zero(s)) {
4738 /* denormal: bias - fractional_zeros */
4739 return -15 - clz32(frac);
4740 }
4741 /* flush to zero */
4742 float_raise(float_flag_input_denormal_flushed, s);
4743 }
4744 } else if (unlikely(exp == 0x1f)) {
4745 if (frac == 0) {
4746 return INT16_MAX; /* infinity */
4747 }
4748 } else {
4749 /* normal: exp - bias */
4750 return exp - 15;
4751 }
4752 /* nan or zero */
4753 float_raise(float_flag_invalid, s);
4754 return INT16_MIN;
4755 }
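/*
 * Worked examples (for illustration): FLOGB of 1.0 (0x3c00) returns 0,
 * of 0.5 (0x3800) returns -1, of infinity returns INT16_MAX, and of
 * zero or NaN returns INT16_MIN with the invalid flag raised.
 */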
4756
4757 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4758 {
4759 /* Extract frac to the top of the uint32_t. */
4760 uint32_t frac = a << 9;
4761 int32_t exp = extract32(a, 23, 8);
4762
4763 if (unlikely(exp == 0)) {
4764 if (frac != 0) {
4765 if (!get_flush_inputs_to_zero(s)) {
4766 /* denormal: bias - fractional_zeros */
4767 return -127 - clz32(frac);
4768 }
4769 /* flush to zero */
4770 float_raise(float_flag_input_denormal_flushed, s);
4771 }
4772 } else if (unlikely(exp == 0xff)) {
4773 if (frac == 0) {
4774 return INT32_MAX; /* infinity */
4775 }
4776 } else {
4777 /* normal: exp - bias */
4778 return exp - 127;
4779 }
4780 /* nan or zero */
4781 float_raise(float_flag_invalid, s);
4782 return INT32_MIN;
4783 }
4784
4785 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4786 {
4787 /* Extract frac to the top of the uint64_t. */
4788 uint64_t frac = a << 12;
4789 int64_t exp = extract64(a, 52, 11);
4790
4791 if (unlikely(exp == 0)) {
4792 if (frac != 0) {
4793 if (!get_flush_inputs_to_zero(s)) {
4794 /* denormal: bias - fractional_zeros */
4795 return -1023 - clz64(frac);
4796 }
4797 /* flush to zero */
4798 float_raise(float_flag_input_denormal_flushed, s);
4799 }
4800 } else if (unlikely(exp == 0x7ff)) {
4801 if (frac == 0) {
4802 return INT64_MAX; /* infinity */
4803 }
4804 } else {
4805 /* normal: exp - bias */
4806 return exp - 1023;
4807 }
4808 /* nan or zero */
4809 float_raise(float_flag_invalid, s);
4810 return INT64_MIN;
4811 }
4812
4813 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4814 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4815 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4816
4817 #undef DO_ZPZ_FP
4818
4819 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4820 float_status *status, uint32_t desc,
4821 uint16_t neg1, uint16_t neg3, int flags)
4822 {
4823 intptr_t i = simd_oprsz(desc);
4824 uint64_t *g = vg;
4825
4826 do {
4827 uint64_t pg = g[(i - 1) >> 6];
4828 do {
4829 i -= 2;
4830 if (likely((pg >> (i & 63)) & 1)) {
4831 float16 e1, e2, e3, r;
4832
4833 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4834 e2 = *(uint16_t *)(vm + H1_2(i));
4835 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4836 r = float16_muladd(e1, e2, e3, flags, status);
4837 *(uint16_t *)(vd + H1_2(i)) = r;
4838 }
4839 } while (i & 63);
4840 } while (i != 0);
4841 }
4842
4843 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4844 void *vg, float_status *status, uint32_t desc)
4845 {
4846 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4847 }
4848
4849 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4850 void *vg, float_status *status, uint32_t desc)
4851 {
4852 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4853 }
4854
4855 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4856 void *vg, float_status *status, uint32_t desc)
4857 {
4858 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4859 }
4860
4861 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4862 void *vg, float_status *status, uint32_t desc)
4863 {
4864 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4865 }
4866
4867 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4868 void *vg, float_status *status, uint32_t desc)
4869 {
4870 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4871 float_muladd_negate_product);
4872 }
4873
4874 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4875 void *vg, float_status *status, uint32_t desc)
4876 {
4877 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4878 float_muladd_negate_product | float_muladd_negate_c);
4879 }
4880
4881 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4882 void *vg, float_status *status, uint32_t desc)
4883 {
4884 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4885 float_muladd_negate_c);
4886 }
4887
4888 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4889 float_status *status, uint32_t desc,
4890 uint32_t neg1, uint32_t neg3, int flags)
4891 {
4892 intptr_t i = simd_oprsz(desc);
4893 uint64_t *g = vg;
4894
4895 do {
4896 uint64_t pg = g[(i - 1) >> 6];
4897 do {
4898 i -= 4;
4899 if (likely((pg >> (i & 63)) & 1)) {
4900 float32 e1, e2, e3, r;
4901
4902 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4903 e2 = *(uint32_t *)(vm + H1_4(i));
4904 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4905 r = float32_muladd(e1, e2, e3, flags, status);
4906 *(uint32_t *)(vd + H1_4(i)) = r;
4907 }
4908 } while (i & 63);
4909 } while (i != 0);
4910 }
4911
4912 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4913 void *vg, float_status *status, uint32_t desc)
4914 {
4915 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4916 }
4917
4918 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4919 void *vg, float_status *status, uint32_t desc)
4920 {
4921 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4922 }
4923
4924 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4925 void *vg, float_status *status, uint32_t desc)
4926 {
4927 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4928 }
4929
4930 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4931 void *vg, float_status *status, uint32_t desc)
4932 {
4933 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4934 }
4935
4936 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4937 void *vg, float_status *status, uint32_t desc)
4938 {
4939 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4940 float_muladd_negate_product);
4941 }
4942
4943 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4944 void *vg, float_status *status, uint32_t desc)
4945 {
4946 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4947 float_muladd_negate_product | float_muladd_negate_c);
4948 }
4949
4950 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4951 void *vg, float_status *status, uint32_t desc)
4952 {
4953 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4954 float_muladd_negate_c);
4955 }
4956
4957 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4958 float_status *status, uint32_t desc,
4959 uint64_t neg1, uint64_t neg3, int flags)
4960 {
4961 intptr_t i = simd_oprsz(desc);
4962 uint64_t *g = vg;
4963
4964 do {
4965 uint64_t pg = g[(i - 1) >> 6];
4966 do {
4967 i -= 8;
4968 if (likely((pg >> (i & 63)) & 1)) {
4969 float64 e1, e2, e3, r;
4970
4971 e1 = *(uint64_t *)(vn + i) ^ neg1;
4972 e2 = *(uint64_t *)(vm + i);
4973 e3 = *(uint64_t *)(va + i) ^ neg3;
4974 r = float64_muladd(e1, e2, e3, flags, status);
4975 *(uint64_t *)(vd + i) = r;
4976 }
4977 } while (i & 63);
4978 } while (i != 0);
4979 }
4980
4981 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4982 void *vg, float_status *status, uint32_t desc)
4983 {
4984 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4985 }
4986
4987 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4988 void *vg, float_status *status, uint32_t desc)
4989 {
4990 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4991 }
4992
4993 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4994 void *vg, float_status *status, uint32_t desc)
4995 {
4996 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4997 }
4998
4999 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5000 void *vg, float_status *status, uint32_t desc)
5001 {
5002 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5003 }
5004
5005 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5006 void *vg, float_status *status, uint32_t desc)
5007 {
5008 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5009 float_muladd_negate_product);
5010 }
5011
5012 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5013 void *vg, float_status *status, uint32_t desc)
5014 {
5015 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5016 float_muladd_negate_product | float_muladd_negate_c);
5017 }
5018
5019 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5020 void *vg, float_status *status, uint32_t desc)
5021 {
5022 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5023 float_muladd_negate_c);
5024 }
5025
5026 /* Two operand floating-point comparison controlled by a predicate.
5027 * Unlike the integer version, we are not allowed to optimistically
5028 * compare operands, since the comparison may have side effects wrt
5029 * the FPSR.
5030 */
5031 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
5032 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
5033 float_status *status, uint32_t desc) \
5034 { \
5035 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5036 uint64_t *d = vd, *g = vg; \
5037 do { \
5038 uint64_t out = 0, pg = g[j]; \
5039 do { \
5040 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5041 if (likely((pg >> (i & 63)) & 1)) { \
5042 TYPE nn = *(TYPE *)(vn + H(i)); \
5043 TYPE mm = *(TYPE *)(vm + H(i)); \
5044 out |= OP(TYPE, nn, mm, status); \
5045 } \
5046 } while (i & 63); \
5047 d[j--] = out; \
5048 } while (i > 0); \
5049 }
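
/*
 * Note on the packing above: each 64-byte chunk of the vector is walked
 * from the highest element downward, with @out shifted left by sizeof(TYPE)
 * before each result is OR-ed in, so the boolean for the element at byte
 * offset I lands in bit (I & 63) of the output word -- the least
 * significant bit of that element's predicate slot, as the SVE predicate
 * layout requires.  With float64 elements, for example, the results occupy
 * bits 0, 8, 16, ... 56 of each word of Pd.
 */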
5050
5051 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5052 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5053 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5054 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5055 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5056 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5057
5058 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5059 DO_FPCMP_PPZZ_H(NAME, OP) \
5060 DO_FPCMP_PPZZ_S(NAME, OP) \
5061 DO_FPCMP_PPZZ_D(NAME, OP)
5062
5063 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
5064 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
5065 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
5066 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
5067 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
5068 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
5069 #define DO_FCMUO(TYPE, X, Y, ST) \
5070 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5071 #define DO_FACGE(TYPE, X, Y, ST) \
5072 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5073 #define DO_FACGT(TYPE, X, Y, ST) \
5074 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
5075
5076 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5077 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5078 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5079 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5080 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5081 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5082 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5083
5084 #undef DO_FPCMP_PPZZ_ALL
5085 #undef DO_FPCMP_PPZZ_D
5086 #undef DO_FPCMP_PPZZ_S
5087 #undef DO_FPCMP_PPZZ_H
5088 #undef DO_FPCMP_PPZZ
5089
5090 /* One operand floating-point comparison against zero, controlled
5091 * by a predicate.
5092 */
5093 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
5094 void HELPER(NAME)(void *vd, void *vn, void *vg, \
5095 float_status *status, uint32_t desc) \
5096 { \
5097 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
5098 uint64_t *d = vd, *g = vg; \
5099 do { \
5100 uint64_t out = 0, pg = g[j]; \
5101 do { \
5102 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
5103 if ((pg >> (i & 63)) & 1) { \
5104 TYPE nn = *(TYPE *)(vn + H(i)); \
5105 out |= OP(TYPE, nn, 0, status); \
5106 } \
5107 } while (i & 63); \
5108 d[j--] = out; \
5109 } while (i > 0); \
5110 }
5111
5112 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5113 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5114 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5115 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5116 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5117 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5118
5119 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5120 DO_FPCMP_PPZ0_H(NAME, OP) \
5121 DO_FPCMP_PPZ0_S(NAME, OP) \
5122 DO_FPCMP_PPZ0_D(NAME, OP)
5123
5124 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5126 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5127 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5128 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5129 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5130
5131 /* FP Trig Multiply-Add. */
5132
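/*
 * The coefficient tables below appear to hold, approximately, the
 * reciprocal factorials with alternating sign: entries [0..7] are the
 * odd-power (sine-style) coefficients 1, -1/3!, 1/5!, ... and entries
 * [8..15] are the even-power (cosine-style) coefficients 1, -1/2!, 1/4!,
 * ....  FTMAD selects between the two halves by adding 8 to the index
 * when the multiplicand is negative, which is what "xx += 8" does in the
 * loops below.
 */
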
5133 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5134 float_status *s, uint32_t desc)
5135 {
5136 static const float16 coeff[16] = {
5137 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5138 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5139 };
5140 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5141 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5142 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5143 float16 *d = vd, *n = vn, *m = vm;
5144
5145 for (i = 0; i < opr_sz; i++) {
5146 float16 mm = m[i];
5147 intptr_t xx = x;
5148 int flags = 0;
5149
5150 if (float16_is_neg(mm)) {
5151 if (fpcr_ah) {
5152 flags = float_muladd_negate_product;
5153 } else {
5154 mm = float16_abs(mm);
5155 }
5156 xx += 8;
5157 }
5158 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5159 }
5160 }
5161
5162 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5163 float_status *s, uint32_t desc)
5164 {
5165 static const float32 coeff[16] = {
5166 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5167 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5168 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5169 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5170 };
5171 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5172 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5173 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5174 float32 *d = vd, *n = vn, *m = vm;
5175
5176 for (i = 0; i < opr_sz; i++) {
5177 float32 mm = m[i];
5178 intptr_t xx = x;
5179 int flags = 0;
5180
5181 if (float32_is_neg(mm)) {
5182 if (fpcr_ah) {
5183 flags = float_muladd_negate_product;
5184 } else {
5185 mm = float32_abs(mm);
5186 }
5187 xx += 8;
5188 }
5189 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5190 }
5191 }
5192
5193 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5194 float_status *s, uint32_t desc)
5195 {
5196 static const float64 coeff[16] = {
5197 0x3ff0000000000000ull, 0xbfc5555555555543ull,
5198 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5199 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5200 0x3de5d8408868552full, 0x0000000000000000ull,
5201 0x3ff0000000000000ull, 0xbfe0000000000000ull,
5202 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5203 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5204 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5205 };
5206 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5207 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5208 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5209 float64 *d = vd, *n = vn, *m = vm;
5210
5211 for (i = 0; i < opr_sz; i++) {
5212 float64 mm = m[i];
5213 intptr_t xx = x;
5214 int flags = 0;
5215
5216 if (float64_is_neg(mm)) {
5217 if (fpcr_ah) {
5218 flags = float_muladd_negate_product;
5219 } else {
5220 mm = float64_abs(mm);
5221 }
5222 xx += 8;
5223 }
5224 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5225 }
5226 }
5227
5228 /*
5229 * FP Complex Add
5230 */
5231
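/*
 * A sketch of what @rot selects below, writing the Zn element pair as
 * (nr, ni) and the Zm pair as (mr, mi):
 *
 *     rot == 0:  Zd_real = nr - mi,  Zd_imag = ni + mr   (rotate Zm by 90)
 *     rot == 1:  Zd_real = nr + mi,  Zd_imag = ni - mr   (rotate Zm by 270)
 *
 * i.e. the Zm element pair is multiplied by +i or -i before the add.  The
 * sign flip is routed through float*_maybe_ah_chs so that the FPCR.AH == 1
 * behaviour can differ for special values.
 */
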
5232 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5233 float_status *s, uint32_t desc)
5234 {
5235 intptr_t j, i = simd_oprsz(desc);
5236 uint64_t *g = vg;
5237 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5238 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5239
5240 do {
5241 uint64_t pg = g[(i - 1) >> 6];
5242 do {
5243 float16 e0, e1, e2, e3;
5244
5245 /* I holds the real index; J holds the imag index. */
5246 j = i - sizeof(float16);
5247 i -= 2 * sizeof(float16);
5248
5249 e0 = *(float16 *)(vn + H1_2(i));
5250 e1 = *(float16 *)(vm + H1_2(j));
5251 e2 = *(float16 *)(vn + H1_2(j));
5252 e3 = *(float16 *)(vm + H1_2(i));
5253
5254 if (rot) {
5255 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5256 } else {
5257 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5258 }
5259
5260 if (likely((pg >> (i & 63)) & 1)) {
5261 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5262 }
5263 if (likely((pg >> (j & 63)) & 1)) {
5264 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5265 }
5266 } while (i & 63);
5267 } while (i != 0);
5268 }
5269
5270 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5271 float_status *s, uint32_t desc)
5272 {
5273 intptr_t j, i = simd_oprsz(desc);
5274 uint64_t *g = vg;
5275 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5276 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5277
5278 do {
5279 uint64_t pg = g[(i - 1) >> 6];
5280 do {
5281 float32 e0, e1, e2, e3;
5282
5283 /* I holds the real index; J holds the imag index. */
5284 j = i - sizeof(float32);
5285 i -= 2 * sizeof(float32);
5286
5287 e0 = *(float32 *)(vn + H1_2(i));
5288 e1 = *(float32 *)(vm + H1_2(j));
5289 e2 = *(float32 *)(vn + H1_2(j));
5290 e3 = *(float32 *)(vm + H1_2(i));
5291
5292 if (rot) {
5293 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5294 } else {
5295 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5296 }
5297
5298 if (likely((pg >> (i & 63)) & 1)) {
5299 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5300 }
5301 if (likely((pg >> (j & 63)) & 1)) {
5302 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5303 }
5304 } while (i & 63);
5305 } while (i != 0);
5306 }
5307
5308 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5309 float_status *s, uint32_t desc)
5310 {
5311 intptr_t j, i = simd_oprsz(desc);
5312 uint64_t *g = vg;
5313 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5314 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5315
5316 do {
5317 uint64_t pg = g[(i - 1) >> 6];
5318 do {
5319 float64 e0, e1, e2, e3;
5320
5321 /* I holds the real index; J holds the imag index. */
5322 j = i - sizeof(float64);
5323 i -= 2 * sizeof(float64);
5324
5325 e0 = *(float64 *)(vn + H1_2(i));
5326 e1 = *(float64 *)(vm + H1_2(j));
5327 e2 = *(float64 *)(vn + H1_2(j));
5328 e3 = *(float64 *)(vm + H1_2(i));
5329
5330 if (rot) {
5331 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5332 } else {
5333 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5334 }
5335
5336 if (likely((pg >> (i & 63)) & 1)) {
5337 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5338 }
5339 if (likely((pg >> (j & 63)) & 1)) {
5340 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5341 }
5342 } while (i & 63);
5343 } while (i != 0);
5344 }
5345
5346 /*
5347 * FP Complex Multiply
5348 */
5349
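/*
 * Each invocation below implements a single FCMLA rotation, i.e. one half
 * of a complex multiply-accumulate; a full complex multiply uses two FCMLA
 * instructions with different rotations.  In terms of the local names
 * nr/ni/mr/mi, a sketch of the per-pair computation is:
 *
 *     flip == 0:  d_real += nr * (mr ^ negx_real)
 *                 d_imag += nr * (mi ^ negx_imag)
 *     flip == 1:  d_real += ni * (mi ^ negx_real)
 *                 d_imag += ni * (mr ^ negx_imag)
 *
 * where, per the AH=0/AH=1 comment inside the helpers, the negations are
 * applied either as the negx_* sign-bit XORs or as float_muladd_negate_*
 * flags.
 */
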
5350 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5351 void *vg, float_status *status, uint32_t desc)
5352 {
5353 intptr_t j, i = simd_oprsz(desc);
5354 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5355 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5356 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5357 uint32_t negf_real = flip ^ negf_imag;
5358 float16 negx_imag, negx_real;
5359 uint64_t *g = vg;
5360
5361 /* With AH=0, use negx; with AH=1 use negf. */
5362 negx_real = (negf_real & ~fpcr_ah) << 15;
5363 negx_imag = (negf_imag & ~fpcr_ah) << 15;
5364 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5365 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5366
5367 do {
5368 uint64_t pg = g[(i - 1) >> 6];
5369 do {
5370 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5371
5372 /* I holds the real index; J holds the imag index. */
5373 j = i - sizeof(float16);
5374 i -= 2 * sizeof(float16);
5375
5376 nr = *(float16 *)(vn + H1_2(i));
5377 ni = *(float16 *)(vn + H1_2(j));
5378 mr = *(float16 *)(vm + H1_2(i));
5379 mi = *(float16 *)(vm + H1_2(j));
5380
5381 e2 = (flip ? ni : nr);
5382 e1 = (flip ? mi : mr) ^ negx_real;
5383 e4 = e2;
5384 e3 = (flip ? mr : mi) ^ negx_imag;
5385
5386 if (likely((pg >> (i & 63)) & 1)) {
5387 d = *(float16 *)(va + H1_2(i));
5388 d = float16_muladd(e2, e1, d, negf_real, status);
5389 *(float16 *)(vd + H1_2(i)) = d;
5390 }
5391 if (likely((pg >> (j & 63)) & 1)) {
5392 d = *(float16 *)(va + H1_2(j));
5393 d = float16_muladd(e4, e3, d, negf_imag, status);
5394 *(float16 *)(vd + H1_2(j)) = d;
5395 }
5396 } while (i & 63);
5397 } while (i != 0);
5398 }
5399
5400 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5401 void *vg, float_status *status, uint32_t desc)
5402 {
5403 intptr_t j, i = simd_oprsz(desc);
5404 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5405 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5406 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5407 uint32_t negf_real = flip ^ negf_imag;
5408 float32 negx_imag, negx_real;
5409 uint64_t *g = vg;
5410
5411 /* With AH=0, use negx; with AH=1 use negf. */
5412 negx_real = (negf_real & ~fpcr_ah) << 31;
5413 negx_imag = (negf_imag & ~fpcr_ah) << 31;
5414 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5415 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5416
5417 do {
5418 uint64_t pg = g[(i - 1) >> 6];
5419 do {
5420 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5421
5422 /* I holds the real index; J holds the imag index. */
5423 j = i - sizeof(float32);
5424 i -= 2 * sizeof(float32);
5425
5426 nr = *(float32 *)(vn + H1_2(i));
5427 ni = *(float32 *)(vn + H1_2(j));
5428 mr = *(float32 *)(vm + H1_2(i));
5429 mi = *(float32 *)(vm + H1_2(j));
5430
5431 e2 = (flip ? ni : nr);
5432 e1 = (flip ? mi : mr) ^ negx_real;
5433 e4 = e2;
5434 e3 = (flip ? mr : mi) ^ negx_imag;
5435
5436 if (likely((pg >> (i & 63)) & 1)) {
5437 d = *(float32 *)(va + H1_2(i));
5438 d = float32_muladd(e2, e1, d, negf_real, status);
5439 *(float32 *)(vd + H1_2(i)) = d;
5440 }
5441 if (likely((pg >> (j & 63)) & 1)) {
5442 d = *(float32 *)(va + H1_2(j));
5443 d = float32_muladd(e4, e3, d, negf_imag, status);
5444 *(float32 *)(vd + H1_2(j)) = d;
5445 }
5446 } while (i & 63);
5447 } while (i != 0);
5448 }
5449
5450 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5451 void *vg, float_status *status, uint32_t desc)
5452 {
5453 intptr_t j, i = simd_oprsz(desc);
5454 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5455 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5456 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5457 uint32_t negf_real = flip ^ negf_imag;
5458 float64 negx_imag, negx_real;
5459 uint64_t *g = vg;
5460
5461 /* With AH=0, use negx; with AH=1 use negf. */
5462 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5463 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5464 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5465 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5466
5467 do {
5468 uint64_t pg = g[(i - 1) >> 6];
5469 do {
5470 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5471
5472 /* I holds the real index; J holds the imag index. */
5473 j = i - sizeof(float64);
5474 i -= 2 * sizeof(float64);
5475
5476 nr = *(float64 *)(vn + H1_2(i));
5477 ni = *(float64 *)(vn + H1_2(j));
5478 mr = *(float64 *)(vm + H1_2(i));
5479 mi = *(float64 *)(vm + H1_2(j));
5480
5481 e2 = (flip ? ni : nr);
5482 e1 = (flip ? mi : mr) ^ negx_real;
5483 e4 = e2;
5484 e3 = (flip ? mr : mi) ^ negx_imag;
5485
5486 if (likely((pg >> (i & 63)) & 1)) {
5487 d = *(float64 *)(va + H1_2(i));
5488 d = float64_muladd(e2, e1, d, negf_real, status);
5489 *(float64 *)(vd + H1_2(i)) = d;
5490 }
5491 if (likely((pg >> (j & 63)) & 1)) {
5492 d = *(float64 *)(va + H1_2(j));
5493 d = float64_muladd(e4, e3, d, negf_imag, status);
5494 *(float64 *)(vd + H1_2(j)) = d;
5495 }
5496 } while (i & 63);
5497 } while (i != 0);
5498 }
5499
5500 /*
5501 * Load contiguous data, protected by a governing predicate.
5502 */
5503
5504 /*
5505 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5506  * beginning at @reg_off bounded by @reg_max. Return the offset of the first
5507  * active element >= @reg_off, or @reg_max if there were no active elements at all.
5508 */
5509 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5510 intptr_t reg_max, int esz)
5511 {
5512 uint64_t pg_mask = pred_esz_masks[esz];
5513 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5514
5515 /* In normal usage, the first element is active. */
5516 if (likely(pg & 1)) {
5517 return reg_off;
5518 }
5519
5520 if (pg == 0) {
5521 reg_off &= -64;
5522 do {
5523 reg_off += 64;
5524 if (unlikely(reg_off >= reg_max)) {
5525 /* The entire predicate was false. */
5526 return reg_max;
5527 }
5528 pg = vg[reg_off >> 6] & pg_mask;
5529 } while (pg == 0);
5530 }
5531 reg_off += ctz64(pg);
5532
5533 /* We should never see an out of range predicate bit set. */
5534 tcg_debug_assert(reg_off < reg_max);
5535 return reg_off;
5536 }
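
/*
 * Worked example of the scan above: with esz == MO_32, pred_esz_masks[esz]
 * keeps every 4th predicate bit, one per 4-byte element.  If reg_off == 8
 * and the only active element sits at byte offset 20, then
 * pg == (vg[0] & mask) >> 8 has its lowest set bit at position 12, so
 * ctz64(pg) == 12 and the function returns 8 + 12 == 20.
 */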
5537
5538 /*
5539 * Resolve the guest virtual address to info->host and info->flags.
5540 * If @nofault, return false if the page is invalid, otherwise
5541 * exit via page fault exception.
5542 */
5543
5544 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5545 target_ulong addr, int mem_off, MMUAccessType access_type,
5546 int mmu_idx, uintptr_t retaddr)
5547 {
5548 int flags;
5549
5550 addr += mem_off;
5551
5552 /*
5553 * User-only currently always issues with TBI. See the comment
5554 * above useronly_clean_ptr. Usually we clean this top byte away
5555 * during translation, but we can't do that for e.g. vector + imm
5556 * addressing modes.
5557 *
5558 * We currently always enable TBI for user-only, and do not provide
5559 * a way to turn it off. So clean the pointer unconditionally here,
5560 * rather than look it up here, or pass it down from above.
5561 */
5562 addr = useronly_clean_ptr(addr);
5563
5564 #ifdef CONFIG_USER_ONLY
5565 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5566 &info->host, retaddr);
5567 #else
5568 CPUTLBEntryFull *full;
5569 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5570 &info->host, &full, retaddr);
5571 #endif
5572 info->flags = flags;
5573
5574 if (flags & TLB_INVALID_MASK) {
5575 g_assert(nofault);
5576 return false;
5577 }
5578
5579 #ifdef CONFIG_USER_ONLY
5580 memset(&info->attrs, 0, sizeof(info->attrs));
5581 /* Require both ANON and MTE; see allocation_tag_mem(). */
5582 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5583 #else
5584 info->attrs = full->attrs;
5585 info->tagged = full->extra.arm.pte_attrs == 0xf0;
5586 #endif
5587
5588 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5589 info->host -= mem_off;
5590 return true;
5591 }
5592
5593 /*
5594 * Find first active element on each page, and a loose bound for the
5595 * final element on each page. Identify any single element that spans
5596 * the page boundary. Return true if there are any active elements.
5597 */
5598 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5599 intptr_t reg_max, int esz, int msize)
5600 {
5601 const int esize = 1 << esz;
5602 const uint64_t pg_mask = pred_esz_masks[esz];
5603 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5604 intptr_t mem_off_last, mem_off_split;
5605 intptr_t page_split, elt_split;
5606 intptr_t i;
5607
5608 /* Set all of the element indices to -1, and the TLB data to 0. */
5609 memset(info, -1, offsetof(SVEContLdSt, page));
5610 memset(info->page, 0, sizeof(info->page));
5611
5612 /* Gross scan over the entire predicate to find bounds. */
5613 i = 0;
5614 do {
5615 uint64_t pg = vg[i] & pg_mask;
5616 if (pg) {
5617 reg_off_last = i * 64 + 63 - clz64(pg);
5618 if (reg_off_first < 0) {
5619 reg_off_first = i * 64 + ctz64(pg);
5620 }
5621 }
5622 } while (++i * 64 < reg_max);
5623
5624 if (unlikely(reg_off_first < 0)) {
5625 /* No active elements, no pages touched. */
5626 return false;
5627 }
5628 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5629
5630 info->reg_off_first[0] = reg_off_first;
5631 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5632 mem_off_last = (reg_off_last >> esz) * msize;
5633
5634 page_split = -(addr | TARGET_PAGE_MASK);
5635 if (likely(mem_off_last + msize <= page_split)) {
5636 /* The entire operation fits within a single page. */
5637 info->reg_off_last[0] = reg_off_last;
5638 return true;
5639 }
5640
5641 info->page_split = page_split;
5642 elt_split = page_split / msize;
5643 reg_off_split = elt_split << esz;
5644 mem_off_split = elt_split * msize;
5645
5646 /*
5647 * This is the last full element on the first page, but it is not
5648 * necessarily active. If there is no full element, i.e. the first
5649 * active element is the one that's split, this value remains -1.
5650  * It is useful as an iteration bound.
5651 */
5652 if (elt_split != 0) {
5653 info->reg_off_last[0] = reg_off_split - esize;
5654 }
5655
5656 /* Determine if an unaligned element spans the pages. */
5657 if (page_split % msize != 0) {
5658 /* It is helpful to know if the split element is active. */
5659 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5660 info->reg_off_split = reg_off_split;
5661 info->mem_off_split = mem_off_split;
5662
5663 if (reg_off_split == reg_off_last) {
5664 /* The page crossing element is last. */
5665 return true;
5666 }
5667 }
5668 reg_off_split += esize;
5669 mem_off_split += msize;
5670 }
5671
5672 /*
5673 * We do want the first active element on the second page, because
5674 * this may affect the address reported in an exception.
5675 */
5676 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5677 tcg_debug_assert(reg_off_split <= reg_off_last);
5678 info->reg_off_first[1] = reg_off_split;
5679 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5680 info->reg_off_last[1] = reg_off_last;
5681 return true;
5682 }
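
/*
 * Worked example of the split logic above, assuming 4K pages and a 4-byte
 * msize: with addr ending in 0xffa, page_split == 6 and elt_split == 1,
 * so element 0 (page offsets 0xffa..0xffd) is the last full element on the
 * first page.  Since page_split % msize == 2, element 1 (page offsets
 * 0xffe..0x1001) straddles the boundary and, if active, is recorded in
 * reg_off_split/mem_off_split; any further elements begin on the second
 * page.
 */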
5683
5684 /*
5685 * Resolve the guest virtual addresses to info->page[].
5686 * Control the generation of page faults with @fault. Return false if
5687 * there is no work to do, which can only happen with @fault == FAULT_NO.
5688 */
5689 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5690 CPUARMState *env, target_ulong addr,
5691 MMUAccessType access_type, uintptr_t retaddr)
5692 {
5693 int mmu_idx = arm_env_mmu_index(env);
5694 int mem_off = info->mem_off_first[0];
5695 bool nofault = fault == FAULT_NO;
5696 bool have_work = true;
5697
5698 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5699 access_type, mmu_idx, retaddr)) {
5700 /* No work to be done. */
5701 return false;
5702 }
5703
5704 if (likely(info->page_split < 0)) {
5705 /* The entire operation was on the one page. */
5706 return true;
5707 }
5708
5709 /*
5710 * If the second page is invalid, then we want the fault address to be
5711 * the first byte on that page which is accessed.
5712 */
5713 if (info->mem_off_split >= 0) {
5714 /*
5715 * There is an element split across the pages. The fault address
5716 * should be the first byte of the second page.
5717 */
5718 mem_off = info->page_split;
5719 /*
5720 * If the split element is also the first active element
5721 * of the vector, then: For first-fault we should continue
5722 * to generate faults for the second page. For no-fault,
5723 * we have work only if the second page is valid.
5724 */
5725 if (info->mem_off_first[0] < info->mem_off_split) {
5726 nofault = FAULT_FIRST;
5727 have_work = false;
5728 }
5729 } else {
5730 /*
5731 * There is no element split across the pages. The fault address
5732 * should be the first active element on the second page.
5733 */
5734 mem_off = info->mem_off_first[1];
5735 /*
5736 * There must have been one active element on the first page,
5737 * so we're out of first-fault territory.
5738 */
5739 nofault = fault != FAULT_ALL;
5740 }
5741
5742 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5743 access_type, mmu_idx, retaddr);
5744 return have_work;
5745 }
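
/*
 * To summarize the fault modes used by the contiguous load/store helpers:
 * FAULT_ALL is a normal load or store, where any active element may fault;
 * FAULT_FIRST is a first-fault load, where only the first active element
 * may fault and later faulting elements merely have their FFR bits
 * cleared; FAULT_NO is a no-fault load, where no element is allowed to
 * fault at all.
 */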
5746
5747 #ifndef CONFIG_USER_ONLY
5748 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5749 uint64_t *vg, target_ulong addr,
5750 int esize, int msize, int wp_access,
5751 uintptr_t retaddr)
5752 {
5753 intptr_t mem_off, reg_off, reg_last;
5754 int flags0 = info->page[0].flags;
5755 int flags1 = info->page[1].flags;
5756
5757 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5758 return;
5759 }
5760
5761 /* Indicate that watchpoints are handled. */
5762 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5763 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5764
5765 if (flags0 & TLB_WATCHPOINT) {
5766 mem_off = info->mem_off_first[0];
5767 reg_off = info->reg_off_first[0];
5768 reg_last = info->reg_off_last[0];
5769
5770 while (reg_off <= reg_last) {
5771 uint64_t pg = vg[reg_off >> 6];
5772 do {
5773 if ((pg >> (reg_off & 63)) & 1) {
5774 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5775 msize, info->page[0].attrs,
5776 wp_access, retaddr);
5777 }
5778 reg_off += esize;
5779 mem_off += msize;
5780 } while (reg_off <= reg_last && (reg_off & 63));
5781 }
5782 }
5783
5784 mem_off = info->mem_off_split;
5785 if (mem_off >= 0) {
5786 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5787 info->page[0].attrs, wp_access, retaddr);
5788 }
5789
5790 mem_off = info->mem_off_first[1];
5791 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5792 reg_off = info->reg_off_first[1];
5793 reg_last = info->reg_off_last[1];
5794
5795 do {
5796 uint64_t pg = vg[reg_off >> 6];
5797 do {
5798 if ((pg >> (reg_off & 63)) & 1) {
5799 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5800 msize, info->page[1].attrs,
5801 wp_access, retaddr);
5802 }
5803 reg_off += esize;
5804 mem_off += msize;
5805 } while (reg_off & 63);
5806 } while (reg_off <= reg_last);
5807 }
5808 }
5809 #endif
5810
5811 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5812 uint64_t *vg, target_ulong addr, int esize,
5813 int msize, uint32_t mtedesc, uintptr_t ra)
5814 {
5815 intptr_t mem_off, reg_off, reg_last;
5816
5817 /* Process the page only if MemAttr == Tagged. */
5818 if (info->page[0].tagged) {
5819 mem_off = info->mem_off_first[0];
5820 reg_off = info->reg_off_first[0];
5821 reg_last = info->reg_off_split;
5822 if (reg_last < 0) {
5823 reg_last = info->reg_off_last[0];
5824 }
5825
5826 do {
5827 uint64_t pg = vg[reg_off >> 6];
5828 do {
5829 if ((pg >> (reg_off & 63)) & 1) {
5830 mte_check(env, mtedesc, addr, ra);
5831 }
5832 reg_off += esize;
5833 mem_off += msize;
5834 } while (reg_off <= reg_last && (reg_off & 63));
5835 } while (reg_off <= reg_last);
5836 }
5837
5838 mem_off = info->mem_off_first[1];
5839 if (mem_off >= 0 && info->page[1].tagged) {
5840 reg_off = info->reg_off_first[1];
5841 reg_last = info->reg_off_last[1];
5842
5843 do {
5844 uint64_t pg = vg[reg_off >> 6];
5845 do {
5846 if ((pg >> (reg_off & 63)) & 1) {
5847 mte_check(env, mtedesc, addr, ra);
5848 }
5849 reg_off += esize;
5850 mem_off += msize;
5851 } while (reg_off & 63);
5852 } while (reg_off <= reg_last);
5853 }
5854 }
5855
5856 /*
5857  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5858 */
5859 static inline QEMU_ALWAYS_INLINE
5860 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5861 uint32_t desc, const uintptr_t retaddr,
5862 const int esz, const int msz, const int N, uint32_t mtedesc,
5863 sve_ldst1_host_fn *host_fn,
5864 sve_ldst1_tlb_fn *tlb_fn)
5865 {
5866 const unsigned rd = simd_data(desc);
5867 const intptr_t reg_max = simd_oprsz(desc);
5868 intptr_t reg_off, reg_last, mem_off;
5869 SVEContLdSt info;
5870 void *host;
5871 int flags, i;
5872
5873 /* Find the active elements. */
5874 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5875 /* The entire predicate was false; no load occurs. */
5876 for (i = 0; i < N; ++i) {
5877 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5878 }
5879 return;
5880 }
5881
5882 /* Probe the page(s). Exit with exception for any invalid page. */
5883 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5884
5885 /* Handle watchpoints for all active elements. */
5886 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5887 BP_MEM_READ, retaddr);
5888
5889 /*
5890 * Handle mte checks for all active elements.
5891 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5892 */
5893 if (mtedesc) {
5894 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5895 mtedesc, retaddr);
5896 }
5897
5898 flags = info.page[0].flags | info.page[1].flags;
5899 if (unlikely(flags != 0)) {
5900 /*
5901 * At least one page includes MMIO.
5902 * Any bus operation can fail with cpu_transaction_failed,
5903 * which for ARM will raise SyncExternal. Perform the load
5904 * into scratch memory to preserve register state until the end.
5905 */
5906 ARMVectorReg scratch[4] = { };
5907
5908 mem_off = info.mem_off_first[0];
5909 reg_off = info.reg_off_first[0];
5910 reg_last = info.reg_off_last[1];
5911 if (reg_last < 0) {
5912 reg_last = info.reg_off_split;
5913 if (reg_last < 0) {
5914 reg_last = info.reg_off_last[0];
5915 }
5916 }
5917
5918 do {
5919 uint64_t pg = vg[reg_off >> 6];
5920 do {
5921 if ((pg >> (reg_off & 63)) & 1) {
5922 for (i = 0; i < N; ++i) {
5923 tlb_fn(env, &scratch[i], reg_off,
5924 addr + mem_off + (i << msz), retaddr);
5925 }
5926 }
5927 reg_off += 1 << esz;
5928 mem_off += N << msz;
5929 } while (reg_off & 63);
5930 } while (reg_off <= reg_last);
5931
5932 for (i = 0; i < N; ++i) {
5933 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5934 }
5935 return;
5936 }
5937
5938 /* The entire operation is in RAM, on valid pages. */
5939
5940 for (i = 0; i < N; ++i) {
5941 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5942 }
5943
5944 mem_off = info.mem_off_first[0];
5945 reg_off = info.reg_off_first[0];
5946 reg_last = info.reg_off_last[0];
5947 host = info.page[0].host;
5948
5949 set_helper_retaddr(retaddr);
5950
5951 while (reg_off <= reg_last) {
5952 uint64_t pg = vg[reg_off >> 6];
5953 do {
5954 if ((pg >> (reg_off & 63)) & 1) {
5955 for (i = 0; i < N; ++i) {
5956 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5957 host + mem_off + (i << msz));
5958 }
5959 }
5960 reg_off += 1 << esz;
5961 mem_off += N << msz;
5962 } while (reg_off <= reg_last && (reg_off & 63));
5963 }
5964
5965 clear_helper_retaddr();
5966
5967 /*
5968 * Use the slow path to manage the cross-page misalignment.
5969 * But we know this is RAM and cannot trap.
5970 */
5971 mem_off = info.mem_off_split;
5972 if (unlikely(mem_off >= 0)) {
5973 reg_off = info.reg_off_split;
5974 for (i = 0; i < N; ++i) {
5975 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5976 addr + mem_off + (i << msz), retaddr);
5977 }
5978 }
5979
5980 mem_off = info.mem_off_first[1];
5981 if (unlikely(mem_off >= 0)) {
5982 reg_off = info.reg_off_first[1];
5983 reg_last = info.reg_off_last[1];
5984 host = info.page[1].host;
5985
5986 set_helper_retaddr(retaddr);
5987
5988 do {
5989 uint64_t pg = vg[reg_off >> 6];
5990 do {
5991 if ((pg >> (reg_off & 63)) & 1) {
5992 for (i = 0; i < N; ++i) {
5993 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5994 host + mem_off + (i << msz));
5995 }
5996 }
5997 reg_off += 1 << esz;
5998 mem_off += N << msz;
5999 } while (reg_off & 63);
6000 } while (reg_off <= reg_last);
6001
6002 clear_helper_retaddr();
6003 }
6004 }
6005
6006 static inline QEMU_ALWAYS_INLINE
6007 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6008 uint32_t desc, const uintptr_t ra,
6009 const int esz, const int msz, const int N,
6010 sve_ldst1_host_fn *host_fn,
6011 sve_ldst1_tlb_fn *tlb_fn)
6012 {
6013 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6014 int bit55 = extract64(addr, 55, 1);
6015
6016 /* Remove mtedesc from the normal sve descriptor. */
6017 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6018
6019 /* Perform gross MTE suppression early. */
6020 if (!tbi_check(mtedesc, bit55) ||
6021 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6022 mtedesc = 0;
6023 }
6024
6025 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6026 }
6027
6028 #define DO_LD1_1(NAME, ESZ) \
6029 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
6030 target_ulong addr, uint32_t desc) \
6031 { \
6032 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
6033 sve_##NAME##_host, sve_##NAME##_tlb); \
6034 } \
6035 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
6036 target_ulong addr, uint32_t desc) \
6037 { \
6038 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
6039 sve_##NAME##_host, sve_##NAME##_tlb); \
6040 }
6041
6042 #define DO_LD1_2(NAME, ESZ, MSZ) \
6043 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
6044 target_ulong addr, uint32_t desc) \
6045 { \
6046 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6047 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6048 } \
6049 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6050 target_ulong addr, uint32_t desc) \
6051 { \
6052 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6053 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6054 } \
6055 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6056 target_ulong addr, uint32_t desc) \
6057 { \
6058 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6059 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6060 } \
6061 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6062 target_ulong addr, uint32_t desc) \
6063 { \
6064 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6065 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6066 }
6067
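/*
 * Naming convention for the instantiations below (and for the first-fault,
 * no-fault and multi-register variants later): the first size letter gives
 * the memory element size (b/h/s/d), the second the register element size,
 * and a trailing u or s marks zero- versus sign-extension when the two
 * differ.  For example, ld1bhs loads bytes sign-extended into 16-bit
 * elements, and ld1sdu loads 32-bit words zero-extended into 64-bit
 * elements.
 */
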
6068 DO_LD1_1(ld1bb, MO_8)
6069 DO_LD1_1(ld1bhu, MO_16)
6070 DO_LD1_1(ld1bhs, MO_16)
6071 DO_LD1_1(ld1bsu, MO_32)
6072 DO_LD1_1(ld1bss, MO_32)
6073 DO_LD1_1(ld1bdu, MO_64)
6074 DO_LD1_1(ld1bds, MO_64)
6075
6076 DO_LD1_2(ld1hh, MO_16, MO_16)
6077 DO_LD1_2(ld1hsu, MO_32, MO_16)
6078 DO_LD1_2(ld1hss, MO_32, MO_16)
6079 DO_LD1_2(ld1hdu, MO_64, MO_16)
6080 DO_LD1_2(ld1hds, MO_64, MO_16)
6081
6082 DO_LD1_2(ld1ss, MO_32, MO_32)
6083 DO_LD1_2(ld1sdu, MO_64, MO_32)
6084 DO_LD1_2(ld1sds, MO_64, MO_32)
6085
6086 DO_LD1_2(ld1dd, MO_64, MO_64)
6087
6088 #undef DO_LD1_1
6089 #undef DO_LD1_2
6090
6091 #define DO_LDN_1(N) \
6092 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6093 target_ulong addr, uint32_t desc) \
6094 { \
6095 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6096 sve_ld1bb_host, sve_ld1bb_tlb); \
6097 } \
6098 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6099 target_ulong addr, uint32_t desc) \
6100 { \
6101 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6102 sve_ld1bb_host, sve_ld1bb_tlb); \
6103 }
6104
6105 #define DO_LDN_2(N, SUFF, ESZ) \
6106 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6107 target_ulong addr, uint32_t desc) \
6108 { \
6109 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6110 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6111 } \
6112 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6113 target_ulong addr, uint32_t desc) \
6114 { \
6115 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6116 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6117 } \
6118 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6119 target_ulong addr, uint32_t desc) \
6120 { \
6121 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6122 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6123 } \
6124 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6125 target_ulong addr, uint32_t desc) \
6126 { \
6127 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6128 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6129 }
6130
6131 DO_LDN_1(2)
6132 DO_LDN_1(3)
6133 DO_LDN_1(4)
6134
6135 DO_LDN_2(2, hh, MO_16)
6136 DO_LDN_2(3, hh, MO_16)
6137 DO_LDN_2(4, hh, MO_16)
6138
6139 DO_LDN_2(2, ss, MO_32)
6140 DO_LDN_2(3, ss, MO_32)
6141 DO_LDN_2(4, ss, MO_32)
6142
6143 DO_LDN_2(2, dd, MO_64)
6144 DO_LDN_2(3, dd, MO_64)
6145 DO_LDN_2(4, dd, MO_64)
6146
6147 #undef DO_LDN_1
6148 #undef DO_LDN_2
6149
6150 /*
6151 * Load contiguous data, first-fault and no-fault.
6152 *
6153 * For user-only, we control the race between page_check_range and
6154 * another thread's munmap by using set/clear_helper_retaddr. Any
6155 * SEGV that occurs between those markers is assumed to be because
6156 * the guest page vanished. Keep that block as small as possible
6157 * so that unrelated QEMU bugs are not blamed on the guest.
6158 */
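
/*
 * Concretely, the pattern used by the fast paths below is
 *
 *     set_helper_retaddr(retaddr);
 *     ... direct host accesses via host_fn() ...
 *     clear_helper_retaddr();
 *
 * so that a SEGV raised inside that window is treated as a fault on the
 * guest access (unwound via the saved return address) rather than as an
 * unrelated QEMU bug.
 */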
6159
6160 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6161 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6162 * option, which leaves subsequent data unchanged.
6163 */
6164 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6165 {
6166 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6167
6168 if (i & 63) {
6169 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6170 i = ROUND_UP(i, 64);
6171 }
6172 for (; i < oprsz; i += 64) {
6173 ffr[i / 64] = 0;
6174 }
6175 }
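
/*
 * For example, with oprsz == 32 (a 256-bit vector) and a fault recorded at
 * byte offset i == 12, the first branch keeps FFR bits 0..11 and clears
 * bits 12..63 of ffr[0]; i then rounds up to 64, which is already >= oprsz,
 * so the loop zeroes no further words.
 */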
6176
6177 /*
6178 * Common helper for all contiguous no-fault and first-fault loads.
6179 */
6180 static inline QEMU_ALWAYS_INLINE
6181 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6182 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6183 const int esz, const int msz, const SVEContFault fault,
6184 sve_ldst1_host_fn *host_fn,
6185 sve_ldst1_tlb_fn *tlb_fn)
6186 {
6187 const unsigned rd = simd_data(desc);
6188 void *vd = &env->vfp.zregs[rd];
6189 const intptr_t reg_max = simd_oprsz(desc);
6190 intptr_t reg_off, mem_off, reg_last;
6191 SVEContLdSt info;
6192 int flags;
6193 void *host;
6194
6195 /* Find the active elements. */
6196 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6197 /* The entire predicate was false; no load occurs. */
6198 memset(vd, 0, reg_max);
6199 return;
6200 }
6201 reg_off = info.reg_off_first[0];
6202
6203 /* Probe the page(s). */
6204 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6205 /* Fault on first element. */
6206 tcg_debug_assert(fault == FAULT_NO);
6207 memset(vd, 0, reg_max);
6208 goto do_fault;
6209 }
6210
6211 mem_off = info.mem_off_first[0];
6212 flags = info.page[0].flags;
6213
6214 /*
6215 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6216 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6217 */
6218 if (!info.page[0].tagged) {
6219 mtedesc = 0;
6220 }
6221
6222 if (fault == FAULT_FIRST) {
6223 /* Trapping mte check for the first-fault element. */
6224 if (mtedesc) {
6225 mte_check(env, mtedesc, addr + mem_off, retaddr);
6226 }
6227
6228 /*
6229 * Special handling of the first active element,
6230 * if it crosses a page boundary or is MMIO.
6231 */
6232 bool is_split = mem_off == info.mem_off_split;
6233 if (unlikely(flags != 0) || unlikely(is_split)) {
6234 /*
6235 * Use the slow path for cross-page handling.
6236 * Might trap for MMIO or watchpoints.
6237 */
6238 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6239
6240 /* After any fault, zero the other elements. */
6241 swap_memzero(vd, reg_off);
6242 reg_off += 1 << esz;
6243 mem_off += 1 << msz;
6244 swap_memzero(vd + reg_off, reg_max - reg_off);
6245
6246 if (is_split) {
6247 goto second_page;
6248 }
6249 } else {
6250 memset(vd, 0, reg_max);
6251 }
6252 } else {
6253 memset(vd, 0, reg_max);
6254 if (unlikely(mem_off == info.mem_off_split)) {
6255 /* The first active element crosses a page boundary. */
6256 flags |= info.page[1].flags;
6257 if (unlikely(flags & TLB_MMIO)) {
6258 /* Some page is MMIO, see below. */
6259 goto do_fault;
6260 }
6261 if (unlikely(flags & TLB_WATCHPOINT) &&
6262 (cpu_watchpoint_address_matches
6263 (env_cpu(env), addr + mem_off, 1 << msz)
6264 & BP_MEM_READ)) {
6265 /* Watchpoint hit, see below. */
6266 goto do_fault;
6267 }
6268 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6269 goto do_fault;
6270 }
6271 /*
6272 * Use the slow path for cross-page handling.
6273 * This is RAM, without a watchpoint, and will not trap.
6274 */
6275 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6276 goto second_page;
6277 }
6278 }
6279
6280 /*
6281 * From this point on, all memory operations are MemSingleNF.
6282 *
6283 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6284 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6285 *
6286  * Unfortunately we do not have access to the memory attributes from the
6287 * PTE to tell Device memory from Normal memory. So we make a mostly
6288 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6289 * This gives the right answer for the common cases of "Normal memory,
6290 * backed by host RAM" and "Device memory, backed by MMIO".
6291 * The architecture allows us to suppress an NF load and return
6292 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6293 * case of "Normal memory, backed by MMIO" is permitted. The case we
6294 * get wrong is "Device memory, backed by host RAM", for which we
6295  * should return (UNKNOWN, FAULT) but do not.
6296 *
6297 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6298 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6299 * architectural breakpoints the same.
6300 */
6301 if (unlikely(flags & TLB_MMIO)) {
6302 goto do_fault;
6303 }
6304
6305 reg_last = info.reg_off_last[0];
6306 host = info.page[0].host;
6307
6308 set_helper_retaddr(retaddr);
6309
6310 do {
6311 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6312 do {
6313 if ((pg >> (reg_off & 63)) & 1) {
6314 if (unlikely(flags & TLB_WATCHPOINT) &&
6315 (cpu_watchpoint_address_matches
6316 (env_cpu(env), addr + mem_off, 1 << msz)
6317 & BP_MEM_READ)) {
6318 clear_helper_retaddr();
6319 goto do_fault;
6320 }
6321 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6322 clear_helper_retaddr();
6323 goto do_fault;
6324 }
6325 host_fn(vd, reg_off, host + mem_off);
6326 }
6327 reg_off += 1 << esz;
6328 mem_off += 1 << msz;
6329 } while (reg_off <= reg_last && (reg_off & 63));
6330 } while (reg_off <= reg_last);
6331
6332 clear_helper_retaddr();
6333
6334 /*
6335 * MemSingleNF is allowed to fail for any reason. We have special
6336 * code above to handle the first element crossing a page boundary.
6337 * As an implementation choice, decline to handle a cross-page element
6338 * in any other position.
6339 */
6340 reg_off = info.reg_off_split;
6341 if (reg_off >= 0) {
6342 goto do_fault;
6343 }
6344
6345 second_page:
6346 reg_off = info.reg_off_first[1];
6347 if (likely(reg_off < 0)) {
6348 /* No active elements on the second page. All done. */
6349 return;
6350 }
6351
6352 /*
6353 * MemSingleNF is allowed to fail for any reason. As an implementation
6354 * choice, decline to handle elements on the second page. This should
6355 * be low frequency as the guest walks through memory -- the next
6356 * iteration of the guest's loop should be aligned on the page boundary,
6357 * and then all following iterations will stay aligned.
6358 */
6359
6360 do_fault:
6361 record_fault(env, reg_off, reg_max);
6362 }
6363
6364 static inline QEMU_ALWAYS_INLINE
6365 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6366 uint32_t desc, const uintptr_t retaddr,
6367 const int esz, const int msz, const SVEContFault fault,
6368 sve_ldst1_host_fn *host_fn,
6369 sve_ldst1_tlb_fn *tlb_fn)
6370 {
6371 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6372 int bit55 = extract64(addr, 55, 1);
6373
6374 /* Remove mtedesc from the normal sve descriptor. */
6375 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6376
6377 /* Perform gross MTE suppression early. */
6378 if (!tbi_check(mtedesc, bit55) ||
6379 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6380 mtedesc = 0;
6381 }
6382
6383 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6384 esz, msz, fault, host_fn, tlb_fn);
6385 }
6386
6387 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
6388 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6389 target_ulong addr, uint32_t desc) \
6390 { \
6391 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6392 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6393 } \
6394 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6395 target_ulong addr, uint32_t desc) \
6396 { \
6397 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6398 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6399 } \
6400 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6401 target_ulong addr, uint32_t desc) \
6402 { \
6403 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6404 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6405 } \
6406 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6407 target_ulong addr, uint32_t desc) \
6408 { \
6409 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6410 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6411 }
6412
6413 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
6414 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6415 target_ulong addr, uint32_t desc) \
6416 { \
6417 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6418 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6419 } \
6420 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6421 target_ulong addr, uint32_t desc) \
6422 { \
6423 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6424 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6425 } \
6426 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6427 target_ulong addr, uint32_t desc) \
6428 { \
6429 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6430 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6431 } \
6432 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6433 target_ulong addr, uint32_t desc) \
6434 { \
6435 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
6436 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6437 } \
6438 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6439 target_ulong addr, uint32_t desc) \
6440 { \
6441 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6442 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6443 } \
6444 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6445 target_ulong addr, uint32_t desc) \
6446 { \
6447 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6448 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6449 } \
6450 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6451 target_ulong addr, uint32_t desc) \
6452 { \
6453 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6454 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6455 } \
6456 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6457 target_ulong addr, uint32_t desc) \
6458 { \
6459 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6460 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6461 }
6462
6463 DO_LDFF1_LDNF1_1(bb, MO_8)
6464 DO_LDFF1_LDNF1_1(bhu, MO_16)
6465 DO_LDFF1_LDNF1_1(bhs, MO_16)
6466 DO_LDFF1_LDNF1_1(bsu, MO_32)
6467 DO_LDFF1_LDNF1_1(bss, MO_32)
6468 DO_LDFF1_LDNF1_1(bdu, MO_64)
6469 DO_LDFF1_LDNF1_1(bds, MO_64)
6470
6471 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6472 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6473 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6474 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6475 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6476
6477 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6478 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6479 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6480
6481 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
6482
6483 #undef DO_LDFF1_LDNF1_1
6484 #undef DO_LDFF1_LDNF1_2
6485
6486 /*
6487 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6488 */
6489
6490 static inline QEMU_ALWAYS_INLINE
6491 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6492 uint32_t desc, const uintptr_t retaddr,
6493 const int esz, const int msz, const int N, uint32_t mtedesc,
6494 sve_ldst1_host_fn *host_fn,
6495 sve_ldst1_tlb_fn *tlb_fn)
6496 {
6497 const unsigned rd = simd_data(desc);
6498 const intptr_t reg_max = simd_oprsz(desc);
6499 intptr_t reg_off, reg_last, mem_off;
6500 SVEContLdSt info;
6501 void *host;
6502 int i, flags;
6503
6504 /* Find the active elements. */
6505 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6506 /* The entire predicate was false; no store occurs. */
6507 return;
6508 }
6509
6510 /* Probe the page(s). Exit with exception for any invalid page. */
6511 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6512
6513 /* Handle watchpoints for all active elements. */
6514 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6515 BP_MEM_WRITE, retaddr);
6516
6517 /*
6518 * Handle mte checks for all active elements.
6519 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6520 */
6521 if (mtedesc) {
6522 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6523 mtedesc, retaddr);
6524 }
6525
6526 flags = info.page[0].flags | info.page[1].flags;
6527 if (unlikely(flags != 0)) {
6528 /*
6529 * At least one page includes MMIO.
6530 * Any bus operation can fail with cpu_transaction_failed,
6531 * which for ARM will raise SyncExternal. We cannot avoid
6532 * this fault and will leave with the store incomplete.
6533 */
6534 mem_off = info.mem_off_first[0];
6535 reg_off = info.reg_off_first[0];
6536 reg_last = info.reg_off_last[1];
6537 if (reg_last < 0) {
6538 reg_last = info.reg_off_split;
6539 if (reg_last < 0) {
6540 reg_last = info.reg_off_last[0];
6541 }
6542 }
6543
6544 do {
6545 uint64_t pg = vg[reg_off >> 6];
6546 do {
6547 if ((pg >> (reg_off & 63)) & 1) {
6548 for (i = 0; i < N; ++i) {
6549 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6550 addr + mem_off + (i << msz), retaddr);
6551 }
6552 }
6553 reg_off += 1 << esz;
6554 mem_off += N << msz;
6555 } while (reg_off & 63);
6556 } while (reg_off <= reg_last);
6557 return;
6558 }
6559
6560 mem_off = info.mem_off_first[0];
6561 reg_off = info.reg_off_first[0];
6562 reg_last = info.reg_off_last[0];
6563 host = info.page[0].host;
6564
6565 set_helper_retaddr(retaddr);
6566
6567 while (reg_off <= reg_last) {
6568 uint64_t pg = vg[reg_off >> 6];
6569 do {
6570 if ((pg >> (reg_off & 63)) & 1) {
6571 for (i = 0; i < N; ++i) {
6572 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6573 host + mem_off + (i << msz));
6574 }
6575 }
6576 reg_off += 1 << esz;
6577 mem_off += N << msz;
6578 } while (reg_off <= reg_last && (reg_off & 63));
6579 }
6580
6581 clear_helper_retaddr();
6582
6583 /*
6584 * Use the slow path to manage the cross-page misalignment.
6585 * But we know this is RAM and cannot trap.
6586 */
6587 mem_off = info.mem_off_split;
6588 if (unlikely(mem_off >= 0)) {
6589 reg_off = info.reg_off_split;
6590 for (i = 0; i < N; ++i) {
6591 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6592 addr + mem_off + (i << msz), retaddr);
6593 }
6594 }
6595
6596 mem_off = info.mem_off_first[1];
6597 if (unlikely(mem_off >= 0)) {
6598 reg_off = info.reg_off_first[1];
6599 reg_last = info.reg_off_last[1];
6600 host = info.page[1].host;
6601
6602 set_helper_retaddr(retaddr);
6603
6604 do {
6605 uint64_t pg = vg[reg_off >> 6];
6606 do {
6607 if ((pg >> (reg_off & 63)) & 1) {
6608 for (i = 0; i < N; ++i) {
6609 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6610 host + mem_off + (i << msz));
6611 }
6612 }
6613 reg_off += 1 << esz;
6614 mem_off += N << msz;
6615 } while (reg_off & 63);
6616 } while (reg_off <= reg_last);
6617
6618 clear_helper_retaddr();
6619 }
6620 }
6621
6622 static inline QEMU_ALWAYS_INLINE
6623 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6624 uint32_t desc, const uintptr_t ra,
6625 const int esz, const int msz, const int N,
6626 sve_ldst1_host_fn *host_fn,
6627 sve_ldst1_tlb_fn *tlb_fn)
6628 {
6629 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6630 int bit55 = extract64(addr, 55, 1);
6631
6632 /* Remove mtedesc from the normal sve descriptor. */
6633 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6634
6635 /* Perform gross MTE suppression early. */
6636 if (!tbi_check(mtedesc, bit55) ||
6637 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6638 mtedesc = 0;
6639 }
6640
6641 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6642 }
6643
6644 #define DO_STN_1(N, NAME, ESZ) \
6645 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6646 target_ulong addr, uint32_t desc) \
6647 { \
6648 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
6649 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6650 } \
6651 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6652 target_ulong addr, uint32_t desc) \
6653 { \
6654 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6655 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6656 }
6657
6658 #define DO_STN_2(N, NAME, ESZ, MSZ) \
6659 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6660 target_ulong addr, uint32_t desc) \
6661 { \
6662 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6663 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6664 } \
6665 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6666 target_ulong addr, uint32_t desc) \
6667 { \
6668 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
6669 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6670 } \
6671 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6672 target_ulong addr, uint32_t desc) \
6673 { \
6674 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6675 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6676 } \
6677 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6678 target_ulong addr, uint32_t desc) \
6679 { \
6680 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6681 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
6682 }
6683
6684 DO_STN_1(1, bb, MO_8)
6685 DO_STN_1(1, bh, MO_16)
6686 DO_STN_1(1, bs, MO_32)
6687 DO_STN_1(1, bd, MO_64)
6688 DO_STN_1(2, bb, MO_8)
6689 DO_STN_1(3, bb, MO_8)
6690 DO_STN_1(4, bb, MO_8)
6691
6692 DO_STN_2(1, hh, MO_16, MO_16)
6693 DO_STN_2(1, hs, MO_32, MO_16)
6694 DO_STN_2(1, hd, MO_64, MO_16)
6695 DO_STN_2(2, hh, MO_16, MO_16)
6696 DO_STN_2(3, hh, MO_16, MO_16)
6697 DO_STN_2(4, hh, MO_16, MO_16)
6698
6699 DO_STN_2(1, ss, MO_32, MO_32)
6700 DO_STN_2(1, sd, MO_64, MO_32)
6701 DO_STN_2(2, ss, MO_32, MO_32)
6702 DO_STN_2(3, ss, MO_32, MO_32)
6703 DO_STN_2(4, ss, MO_32, MO_32)
6704
6705 DO_STN_2(1, dd, MO_64, MO_64)
6706 DO_STN_2(2, dd, MO_64, MO_64)
6707 DO_STN_2(3, dd, MO_64, MO_64)
6708 DO_STN_2(4, dd, MO_64, MO_64)
6709
6710 #undef DO_STN_1
6711 #undef DO_STN_2
6712
6713 /*
6714 * Loads with a vector index.
6715 */
6716
6717 /*
6718 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6719 */
6720 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6721
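/*
 * The offset-extraction helpers below are named for the form of the
 * index vector: zsu/zss take 32-bit unsigned/signed offsets and zd takes
 * full 64-bit offsets, while the _s/_d suffix gives the size of the
 * vector element that holds the offset.
 */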
6722 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6723 {
6724 return *(uint32_t *)(reg + H1_4(reg_ofs));
6725 }
6726
6727 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6728 {
6729 return *(int32_t *)(reg + H1_4(reg_ofs));
6730 }
6731
6732 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6733 {
6734 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6735 }
6736
6737 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6738 {
6739 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6740 }
6741
6742 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6743 {
6744 return *(uint64_t *)(reg + reg_ofs);
6745 }
6746
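/*
 * Gather loads probe and load each active element individually,
 * accumulating into a scratch register: the host fast path is used
 * where possible, and the tlb_fn slow path for MMIO or page-crossing
 * elements.  The scratch is copied to Zd only after every element has
 * been loaded, so a fault leaves the destination unmodified.
 */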
6747 static inline QEMU_ALWAYS_INLINE
6748 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6749 target_ulong base, uint32_t desc, uintptr_t retaddr,
6750 uint32_t mtedesc, int esize, int msize,
6751 zreg_off_fn *off_fn,
6752 sve_ldst1_host_fn *host_fn,
6753 sve_ldst1_tlb_fn *tlb_fn)
6754 {
6755 const int mmu_idx = arm_env_mmu_index(env);
6756 const intptr_t reg_max = simd_oprsz(desc);
6757 const int scale = simd_data(desc);
6758 ARMVectorReg scratch;
6759 intptr_t reg_off;
6760 SVEHostPage info, info2;
6761
6762 memset(&scratch, 0, reg_max);
6763 reg_off = 0;
6764 do {
6765 uint64_t pg = vg[reg_off >> 6];
6766 do {
6767 if (likely(pg & 1)) {
6768 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6769 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6770
6771 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6772 mmu_idx, retaddr);
6773
6774 if (likely(in_page >= msize)) {
6775 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6776 cpu_check_watchpoint(env_cpu(env), addr, msize,
6777 info.attrs, BP_MEM_READ, retaddr);
6778 }
6779 if (mtedesc && info.tagged) {
6780 mte_check(env, mtedesc, addr, retaddr);
6781 }
6782 if (unlikely(info.flags & TLB_MMIO)) {
6783 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6784 } else {
6785 set_helper_retaddr(retaddr);
6786 host_fn(&scratch, reg_off, info.host);
6787 clear_helper_retaddr();
6788 }
6789 } else {
6790 /* Element crosses the page boundary. */
6791 sve_probe_page(&info2, false, env, addr + in_page, 0,
6792 MMU_DATA_LOAD, mmu_idx, retaddr);
6793 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6794 cpu_check_watchpoint(env_cpu(env), addr,
6795 msize, info.attrs,
6796 BP_MEM_READ, retaddr);
6797 }
6798 if (mtedesc && info.tagged) {
6799 mte_check(env, mtedesc, addr, retaddr);
6800 }
6801 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6802 }
6803 }
6804 reg_off += esize;
6805 pg >>= esize;
6806 } while (reg_off & 63);
6807 } while (reg_off < reg_max);
6808
6809 /* Wait until all exceptions have been raised to write back. */
6810 memcpy(vd, &scratch, reg_max);
6811 }
6812
6813 static inline QEMU_ALWAYS_INLINE
6814 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6815 target_ulong base, uint32_t desc, uintptr_t retaddr,
6816 int esize, int msize, zreg_off_fn *off_fn,
6817 sve_ldst1_host_fn *host_fn,
6818 sve_ldst1_tlb_fn *tlb_fn)
6819 {
6820 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6821 /* Remove mtedesc from the normal sve descriptor. */
6822 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6823
6824 /*
6825 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6826 * offset base entirely over the address space hole to change the
6827  * pointer tag, or change the bit55 selector. So we could examine
6828  * TBI + TCMA here, as we do for sve_ldN_r_mte().
6829 */
6830 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6831 esize, msize, off_fn, host_fn, tlb_fn);
6832 }
6833
6834 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6835 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6836 void *vm, target_ulong base, uint32_t desc) \
6837 { \
6838 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6839 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6840 } \
6841 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6842 void *vm, target_ulong base, uint32_t desc) \
6843 { \
6844 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6845 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6846 }
6847
6848 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6849 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6850 void *vm, target_ulong base, uint32_t desc) \
6851 { \
6852 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6853 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6854 } \
6855 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6856 void *vm, target_ulong base, uint32_t desc) \
6857 { \
6858 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6859 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6860 }
6861
6862 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6863 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6864 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6865 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6866 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6867
6868 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6869 DO_LD1_ZPZ_S(bss, zss, MO_8)
6870 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6871 DO_LD1_ZPZ_D(bds, zss, MO_8)
6872 DO_LD1_ZPZ_D(bds, zd, MO_8)
6873
6874 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6875 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6876 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6877 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6878 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6879
6880 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6881 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6882 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6883 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6884 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6885
6886 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6887 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6888 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6889 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6890 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6891
6892 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6893 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6894 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6895 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6896 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6897
6898 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6899 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6900 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6901 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6902 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6903
6904 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6905 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6906 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6907 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6908 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6909
6910 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6911 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6912 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6913
6914 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6915 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6916 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6917
6918 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6919 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6920 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6921
6922 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6923 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6924 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6925
6926 #undef DO_LD1_ZPZ_S
6927 #undef DO_LD1_ZPZ_D
6928
6929 /* First fault loads with a vector index. */
6930
6931 /*
6932 * Common helpers for all gather first-faulting loads.
6933 */
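/*
 * In outline: the first active element is loaded with faults permitted
 * (tlb_fn may raise an exception); the remaining active elements are
 * then probed without faulting, and the first element that cannot be
 * loaded safely (page-crossing, invalid page, MMIO, a read watchpoint
 * hit, or a failed MTE probe) stops the loop and calls record_fault()
 * to note which elements were not loaded.
 */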
6934
6935 static inline QEMU_ALWAYS_INLINE
6936 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6937 target_ulong base, uint32_t desc, uintptr_t retaddr,
6938 uint32_t mtedesc, const int esz, const int msz,
6939 zreg_off_fn *off_fn,
6940 sve_ldst1_host_fn *host_fn,
6941 sve_ldst1_tlb_fn *tlb_fn)
6942 {
6943 const int mmu_idx = arm_env_mmu_index(env);
6944 const intptr_t reg_max = simd_oprsz(desc);
6945 const int scale = simd_data(desc);
6946 const int esize = 1 << esz;
6947 const int msize = 1 << msz;
6948 intptr_t reg_off;
6949 SVEHostPage info;
6950 target_ulong addr, in_page;
6951 ARMVectorReg scratch;
6952
6953 /* Skip to the first true predicate. */
6954 reg_off = find_next_active(vg, 0, reg_max, esz);
6955 if (unlikely(reg_off >= reg_max)) {
6956 /* The entire predicate was false; no load occurs. */
6957 memset(vd, 0, reg_max);
6958 return;
6959 }
6960
6961 /* Protect against overlap between vd and vm. */
6962 if (unlikely(vd == vm)) {
6963 vm = memcpy(&scratch, vm, reg_max);
6964 }
6965
6966 /*
6967 * Probe the first element, allowing faults.
6968 */
6969 addr = base + (off_fn(vm, reg_off) << scale);
6970 if (mtedesc) {
6971 mte_check(env, mtedesc, addr, retaddr);
6972 }
6973 tlb_fn(env, vd, reg_off, addr, retaddr);
6974
6975 /* After any fault, zero the other elements. */
6976 swap_memzero(vd, reg_off);
6977 reg_off += esize;
6978 swap_memzero(vd + reg_off, reg_max - reg_off);
6979
6980 /*
6981 * Probe the remaining elements, not allowing faults.
6982 */
6983 while (reg_off < reg_max) {
6984 uint64_t pg = vg[reg_off >> 6];
6985 do {
6986 if (likely((pg >> (reg_off & 63)) & 1)) {
6987 addr = base + (off_fn(vm, reg_off) << scale);
6988 in_page = -(addr | TARGET_PAGE_MASK);
6989
6990 if (unlikely(in_page < msize)) {
6991 /* Stop if the element crosses a page boundary. */
6992 goto fault;
6993 }
6994
6995 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6996 mmu_idx, retaddr);
6997 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6998 goto fault;
6999 }
7000 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7001 (cpu_watchpoint_address_matches
7002 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7003 goto fault;
7004 }
7005 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7006 goto fault;
7007 }
7008
7009 set_helper_retaddr(retaddr);
7010 host_fn(vd, reg_off, info.host);
7011 clear_helper_retaddr();
7012 }
7013 reg_off += esize;
7014 } while (reg_off & 63);
7015 }
7016 return;
7017
7018 fault:
7019 record_fault(env, reg_off, reg_max);
7020 }
7021
7022 static inline QEMU_ALWAYS_INLINE
7023 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7024 target_ulong base, uint32_t desc, uintptr_t retaddr,
7025 const int esz, const int msz,
7026 zreg_off_fn *off_fn,
7027 sve_ldst1_host_fn *host_fn,
7028 sve_ldst1_tlb_fn *tlb_fn)
7029 {
7030 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7031 /* Remove mtedesc from the normal sve descriptor. */
7032 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7033
7034 /*
7035 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7036 * offset base entirely over the address space hole to change the
7037  * pointer tag, or change the bit55 selector. So we could examine
7038  * TBI + TCMA here, as we do for sve_ldN_r_mte().
7039 */
7040 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7041 esz, msz, off_fn, host_fn, tlb_fn);
7042 }
7043
7044 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
7045 void HELPER(sve_ldff##MEM##_##OFS) \
7046 (CPUARMState *env, void *vd, void *vg, \
7047 void *vm, target_ulong base, uint32_t desc) \
7048 { \
7049 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
7050 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7051 } \
7052 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7053 (CPUARMState *env, void *vd, void *vg, \
7054 void *vm, target_ulong base, uint32_t desc) \
7055 { \
7056 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
7057 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7058 }
7059
7060 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
7061 void HELPER(sve_ldff##MEM##_##OFS) \
7062 (CPUARMState *env, void *vd, void *vg, \
7063 void *vm, target_ulong base, uint32_t desc) \
7064 { \
7065 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
7066 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7067 } \
7068 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
7069 (CPUARMState *env, void *vd, void *vg, \
7070 void *vm, target_ulong base, uint32_t desc) \
7071 { \
7072 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
7073 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7074 }
7075
7076 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7077 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7078 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7079 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7080 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7081
7082 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7083 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7084 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7085 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7086 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7087
7088 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7089 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7090 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7091 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7092 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7093
7094 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7095 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7096 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7097 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7098 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7099
7100 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7101 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7102 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7103 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7104 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7105
7106 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7107 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7108 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7109 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7110 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7111
7112 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
7113 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
7114 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7115 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7116 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7117
7118 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
7119 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
7120 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7121 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7122 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7123
7124 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7125 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7126 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7127
7128 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7129 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7130 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7131
7132 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7133 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7134 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7135
7136 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7137 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7138 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7139
7140 /* Stores with a vector index. */
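/*
 * Scatter stores are done in two passes: the first pass probes every
 * active element (so that any fault is taken before memory is modified)
 * and records usable host addresses; the second pass then performs the
 * stores, using the slow tlb_fn path only for MMIO and page-crossing
 * elements.
 */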
7141
7142 static inline QEMU_ALWAYS_INLINE
7143 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7144 target_ulong base, uint32_t desc, uintptr_t retaddr,
7145 uint32_t mtedesc, int esize, int msize,
7146 zreg_off_fn *off_fn,
7147 sve_ldst1_host_fn *host_fn,
7148 sve_ldst1_tlb_fn *tlb_fn)
7149 {
7150 const int mmu_idx = arm_env_mmu_index(env);
7151 const intptr_t reg_max = simd_oprsz(desc);
7152 const int scale = simd_data(desc);
7153 void *host[ARM_MAX_VQ * 4];
7154 intptr_t reg_off, i;
7155 SVEHostPage info, info2;
7156
7157 /*
7158 * Probe all of the elements for host addresses and flags.
7159 */
7160 i = reg_off = 0;
7161 do {
7162 uint64_t pg = vg[reg_off >> 6];
7163 do {
7164 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7165 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7166
7167 host[i] = NULL;
7168 if (likely((pg >> (reg_off & 63)) & 1)) {
7169 if (likely(in_page >= msize)) {
7170 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7171 mmu_idx, retaddr);
7172 if (!(info.flags & TLB_MMIO)) {
7173 host[i] = info.host;
7174 }
7175 } else {
7176 /*
7177 * Element crosses the page boundary.
7178 * Probe both pages, but do not record the host address,
7179 * so that we use the slow path.
7180 */
7181 sve_probe_page(&info, false, env, addr, 0,
7182 MMU_DATA_STORE, mmu_idx, retaddr);
7183 sve_probe_page(&info2, false, env, addr + in_page, 0,
7184 MMU_DATA_STORE, mmu_idx, retaddr);
7185 info.flags |= info2.flags;
7186 }
7187
7188 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7189 cpu_check_watchpoint(env_cpu(env), addr, msize,
7190 info.attrs, BP_MEM_WRITE, retaddr);
7191 }
7192
7193 if (mtedesc && info.tagged) {
7194 mte_check(env, mtedesc, addr, retaddr);
7195 }
7196 }
7197 i += 1;
7198 reg_off += esize;
7199 } while (reg_off & 63);
7200 } while (reg_off < reg_max);
7201
7202 /*
7203 * Now that we have recognized all exceptions except SyncExternal
7204 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7205 *
7206 * Note for the common case of an element in RAM, not crossing a page
7207 * boundary, we have stored the host address in host[]. This doubles
7208 * as a first-level check against the predicate, since only enabled
7209 * elements have non-null host addresses.
7210 */
7211 i = reg_off = 0;
7212 do {
7213 void *h = host[i];
7214 if (likely(h != NULL)) {
7215 set_helper_retaddr(retaddr);
7216 host_fn(vd, reg_off, h);
7217 clear_helper_retaddr();
7218 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7219 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7220 tlb_fn(env, vd, reg_off, addr, retaddr);
7221 }
7222 i += 1;
7223 reg_off += esize;
7224 } while (reg_off < reg_max);
7225 }
7226
7227 static inline QEMU_ALWAYS_INLINE
7228 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7229 target_ulong base, uint32_t desc, uintptr_t retaddr,
7230 int esize, int msize, zreg_off_fn *off_fn,
7231 sve_ldst1_host_fn *host_fn,
7232 sve_ldst1_tlb_fn *tlb_fn)
7233 {
7234 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7235 /* Remove mtedesc from the normal sve descriptor. */
7236 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7237
7238 /*
7239 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7240 * offset base entirely over the address space hole to change the
7241  * pointer tag, or change the bit55 selector. So we could examine
7242  * TBI + TCMA here, as we do for sve_ldN_r_mte().
7243 */
7244 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7245 esize, msize, off_fn, host_fn, tlb_fn);
7246 }
7247
7248 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
7249 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7250 void *vm, target_ulong base, uint32_t desc) \
7251 { \
7252 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
7253 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7254 } \
7255 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7256 void *vm, target_ulong base, uint32_t desc) \
7257 { \
7258 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
7259 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7260 }
7261
7262 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
7263 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
7264 void *vm, target_ulong base, uint32_t desc) \
7265 { \
7266 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
7267 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7268 } \
7269 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7270 void *vm, target_ulong base, uint32_t desc) \
7271 { \
7272 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
7273 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7274 }
7275
7276 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7277 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7278 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7279 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7280 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7281
7282 DO_ST1_ZPZ_S(bs, zss, MO_8)
7283 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7284 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7285 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7286 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7287
7288 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7289 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7290 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7291 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7292 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7293 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7294 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7295
7296 DO_ST1_ZPZ_D(bd, zss, MO_8)
7297 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7298 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7299 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7300 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7301 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7302 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7303
7304 DO_ST1_ZPZ_D(bd, zd, MO_8)
7305 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7306 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7307 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7308 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7309 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7310 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7311
7312 #undef DO_ST1_ZPZ_S
7313 #undef DO_ST1_ZPZ_D
7314
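/*
 * SVE2 bitwise ternary operations.  These are unpredicated and operate
 * on whole 64-bit words: EOR3 is a three-way XOR, BCAX is n ^ (m & ~k),
 * and the BSL variants select between bits of the (possibly inverted)
 * first and second operands under control of k, with NBSL inverting the
 * result of a plain bitwise select.
 */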
7315 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7316 {
7317 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7318 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7319
7320 for (i = 0; i < opr_sz; ++i) {
7321 d[i] = n[i] ^ m[i] ^ k[i];
7322 }
7323 }
7324
7325 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7326 {
7327 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7328 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7329
7330 for (i = 0; i < opr_sz; ++i) {
7331 d[i] = n[i] ^ (m[i] & ~k[i]);
7332 }
7333 }
7334
7335 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7336 {
7337 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7338 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7339
7340 for (i = 0; i < opr_sz; ++i) {
7341 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7342 }
7343 }
7344
7345 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7346 {
7347 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7348 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7349
7350 for (i = 0; i < opr_sz; ++i) {
7351 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7352 }
7353 }
7354
7355 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7356 {
7357 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7358 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7359
7360 for (i = 0; i < opr_sz; ++i) {
7361 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7362 }
7363 }
7364
7365 /*
7366 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7367 * See hasless(v,1) from
7368 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
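 *
 * For example, with esz == MO_8 and n == 0x41, a byte of m0 equal to
 * 0x41 becomes a zero byte in cmp0, and (cmp0 - ones) & ~cmp0 sets the
 * sign bit of that byte.  Borrows out of a zero byte can set further
 * sign bits, but since only the boolean result of the final AND with
 * signs is used, that does not change the answer.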
7369 */
7370 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7371 {
7372 int bits = 8 << esz;
7373 uint64_t ones = dup_const(esz, 1);
7374 uint64_t signs = ones << (bits - 1);
7375 uint64_t cmp0, cmp1;
7376
7377 cmp1 = dup_const(esz, n);
7378 cmp0 = cmp1 ^ m0;
7379 cmp1 = cmp1 ^ m1;
7380 cmp0 = (cmp0 - ones) & ~cmp0;
7381 cmp1 = (cmp1 - ones) & ~cmp1;
7382 return (cmp0 | cmp1) & signs;
7383 }
7384
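/*
 * MATCH/NMATCH: process the vectors in 128-bit segments.  Each active
 * element of Zn is compared for equality with every element of the
 * corresponding segment of Zm (held in m0/m1); the destination
 * predicate bit is set when a match exists (MATCH) or when none does
 * (NMATCH), and the PredTest flags are accumulated across segments.
 */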
7385 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7386 uint32_t desc, int esz, bool nmatch)
7387 {
7388 uint16_t esz_mask = pred_esz_masks[esz];
7389 intptr_t opr_sz = simd_oprsz(desc);
7390 uint32_t flags = PREDTEST_INIT;
7391 intptr_t i, j, k;
7392
7393 for (i = 0; i < opr_sz; i += 16) {
7394 uint64_t m0 = *(uint64_t *)(vm + i);
7395 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7396 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7397 uint16_t out = 0;
7398
7399 for (j = 0; j < 16; j += 8) {
7400 uint64_t n = *(uint64_t *)(vn + i + j);
7401
7402 for (k = 0; k < 8; k += 1 << esz) {
7403 if (pg & (1 << (j + k))) {
7404 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7405 out |= (o ^ nmatch) << (j + k);
7406 }
7407 }
7408 }
7409 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7410 flags = iter_predtest_fwd(out, pg, flags);
7411 }
7412 return flags;
7413 }
7414
7415 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7416 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7417 { \
7418 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7419 }
7420
7421 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7422 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7423
7424 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7425 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7426
7427 #undef DO_PPZZ_MATCH
7428
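/*
 * HISTCNT: for each active element i of Zn, count the active elements
 * j <= i of Zm that are equal to Zn[i]; inactive destination elements
 * are written as zero.
 */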
7429 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7430 uint32_t desc)
7431 {
7432 ARMVectorReg scratch;
7433 intptr_t i, j;
7434 intptr_t opr_sz = simd_oprsz(desc);
7435 uint32_t *d = vd, *n = vn, *m = vm;
7436 uint8_t *pg = vg;
7437
7438 if (d == n) {
7439 n = memcpy(&scratch, n, opr_sz);
7440 if (d == m) {
7441 m = n;
7442 }
7443 } else if (d == m) {
7444 m = memcpy(&scratch, m, opr_sz);
7445 }
7446
7447 for (i = 0; i < opr_sz; i += 4) {
7448 uint64_t count = 0;
7449 uint8_t pred;
7450
7451 pred = pg[H1(i >> 3)] >> (i & 7);
7452 if (pred & 1) {
7453 uint32_t nn = n[H4(i >> 2)];
7454
7455 for (j = 0; j <= i; j += 4) {
7456 pred = pg[H1(j >> 3)] >> (j & 7);
7457 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7458 ++count;
7459 }
7460 }
7461 }
7462 d[H4(i >> 2)] = count;
7463 }
7464 }
7465
7466 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7467 uint32_t desc)
7468 {
7469 ARMVectorReg scratch;
7470 intptr_t i, j;
7471 intptr_t opr_sz = simd_oprsz(desc);
7472 uint64_t *d = vd, *n = vn, *m = vm;
7473 uint8_t *pg = vg;
7474
7475 if (d == n) {
7476 n = memcpy(&scratch, n, opr_sz);
7477 if (d == m) {
7478 m = n;
7479 }
7480 } else if (d == m) {
7481 m = memcpy(&scratch, m, opr_sz);
7482 }
7483
7484 for (i = 0; i < opr_sz / 8; ++i) {
7485 uint64_t count = 0;
7486 if (pg[H1(i)] & 1) {
7487 uint64_t nn = n[i];
7488 for (j = 0; j <= i; ++j) {
7489 if ((pg[H1(j)] & 1) && nn == m[j]) {
7490 ++count;
7491 }
7492 }
7493 }
7494 d[i] = count;
7495 }
7496 }
7497
7498 /*
7499 * Returns the number of bytes in m0 and m1 that match n.
7500  * Unlike do_match2, we don't just need true/false; we need an exact count.
7501 * This requires two extra logical operations.
7502 */
7503 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7504 {
7505 const uint64_t mask = dup_const(MO_8, 0x7f);
7506 uint64_t cmp0, cmp1;
7507
7508 cmp1 = dup_const(MO_8, n);
7509 cmp0 = cmp1 ^ m0;
7510 cmp1 = cmp1 ^ m1;
7511
7512 /*
7513 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7514 * 2: carry in to msb if byte != 0 (+ mask)
7515 * 3: set msb if cmp has msb set (| cmp)
7516 * 4: set ~msb to ignore them (| mask)
7517 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7518 * 5: invert, resulting in 0x80 if and only if byte == 0.
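 *
 * For example, if n matches a byte of m0, that byte of cmp0 is 0x00 and
 * the steps above turn it into 0x80; every other byte becomes 0x00, so
 * counting set bits counts the matches exactly.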
7519 */
7520 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7521 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7522
7523 /*
7524 * Combine the two compares in a way that the bits do
7525 * not overlap, and so preserves the count of set bits.
7526 * If the host has an efficient instruction for ctpop,
7527 * then ctpop(x) + ctpop(y) has the same number of
7528 * operations as ctpop(x | (y >> 1)). If the host does
7529 * not have an efficient ctpop, then we only want to
7530 * use it once.
7531 */
7532 return ctpop64(cmp0 | (cmp1 >> 1));
7533 }
7534
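/*
 * HISTSEG: for each byte of Zn, count how many bytes of the
 * corresponding 128-bit segment of Zm are equal to it, and write that
 * count to the matching byte of Zd.
 */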
7535 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7536 {
7537 intptr_t i, j;
7538 intptr_t opr_sz = simd_oprsz(desc);
7539
7540 for (i = 0; i < opr_sz; i += 16) {
7541 uint64_t n0 = *(uint64_t *)(vn + i);
7542 uint64_t m0 = *(uint64_t *)(vm + i);
7543 uint64_t n1 = *(uint64_t *)(vn + i + 8);
7544 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7545 uint64_t out0 = 0;
7546 uint64_t out1 = 0;
7547
7548 for (j = 0; j < 64; j += 8) {
7549 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7550 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7551 out0 |= cnt0 << j;
7552 out1 |= cnt1 << j;
7553 }
7554
7555 *(uint64_t *)(vd + i) = out0;
7556 *(uint64_t *)(vd + i + 8) = out1;
7557 }
7558 }
7559
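/*
 * XAR: exclusive-OR the two inputs and rotate each element right by the
 * immediate in simd_data(desc).  For 8- and 16-bit elements the rotate
 * is synthesized from 64-bit shifts, using a mask to keep bits from
 * crossing between lanes; 32-bit elements use ror32 directly.
 */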
7560 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7561 {
7562 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7563 int shr = simd_data(desc);
7564 int shl = 8 - shr;
7565 uint64_t mask = dup_const(MO_8, 0xff >> shr);
7566 uint64_t *d = vd, *n = vn, *m = vm;
7567
7568 for (i = 0; i < opr_sz; ++i) {
7569 uint64_t t = n[i] ^ m[i];
7570 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7571 }
7572 }
7573
7574 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7575 {
7576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7577 int shr = simd_data(desc);
7578 int shl = 16 - shr;
7579 uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7580 uint64_t *d = vd, *n = vn, *m = vm;
7581
7582 for (i = 0; i < opr_sz; ++i) {
7583 uint64_t t = n[i] ^ m[i];
7584 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7585 }
7586 }
7587
7588 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7589 {
7590 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7591 int shr = simd_data(desc);
7592 uint32_t *d = vd, *n = vn, *m = vm;
7593
7594 for (i = 0; i < opr_sz; ++i) {
7595 d[i] = ror32(n[i] ^ m[i], shr);
7596 }
7597 }
7598
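/*
 * FMMLA: each group of four elements is treated as a 2x2 row-major
 * matrix, and the destination is computed as D = A + N * M^T, i.e.
 * each result element accumulates a two-term dot product of a row of N
 * with a row of M.
 */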
7599 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7600 float_status *status, uint32_t desc)
7601 {
7602 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7603
7604 for (s = 0; s < opr_sz; ++s) {
7605 float32 *n = vn + s * sizeof(float32) * 4;
7606 float32 *m = vm + s * sizeof(float32) * 4;
7607 float32 *a = va + s * sizeof(float32) * 4;
7608 float32 *d = vd + s * sizeof(float32) * 4;
7609 float32 n00 = n[H4(0)], n01 = n[H4(1)];
7610 float32 n10 = n[H4(2)], n11 = n[H4(3)];
7611 float32 m00 = m[H4(0)], m01 = m[H4(1)];
7612 float32 m10 = m[H4(2)], m11 = m[H4(3)];
7613 float32 p0, p1;
7614
7615 /* i = 0, j = 0 */
7616 p0 = float32_mul(n00, m00, status);
7617 p1 = float32_mul(n01, m01, status);
7618 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7619
7620 /* i = 0, j = 1 */
7621 p0 = float32_mul(n00, m10, status);
7622 p1 = float32_mul(n01, m11, status);
7623 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7624
7625 /* i = 1, j = 0 */
7626 p0 = float32_mul(n10, m00, status);
7627 p1 = float32_mul(n11, m01, status);
7628 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7629
7630 /* i = 1, j = 1 */
7631 p0 = float32_mul(n10, m10, status);
7632 p1 = float32_mul(n11, m11, status);
7633 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7634 }
7635 }
7636
7637 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7638 float_status *status, uint32_t desc)
7639 {
7640 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7641
7642 for (s = 0; s < opr_sz; ++s) {
7643 float64 *n = vn + s * sizeof(float64) * 4;
7644 float64 *m = vm + s * sizeof(float64) * 4;
7645 float64 *a = va + s * sizeof(float64) * 4;
7646 float64 *d = vd + s * sizeof(float64) * 4;
7647 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7648 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7649 float64 p0, p1;
7650
7651 /* i = 0, j = 0 */
7652 p0 = float64_mul(n00, m00, status);
7653 p1 = float64_mul(n01, m01, status);
7654 d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7655
7656 /* i = 0, j = 1 */
7657 p0 = float64_mul(n00, m10, status);
7658 p1 = float64_mul(n01, m11, status);
7659 d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7660
7661 /* i = 1, j = 0 */
7662 p0 = float64_mul(n10, m00, status);
7663 p1 = float64_mul(n11, m01, status);
7664 d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7665
7666 /* i = 1, j = 1 */
7667 p0 = float64_mul(n10, m10, status);
7668 p1 = float64_mul(n11, m11, status);
7669 d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7670 }
7671 }
7672
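/*
 * FCVTNT narrows each active wide element of Zn and writes the result
 * into the top half of the corresponding wide element of Zd, leaving
 * the bottom halves unchanged.  FCVTLT widens the narrow value held in
 * the top half of each active wide element of Zn into the full wide
 * element of Zd.  Both are governed per wide element.
 */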
7673 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7674 void HELPER(NAME)(void *vd, void *vn, void *vg, \
7675 float_status *status, uint32_t desc) \
7676 { \
7677 intptr_t i = simd_oprsz(desc); \
7678 uint64_t *g = vg; \
7679 do { \
7680 uint64_t pg = g[(i - 1) >> 6]; \
7681 do { \
7682 i -= sizeof(TYPEW); \
7683 if (likely((pg >> (i & 63)) & 1)) { \
7684 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7685 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7686 } \
7687 } while (i & 63); \
7688 } while (i != 0); \
7689 }
7690
7691 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7692 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7693 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7694
7695 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7696 void HELPER(NAME)(void *vd, void *vn, void *vg, \
7697 float_status *status, uint32_t desc) \
7698 { \
7699 intptr_t i = simd_oprsz(desc); \
7700 uint64_t *g = vg; \
7701 do { \
7702 uint64_t pg = g[(i - 1) >> 6]; \
7703 do { \
7704 i -= sizeof(TYPEW); \
7705 if (likely((pg >> (i & 63)) & 1)) { \
7706 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
7707 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
7708 } \
7709 } while (i & 63); \
7710 } while (i != 0); \
7711 }
7712
7713 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7714 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7715
7716 #undef DO_FCVTLT
7717 #undef DO_FCVTNT
7718