/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-temp-internal.h"
#include "tcg/tcg-op-common.h"
#include "tcg/tcg-op-gvec-common.h"
#include "tcg/tcg-gvec-desc.h"
#include "tcg-has.h"

#define MAX_UNROLL 4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules. OFS should be the OR of all
   of the operand offsets so that we can check them all at once. */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t max_align;

    switch (oprsz) {
    case 8:
    case 16:
    case 32:
        tcg_debug_assert(oprsz <= maxsz);
        break;
    default:
        tcg_debug_assert(oprsz == maxsz);
        break;
    }
    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));

    max_align = maxsz >= 16 ? 15 : 7;
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
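
/*
 * A worked illustration of the rules above (editor's sketch, not part
 * of the build):
 *
 *     check_size_align(16, 64, 0);    // ok: 8/16/32 may be < maxsz
 *     check_size_align(48, 48, 0);    // ok: other sizes require ==
 *     check_size_align(48, 64, 0);    // asserts: 48 != 64
 *     check_size_align(16, 32, 8);    // asserts: ofs & 15 != 0
 */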

/*
 * Verify vector overlap rules for two operands.
 * When dbase and abase are not the same pointer, we cannot check for
 * overlap at compile-time, but the runtime restrictions remain.
 */
static void check_overlap_2(TCGv_ptr dbase, uint32_t d,
                            TCGv_ptr abase, uint32_t a, uint32_t s)
{
    tcg_debug_assert(dbase != abase || d == a || d + s <= a || a + s <= d);
}
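
/*
 * For instance (illustrative only): with dbase == abase, d == a is an
 * in-place operation and is accepted, as are fully disjoint operands
 * with d + s <= a or a + s <= d; a partial overlap such as
 * d == a + s / 2 trips the assertion.
 */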

/* Verify vector overlap rules for three operands. */
static void check_overlap_3(TCGv_ptr dbase, uint32_t d,
                            TCGv_ptr abase, uint32_t a,
                            TCGv_ptr bbase, uint32_t b, uint32_t s)
{
    check_overlap_2(dbase, d, abase, a, s);
    check_overlap_2(dbase, d, bbase, b, s);
    check_overlap_2(abase, a, bbase, b, s);
}

/* Verify vector overlap rules for four operands. */
static void check_overlap_4(TCGv_ptr dbase, uint32_t d,
                            TCGv_ptr abase, uint32_t a,
                            TCGv_ptr bbase, uint32_t b,
                            TCGv_ptr cbase, uint32_t c, uint32_t s)
{
    check_overlap_2(dbase, d, abase, a, s);
    check_overlap_2(dbase, d, bbase, b, s);
    check_overlap_2(dbase, d, cbase, c, s);
    check_overlap_2(abase, a, bbase, b, s);
    check_overlap_2(abase, a, cbase, c, s);
    check_overlap_2(bbase, b, cbase, c, s);
}

/* Create a descriptor from components. */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    check_size_align(oprsz, maxsz, 0);

    /*
     * We want to check that 'data' will fit into SIMD_DATA_BITS.
     * However, some callers want to treat the data as a signed
     * value (which they can later get back with simd_data())
     * and some want to treat it as an unsigned value.
     * So here we assert only that the data will fit into the
     * field in at least one way. This means that some invalid
     * values from the caller will not be detected, e.g. if the
     * caller wants to handle the value as a signed integer but
     * incorrectly passes us 1 << (SIMD_DATA_BITS - 1).
     */
    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) ||
                     data == extract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;

    /*
     * We have just asserted in check_size_align that either
     * oprsz is {8,16,32} or matches maxsz. Encode the final
     * case with '2', as that would otherwise map to 24.
     */
    if (oprsz == maxsz) {
        oprsz = 2;
    }

    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
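
/*
 * Encoding example (editor's sketch; the values follow directly from
 * the arithmetic above): simd_desc(16, 64, 5) stores 16 / 8 - 1 = 1 in
 * the oprsz field and 64 / 8 - 1 = 7 in the maxsz field. A full-width
 * simd_desc(48, 48, 0) computes 5 for both fields, so the oprsz field
 * is rewritten to the otherwise-unused value 2.
 */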

/* Generate a call to a gvec-style helper with two vector operands. */
static void expand_2_ool(TCGv_ptr dbase, uint32_t dofs,
                         TCGv_ptr abase, uint32_t aofs,
                         uint32_t oprsz, uint32_t maxsz,
                         int32_t data, gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, dbase, dofs);
    tcg_gen_addi_ptr(a1, abase, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    expand_2_ool(tcg_env, dofs, tcg_env, aofs, oprsz, maxsz, data, fn);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand. */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands. */
static void expand_3_ool(TCGv_ptr dbase, uint32_t dofs,
                         TCGv_ptr abase, uint32_t aofs,
                         TCGv_ptr bbase, uint32_t bofs,
                         uint32_t oprsz, uint32_t maxsz,
                         int32_t data, gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, dbase, dofs);
    tcg_gen_addi_ptr(a1, abase, aofs);
    tcg_gen_addi_ptr(a2, bbase, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    expand_3_ool(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
                 oprsz, maxsz, data, fn);
}

/* Generate a call to a gvec-style helper with four vector operands. */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}

/* Generate a call to a gvec-style helper with five vector operands. */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);
    tcg_gen_addi_ptr(a4, tcg_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}

/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);
    tcg_gen_addi_ptr(a4, tcg_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ. This limits the expansion of inline code. */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}
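
/*
 * Worked example (illustrative only):
 *
 *     check_size_impl(80, 32);   // q = 2, r = 16 -> q = 2 + ctpop32(16) = 3
 *     check_size_impl(80, 8);    // q = 10, r = 0 -> q > MAX_UNROLL
 *
 * so 80 bytes are acceptable in 32-byte units (2x32 plus a 16-byte
 * tail) but not as ten 8-byte operations.
 */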

static void expand_clr(TCGv_ptr dbase, uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
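
/*
 * For example (illustrative): dup_const(MO_16, 0xdead1234) truncates
 * the input to 0x1234 and returns 0x1234123412341234, while
 * dup_const(MO_32, 1) returns 0x0000000100000001.
 */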

/* Duplicate IN into OUT as per VECE. */
void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}
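
/*
 * E.g. (illustrative) for vece == MO_16 the deposit above replicates
 * the low half: in == 0x....abcd becomes out == 0xabcdabcd.
 */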

void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes. If OP is 0, assume that the real operation to be performed is
 * required by all backends. Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type. Do not select V64 if
 * PREFER_I64 is true. Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /*
     * Recall that ARM SVE allows vector sizes that are not a
     * power of 2, but always a multiple of 16. The intent is
     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
     * It is hard to imagine a case in which v256 is supported
     * but v128 is not, but check anyway.
     * In addition, expand_clr needs to handle a multiple of 8.
     */
    if (TCG_TARGET_HAS_v256 &&
        check_size_impl(size, 32) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
        (!(size & 16) ||
         (TCG_TARGET_HAS_v128 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V256;
    }
    if (TCG_TARGET_HAS_v128 &&
        check_size_impl(size, 16) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}
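
/*
 * Worked example (editor's sketch): size == 80 with v256, v128 and v64
 * all supported yields TCG_TYPE_V256, expanded later as 2x32 + 1x16;
 * the size & 16 clause is what demands v128 for that tail. Without
 * v256 the same size returns 0, since 5x16 or 10x8 operations would
 * exceed MAX_UNROLL.
 */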

static void do_dup_store(TCGType type, TCGv_ptr dbase, uint32_t dofs,
                         uint32_t oprsz, uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    tcg_debug_assert(oprsz >= 8);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64. The first 8 bytes of this store
     * are misaligned wrt the maximum vector size, so do that first.
     */
    if (dofs & 8) {
        tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V64);
        i += 8;
    }

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
    }
}
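
/*
 * Illustration (not part of the build): clearing the tail of an
 * oprsz == 8, maxsz == 64 operation arrives here with dofs % 16 == 8
 * and a 56-byte region; with type == TCG_TYPE_V256 this emits one V64
 * store for the leading 8 bytes, then one V256 store and one V128
 * store for the rest.
 */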

/*
 * Set OPRSZ bytes at DBASE + DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
                   uint32_t oprsz, uint32_t maxsz,
                   TCGv_i32 in_32, TCGv_i64 in_64, uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz. */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dbase, dofs, oprsz, maxsz, t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large". */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input. For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough. */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_ebb_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                tcg_gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_ebb_new_i32();
                tcg_gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input. */
            t_64 = tcg_temp_ebb_new_i64();
            tcg_gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input. */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required. */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above. */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, dbase, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, dbase, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line. */
    t_ptr = tcg_temp_ebb_new_ptr();
    tcg_gen_addi_ptr(t_ptr, dbase, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64. The size of the clear is misaligned
     * wrt simd_desc and will assert. Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_constant_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            t_val = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            t_32 = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
    }
}
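
/*
 * The decision order above, illustrated (editor's sketch): storing
 * constant 0 with oprsz == 16, maxsz == 32 first widens to
 * oprsz = maxsz = 32 with vece = MO_8; a host with 128-bit vectors
 * then takes the do_dup_store path, a 64-bit host without vector
 * support stores tcg_constant_i64(0) four times inline, and only
 * when both fail does the expansion go out of line.
 */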

/* Likewise, but with zero. */
static void expand_clr(TCGv_ptr dbase, uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dbase, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements. */
static void expand_2_i32(TCGv_ptr dbase, uint32_t dofs, TCGv_ptr abase,
                         uint32_t aofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, abase, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, dbase, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i32(t1, dbase, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, tcg_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements. */
static void expand_3_i32(TCGv_ptr dbase, uint32_t dofs,
                         TCGv_ptr abase, uint32_t aofs,
                         TCGv_ptr bbase, uint32_t bofs,
                         uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, abase, aofs + i);
        tcg_gen_ld_i32(t1, bbase, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, dbase, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, dbase, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        tcg_gen_ld_i32(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t0, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, tcg_env, aofs + i);
        tcg_gen_ld_i32(t2, tcg_env, bofs + i);
        tcg_gen_ld_i32(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t cofs, uint32_t oprsz, int32_t c,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
                                      int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, tcg_env, aofs + i);
        tcg_gen_ld_i32(t2, tcg_env, bofs + i);
        tcg_gen_ld_i32(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3, c);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements. */
static void expand_2_i64(TCGv_ptr dbase, uint32_t dofs, TCGv_ptr abase,
                         uint32_t aofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, abase, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, dbase, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i64(t1, dbase, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, tcg_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements. */
static void expand_3_i64(TCGv_ptr dbase, uint32_t dofs,
                         TCGv_ptr abase, uint32_t aofs,
                         TCGv_ptr bbase, uint32_t bofs,
                         uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, abase, aofs + i);
        tcg_gen_ld_i64(t1, bbase, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, dbase, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, dbase, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        tcg_gen_ld_i64(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t0, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements. */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, tcg_env, aofs + i);
        tcg_gen_ld_i64(t2, tcg_env, bofs + i);
        tcg_gen_ld_i64(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t cofs, uint32_t oprsz, int64_t c,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
                                      int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, tcg_env, aofs + i);
        tcg_gen_ld_i64(t2, tcg_env, bofs + i);
        tcg_gen_ld_i64(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3, c);
        tcg_gen_st_i64(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors. */
static void expand_2_vec(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
                         TCGv_ptr abase, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, abase, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, dbase, dofs + i);
        }
        fni(vece, t1, t0);
        tcg_gen_st_vec(t1, dbase, dofs + i);
    }
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors. */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, tcg_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, tcg_env, dofs + i);
    }
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, tcg_env, dofs + i);
    }
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors. */
static void expand_3_vec(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
                         TCGv_ptr abase, uint32_t aofs,
                         TCGv_ptr bbase, uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, abase, aofs + i);
        tcg_gen_ld_vec(t1, bbase, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, dbase, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, dbase, dofs + i);
    }
}

/*
 * Expand OPRSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        tcg_gen_ld_vec(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, tcg_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t0, tcg_env, aofs + i);
        }
    }
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors. */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t1, tcg_env, aofs + i);
        tcg_gen_ld_vec(t2, tcg_env, bofs + i);
        tcg_gen_ld_vec(t3, tcg_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, tcg_env, aofs + i);
        }
    }
}

/*
 * Expand OPRSZ bytes worth of four-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                          uint32_t tysz, TCGType type, int64_t c,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                      TCGv_vec, TCGv_vec, int64_t))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t1, tcg_env, aofs + i);
        tcg_gen_ld_vec(t2, tcg_env, bofs + i);
        tcg_gen_ld_vec(t3, tcg_env, cofs + i);
        fni(vece, t0, t1, t2, t3, c);
        tcg_gen_st_vec(t0, tcg_env, dofs + i);
    }
}

/* Expand a vector two-operand operation. */
void tcg_gen_gvec_2_var(TCGv_ptr dbase, uint32_t dofs,
                        TCGv_ptr abase, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dbase, dofs, abase, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dbase, dofs, abase, aofs, some, 32,
                     TCG_TYPE_V256, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dbase, dofs, abase, aofs, oprsz, 16,
                     TCG_TYPE_V128, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dbase, dofs, abase, aofs, oprsz, 8,
                     TCG_TYPE_V64, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dbase, dofs, abase, aofs,
                         oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dbase, dofs, abase, aofs,
                         oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            expand_2_ool(dbase, dofs, abase, aofs,
                         oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    tcg_gen_gvec_2_var(tcg_env, dofs, tcg_env, aofs, oprsz, maxsz, g);
}
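
/*
 * A typical use, sketched for illustration (the callbacks mirror the
 * gvec_mov pattern below and the "not" helpers elsewhere in tcg; no
 * particular front end is implied):
 *
 *     static const GVecGen2 g = {
 *         .fni8 = tcg_gen_not_i64,       // integer fallback, 8 bytes
 *         .fniv = tcg_gen_not_vec,       // host-vector implementation
 *         .fno = gen_helper_gvec_not,    // out-of-line helper
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
 */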

/* Expand a vector operation with two vectors and an immediate. */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_constant_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar. */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16. The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        tcg_gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        tcg_gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation. */
void tcg_gen_gvec_3_var(TCGv_ptr dbase, uint32_t dofs,
                        TCGv_ptr abase, uint32_t aofs,
                        TCGv_ptr bbase, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dbase, dofs, abase, aofs, bbase, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs,
                     some, 32, TCG_TYPE_V256, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs,
                     oprsz, 16, TCG_TYPE_V128, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs,
                     oprsz, 8, TCG_TYPE_V64, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dbase, dofs, abase, aofs, bbase, bofs,
                         oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dbase, dofs, abase, aofs, bbase, bofs,
                         oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            expand_3_ool(dbase, dofs, abase, aofs, bbase, bofs,
                         oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    tcg_gen_gvec_3_var(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
                       oprsz, maxsz, g);
}

/* Expand a vector operation with three vectors and an immediate. */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c,
                          g->load_dest, g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c,
                          g->load_dest, g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation. */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(tcg_env, dofs, tcg_env, aofs,
                    tcg_env, bofs, tcg_env, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}
1669
1670 /* Expand a vector four-operand operation, with an immediate operand. */
1671 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1672 uint32_t oprsz, uint32_t maxsz, int64_t c,
1673 const GVecGen4i *g)
1674 {
1675 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1676 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1677 TCGType type;
1678 uint32_t some;
1679
1680 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1681 check_overlap_4(tcg_env, dofs, tcg_env, aofs,
1682 tcg_env, bofs, tcg_env, cofs, maxsz);
1683
1684 type = 0;
1685 if (g->fniv) {
1686 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1687 }
1688 switch (type) {
1689 case TCG_TYPE_V256:
1690 /*
1691 * Recall that ARM SVE allows vector sizes that are not a
1692 * power of 2, but always a multiple of 16. The intent is
1693 * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1694 */
1695 some = QEMU_ALIGN_DOWN(oprsz, 32);
1696 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1697 32, TCG_TYPE_V256, c, g->fniv);
1698 if (some == oprsz) {
1699 break;
1700 }
1701 dofs += some;
1702 aofs += some;
1703 bofs += some;
1704 cofs += some;
1705 oprsz -= some;
1706 maxsz -= some;
1707 /* fallthru */
1708 case TCG_TYPE_V128:
1709 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1710 16, TCG_TYPE_V128, c, g->fniv);
1711 break;
1712 case TCG_TYPE_V64:
1713 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1714 8, TCG_TYPE_V64, c, g->fniv);
1715 break;
1716
1717 case 0:
1718 if (g->fni8 && check_size_impl(oprsz, 8)) {
1719 expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1720 } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1721 expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1722 } else {
1723 assert(g->fno != NULL);
1724 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1725 oprsz, maxsz, c, g->fno);
1726 oprsz = maxsz;
1727 }
1728 break;
1729
1730 default:
1731 g_assert_not_reached();
1732 }
1733 tcg_swap_vecop_list(hold_list);
1734
1735 if (oprsz < maxsz) {
1736 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1737 }
1738 }
1739
1740 /*
1741 * Expand specific vector operations.
1742 */
1743
1744 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1745 {
1746 tcg_gen_mov_vec(a, b);
1747 }
1748
1749 void tcg_gen_gvec_mov_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
1750 TCGv_ptr abase, uint32_t aofs,
1751 uint32_t oprsz, uint32_t maxsz)
1752 {
1753 static const GVecGen2 g = {
1754 .fni8 = tcg_gen_mov_i64,
1755 .fniv = vec_mov2,
1756 .fno = gen_helper_gvec_mov,
1757 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1758 };
1759
1760 if (dofs == aofs && dbase == abase) {
1761 check_size_align(oprsz, maxsz, dofs);
1762 if (oprsz < maxsz) {
1763 expand_clr(dbase, dofs + oprsz, maxsz - oprsz);
1764 }
1765 return;
1766 }
1767
1768 tcg_gen_gvec_2_var(dbase, dofs, abase, aofs, oprsz, maxsz, &g);
1769 }
1770
1771 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1772 uint32_t oprsz, uint32_t maxsz)
1773 {
1774 tcg_gen_gvec_mov_var(vece, tcg_env, dofs, tcg_env, aofs, oprsz, maxsz);
1775 }
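
/*
 * Usage sketch (illustrative only): copy one 16-byte vector register to
 * another, both addressed as byte offsets into CPUArchState.  The
 * element size does not affect a plain copy:
 *
 *     tcg_gen_gvec_mov(MO_64, dofs, aofs, 16, 16);
 */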
1776
1777 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1778 uint32_t maxsz, TCGv_i32 in)
1779 {
1780 check_size_align(oprsz, maxsz, dofs);
1781 tcg_debug_assert(vece <= MO_32);
1782 do_dup(vece, tcg_env, dofs, oprsz, maxsz, in, NULL, 0);
1783 }
1784
1785 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1786 uint32_t maxsz, TCGv_i64 in)
1787 {
1788 check_size_align(oprsz, maxsz, dofs);
1789 tcg_debug_assert(vece <= MO_64);
1790 do_dup(vece, tcg_env, dofs, oprsz, maxsz, NULL, in, 0);
1791 }
1792
1793 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1794 uint32_t oprsz, uint32_t maxsz)
1795 {
1796 check_size_align(oprsz, maxsz, dofs);
1797 if (vece <= MO_64) {
1798 TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1799 if (type != 0) {
1800 TCGv_vec t_vec = tcg_temp_new_vec(type);
1801 tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
1802 do_dup_store(type, tcg_env, dofs, oprsz, maxsz, t_vec);
1803 } else if (vece <= MO_32) {
1804 TCGv_i32 in = tcg_temp_ebb_new_i32();
1805 switch (vece) {
1806 case MO_8:
1807 tcg_gen_ld8u_i32(in, tcg_env, aofs);
1808 break;
1809 case MO_16:
1810 tcg_gen_ld16u_i32(in, tcg_env, aofs);
1811 break;
1812 default:
1813 tcg_gen_ld_i32(in, tcg_env, aofs);
1814 break;
1815 }
1816 do_dup(vece, tcg_env, dofs, oprsz, maxsz, in, NULL, 0);
1817 tcg_temp_free_i32(in);
1818 } else {
1819 TCGv_i64 in = tcg_temp_ebb_new_i64();
1820 tcg_gen_ld_i64(in, tcg_env, aofs);
1821 do_dup(vece, tcg_env, dofs, oprsz, maxsz, NULL, in, 0);
1822 tcg_temp_free_i64(in);
1823 }
1824 } else if (vece == 4) {
1825 /* 128-bit duplicate. */
1826 int i;
1827
1828 tcg_debug_assert(oprsz >= 16);
1829 if (TCG_TARGET_HAS_v128) {
1830 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1831
1832 tcg_gen_ld_vec(in, tcg_env, aofs);
1833 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1834 tcg_gen_st_vec(in, tcg_env, dofs + i);
1835 }
1836 } else {
1837 TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1838 TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1839
1840 tcg_gen_ld_i64(in0, tcg_env, aofs);
1841 tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
1842 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1843 tcg_gen_st_i64(in0, tcg_env, dofs + i);
1844 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
1845 }
1846 tcg_temp_free_i64(in0);
1847 tcg_temp_free_i64(in1);
1848 }
1849 if (oprsz < maxsz) {
1850 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1851 }
1852 } else if (vece == 5) {
1853 /* 256-bit duplicate. */
1854 int i;
1855
1856 tcg_debug_assert(oprsz >= 32);
1857 tcg_debug_assert(oprsz % 32 == 0);
1858 if (TCG_TARGET_HAS_v256) {
1859 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1860
1861 tcg_gen_ld_vec(in, tcg_env, aofs);
1862 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1863 tcg_gen_st_vec(in, tcg_env, dofs + i);
1864 }
1865 } else if (TCG_TARGET_HAS_v128) {
1866 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1867 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1868
1869 tcg_gen_ld_vec(in0, tcg_env, aofs);
1870 tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
1871 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1872 tcg_gen_st_vec(in0, tcg_env, dofs + i);
1873 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
1874 }
1875 } else {
1876 TCGv_i64 in[4];
1877 int j;
1878
1879 for (j = 0; j < 4; ++j) {
1880 in[j] = tcg_temp_ebb_new_i64();
1881 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
1882 }
1883 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1884 for (j = 0; j < 4; ++j) {
1885 tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
1886 }
1887 }
1888 for (j = 0; j < 4; ++j) {
1889 tcg_temp_free_i64(in[j]);
1890 }
1891 }
1892 if (oprsz < maxsz) {
1893 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
1894 }
1895 } else {
1896 g_assert_not_reached();
1897 }
1898 }
1899
1900 void tcg_gen_gvec_dup_imm_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
1901 uint32_t oprsz, uint32_t maxsz, uint64_t x)
1902 {
1903 check_size_align(oprsz, maxsz, dofs);
1904 do_dup(vece, dbase, dofs, oprsz, maxsz, NULL, NULL, x);
1905 }
1906
1907 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1908 uint32_t maxsz, uint64_t x)
1909 {
1910 tcg_gen_gvec_dup_imm_var(vece, tcg_env, dofs, oprsz, maxsz, x);
1911 }
1912
1913 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1914 uint32_t oprsz, uint32_t maxsz)
1915 {
1916 static const GVecGen2 g = {
1917 .fni8 = tcg_gen_not_i64,
1918 .fniv = tcg_gen_not_vec,
1919 .fno = gen_helper_gvec_not,
1920 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1921 };
1922 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1923 }
1924
1925 /* Perform a vector addition using normal addition and a mask. The mask
1926 should be the sign bit of each lane. This 6-operation form is more
1927 efficient than separate additions when there are 4 or more lanes in
1928 the 64-bit operation. */
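/*
 * Worked example (added commentary): for MO_8 the mask is 0x80 in each
 * byte.  Clearing bit 7 of both inputs lets the low 7 bits add without
 * carrying across lanes; the true bit 7 of each lane's sum is then
 * (a ^ b) & m, xored back in.  E.g. lanes a = 0xff, b = 0x01 give
 * t1 = 0x7f, t2 = 0x01, d = 0x80, and d ^ 0x80 = 0x00 as expected.
 */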
1929 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1930 {
1931 TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1932 TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1933 TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1934
1935 tcg_gen_andc_i64(t1, a, m);
1936 tcg_gen_andc_i64(t2, b, m);
1937 tcg_gen_xor_i64(t3, a, b);
1938 tcg_gen_add_i64(d, t1, t2);
1939 tcg_gen_and_i64(t3, t3, m);
1940 tcg_gen_xor_i64(d, d, t3);
1941
1942 tcg_temp_free_i64(t1);
1943 tcg_temp_free_i64(t2);
1944 tcg_temp_free_i64(t3);
1945 }
1946
1947 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1948 {
1949 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1950 gen_addv_mask(d, a, b, m);
1951 }
1952
1953 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1954 {
1955 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1956 TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1957 TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1958 TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1959
1960 tcg_gen_andc_i32(t1, a, m);
1961 tcg_gen_andc_i32(t2, b, m);
1962 tcg_gen_xor_i32(t3, a, b);
1963 tcg_gen_add_i32(d, t1, t2);
1964 tcg_gen_and_i32(t3, t3, m);
1965 tcg_gen_xor_i32(d, d, t3);
1966
1967 tcg_temp_free_i32(t1);
1968 tcg_temp_free_i32(t2);
1969 tcg_temp_free_i32(t3);
1970 }
1971
1972 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1973 {
1974 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1975 gen_addv_mask(d, a, b, m);
1976 }
1977
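/*
 * Added note: the two-lane forms below compute the high lane in t1
 * (low bits garbage, but no carry can enter from the zeroed low half)
 * and the low lane in t2 (high bits possibly corrupted by carry), then
 * keep the valid half of each with a deposit.
 */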
1978 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1979 {
1980 TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1981 TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1982
1983 tcg_gen_andi_i32(t1, a, ~0xffff);
1984 tcg_gen_add_i32(t2, a, b);
1985 tcg_gen_add_i32(t1, t1, b);
1986 tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1987
1988 tcg_temp_free_i32(t1);
1989 tcg_temp_free_i32(t2);
1990 }
1991
1992 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1993 {
1994 TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1995 TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1996
1997 tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1998 tcg_gen_add_i64(t2, a, b);
1999 tcg_gen_add_i64(t1, t1, b);
2000 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2001
2002 tcg_temp_free_i64(t1);
2003 tcg_temp_free_i64(t2);
2004 }
2005
2006 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
2007
2008 void tcg_gen_gvec_add_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
2009 TCGv_ptr abase, uint32_t aofs,
2010 TCGv_ptr bbase, uint32_t bofs,
2011 uint32_t oprsz, uint32_t maxsz)
2012 {
2013 static const GVecGen3 g[4] = {
2014 { .fni8 = tcg_gen_vec_add8_i64,
2015 .fniv = tcg_gen_add_vec,
2016 .fno = gen_helper_gvec_add8,
2017 .opt_opc = vecop_list_add,
2018 .vece = MO_8 },
2019 { .fni8 = tcg_gen_vec_add16_i64,
2020 .fniv = tcg_gen_add_vec,
2021 .fno = gen_helper_gvec_add16,
2022 .opt_opc = vecop_list_add,
2023 .vece = MO_16 },
2024 { .fni4 = tcg_gen_add_i32,
2025 .fniv = tcg_gen_add_vec,
2026 .fno = gen_helper_gvec_add32,
2027 .opt_opc = vecop_list_add,
2028 .vece = MO_32 },
2029 { .fni8 = tcg_gen_add_i64,
2030 .fniv = tcg_gen_add_vec,
2031 .fno = gen_helper_gvec_add64,
2032 .opt_opc = vecop_list_add,
2033 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2034 .vece = MO_64 },
2035 };
2036
2037 tcg_debug_assert(vece <= MO_64);
2038 tcg_gen_gvec_3_var(dbase, dofs, abase, aofs, bbase, bofs,
2039 oprsz, maxsz, &g[vece]);
2040 }
2041
2042 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
2043 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2044 {
2045 tcg_gen_gvec_add_var(vece, tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
2046 oprsz, maxsz);
2047 }
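
/*
 * Usage sketch (illustrative only): lane-wise add of 32-bit elements
 * across a 16-byte operand, clearing the tail of a 32-byte register:
 *
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 32);
 */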
2048
2049 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
2050 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2051 {
2052 static const GVecGen2s g[4] = {
2053 { .fni8 = tcg_gen_vec_add8_i64,
2054 .fniv = tcg_gen_add_vec,
2055 .fno = gen_helper_gvec_adds8,
2056 .opt_opc = vecop_list_add,
2057 .vece = MO_8 },
2058 { .fni8 = tcg_gen_vec_add16_i64,
2059 .fniv = tcg_gen_add_vec,
2060 .fno = gen_helper_gvec_adds16,
2061 .opt_opc = vecop_list_add,
2062 .vece = MO_16 },
2063 { .fni4 = tcg_gen_add_i32,
2064 .fniv = tcg_gen_add_vec,
2065 .fno = gen_helper_gvec_adds32,
2066 .opt_opc = vecop_list_add,
2067 .vece = MO_32 },
2068 { .fni8 = tcg_gen_add_i64,
2069 .fniv = tcg_gen_add_vec,
2070 .fno = gen_helper_gvec_adds64,
2071 .opt_opc = vecop_list_add,
2072 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2073 .vece = MO_64 },
2074 };
2075
2076 tcg_debug_assert(vece <= MO_64);
2077 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2078 }
2079
2080 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
2081 int64_t c, uint32_t oprsz, uint32_t maxsz)
2082 {
2083 TCGv_i64 tmp = tcg_constant_i64(c);
2084 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
2085 }
2086
2087 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2088
2089 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2090 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2091 {
2092 static const GVecGen2s g[4] = {
2093 { .fni8 = tcg_gen_vec_sub8_i64,
2094 .fniv = tcg_gen_sub_vec,
2095 .fno = gen_helper_gvec_subs8,
2096 .opt_opc = vecop_list_sub,
2097 .vece = MO_8 },
2098 { .fni8 = tcg_gen_vec_sub16_i64,
2099 .fniv = tcg_gen_sub_vec,
2100 .fno = gen_helper_gvec_subs16,
2101 .opt_opc = vecop_list_sub,
2102 .vece = MO_16 },
2103 { .fni4 = tcg_gen_sub_i32,
2104 .fniv = tcg_gen_sub_vec,
2105 .fno = gen_helper_gvec_subs32,
2106 .opt_opc = vecop_list_sub,
2107 .vece = MO_32 },
2108 { .fni8 = tcg_gen_sub_i64,
2109 .fniv = tcg_gen_sub_vec,
2110 .fno = gen_helper_gvec_subs64,
2111 .opt_opc = vecop_list_sub,
2112 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2113 .vece = MO_64 },
2114 };
2115
2116 tcg_debug_assert(vece <= MO_64);
2117 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2118 }
2119
2120 /* Perform a vector subtraction using normal subtraction and a mask.
2121 Compare gen_addv_mask above. */
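/*
 * Worked example (added commentary): for MO_8 the mask is 0x80 per
 * byte.  (a | m) guarantees a borrow source in bit 7, so no borrow
 * crosses a lane; the true sign bit is then ~(a ^ b) & m, xored back
 * in.  E.g. lanes a = 0x00, b = 0x01 give t1 = 0x80, t2 = 0x01,
 * d = 0x7f, t3 = 0x80, and d ^ t3 = 0xff as expected for 0 - 1.
 */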
2122 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2123 {
2124 TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2125 TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2126 TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2127
2128 tcg_gen_or_i64(t1, a, m);
2129 tcg_gen_andc_i64(t2, b, m);
2130 tcg_gen_eqv_i64(t3, a, b);
2131 tcg_gen_sub_i64(d, t1, t2);
2132 tcg_gen_and_i64(t3, t3, m);
2133 tcg_gen_xor_i64(d, d, t3);
2134
2135 tcg_temp_free_i64(t1);
2136 tcg_temp_free_i64(t2);
2137 tcg_temp_free_i64(t3);
2138 }
2139
2140 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2141 {
2142 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2143 gen_subv_mask(d, a, b, m);
2144 }
2145
2146 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2147 {
2148 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2149 TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2150 TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2151 TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2152
2153 tcg_gen_or_i32(t1, a, m);
2154 tcg_gen_andc_i32(t2, b, m);
2155 tcg_gen_eqv_i32(t3, a, b);
2156 tcg_gen_sub_i32(d, t1, t2);
2157 tcg_gen_and_i32(t3, t3, m);
2158 tcg_gen_xor_i32(d, d, t3);
2159
2160 tcg_temp_free_i32(t1);
2161 tcg_temp_free_i32(t2);
2162 tcg_temp_free_i32(t3);
2163 }
2164
2165 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2166 {
2167 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2168 gen_subv_mask(d, a, b, m);
2169 }
2170
2171 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2172 {
2173 TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2174 TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2175
2176 tcg_gen_andi_i32(t1, b, ~0xffff);
2177 tcg_gen_sub_i32(t2, a, b);
2178 tcg_gen_sub_i32(t1, a, t1);
2179 tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2180
2181 tcg_temp_free_i32(t1);
2182 tcg_temp_free_i32(t2);
2183 }
2184
2185 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2186 {
2187 TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2188 TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2189
2190 tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2191 tcg_gen_sub_i64(t2, a, b);
2192 tcg_gen_sub_i64(t1, a, t1);
2193 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2194
2195 tcg_temp_free_i64(t1);
2196 tcg_temp_free_i64(t2);
2197 }
2198
2199 void tcg_gen_gvec_sub_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs,
2200 TCGv_ptr abase, uint32_t aofs,
2201 TCGv_ptr bbase, uint32_t bofs,
2202 uint32_t oprsz, uint32_t maxsz)
2203 {
2204 static const GVecGen3 g[4] = {
2205 { .fni8 = tcg_gen_vec_sub8_i64,
2206 .fniv = tcg_gen_sub_vec,
2207 .fno = gen_helper_gvec_sub8,
2208 .opt_opc = vecop_list_sub,
2209 .vece = MO_8 },
2210 { .fni8 = tcg_gen_vec_sub16_i64,
2211 .fniv = tcg_gen_sub_vec,
2212 .fno = gen_helper_gvec_sub16,
2213 .opt_opc = vecop_list_sub,
2214 .vece = MO_16 },
2215 { .fni4 = tcg_gen_sub_i32,
2216 .fniv = tcg_gen_sub_vec,
2217 .fno = gen_helper_gvec_sub32,
2218 .opt_opc = vecop_list_sub,
2219 .vece = MO_32 },
2220 { .fni8 = tcg_gen_sub_i64,
2221 .fniv = tcg_gen_sub_vec,
2222 .fno = gen_helper_gvec_sub64,
2223 .opt_opc = vecop_list_sub,
2224 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2225 .vece = MO_64 },
2226 };
2227
2228 tcg_debug_assert(vece <= MO_64);
2229 tcg_gen_gvec_3_var(dbase, dofs, abase, aofs, bbase, bofs,
2230 oprsz, maxsz, &g[vece]);
2231 }
2232
2233 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2234 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2235 {
2236 tcg_gen_gvec_sub_var(vece, tcg_env, dofs, tcg_env, aofs, tcg_env, bofs,
2237 oprsz, maxsz);
2238 }
2239
2240 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2241
2242 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2243 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2244 {
2245 static const GVecGen3 g[4] = {
2246 { .fniv = tcg_gen_mul_vec,
2247 .fno = gen_helper_gvec_mul8,
2248 .opt_opc = vecop_list_mul,
2249 .vece = MO_8 },
2250 { .fniv = tcg_gen_mul_vec,
2251 .fno = gen_helper_gvec_mul16,
2252 .opt_opc = vecop_list_mul,
2253 .vece = MO_16 },
2254 { .fni4 = tcg_gen_mul_i32,
2255 .fniv = tcg_gen_mul_vec,
2256 .fno = gen_helper_gvec_mul32,
2257 .opt_opc = vecop_list_mul,
2258 .vece = MO_32 },
2259 { .fni8 = tcg_gen_mul_i64,
2260 .fniv = tcg_gen_mul_vec,
2261 .fno = gen_helper_gvec_mul64,
2262 .opt_opc = vecop_list_mul,
2263 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2264 .vece = MO_64 },
2265 };
2266
2267 tcg_debug_assert(vece <= MO_64);
2268 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2269 }
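
/*
 * Added note: unlike add and sub, there is no cheap 64-bit integral
 * trick for lane-wise 8- and 16-bit multiplies, so those entries carry
 * no .fni8/.fni4 and fall back to the out-of-line helper whenever the
 * host provides no suitable vector multiply.
 */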
2270
2271 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2272 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2273 {
2274 static const GVecGen2s g[4] = {
2275 { .fniv = tcg_gen_mul_vec,
2276 .fno = gen_helper_gvec_muls8,
2277 .opt_opc = vecop_list_mul,
2278 .vece = MO_8 },
2279 { .fniv = tcg_gen_mul_vec,
2280 .fno = gen_helper_gvec_muls16,
2281 .opt_opc = vecop_list_mul,
2282 .vece = MO_16 },
2283 { .fni4 = tcg_gen_mul_i32,
2284 .fniv = tcg_gen_mul_vec,
2285 .fno = gen_helper_gvec_muls32,
2286 .opt_opc = vecop_list_mul,
2287 .vece = MO_32 },
2288 { .fni8 = tcg_gen_mul_i64,
2289 .fniv = tcg_gen_mul_vec,
2290 .fno = gen_helper_gvec_muls64,
2291 .opt_opc = vecop_list_mul,
2292 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2293 .vece = MO_64 },
2294 };
2295
2296 tcg_debug_assert(vece <= MO_64);
2297 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2298 }
2299
2300 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2301 int64_t c, uint32_t oprsz, uint32_t maxsz)
2302 {
2303 TCGv_i64 tmp = tcg_constant_i64(c);
2304 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2305 }
2306
2307 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2308 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2309 {
2310 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2311 static const GVecGen3 g[4] = {
2312 { .fniv = tcg_gen_ssadd_vec,
2313 .fno = gen_helper_gvec_ssadd8,
2314 .opt_opc = vecop_list,
2315 .vece = MO_8 },
2316 { .fniv = tcg_gen_ssadd_vec,
2317 .fno = gen_helper_gvec_ssadd16,
2318 .opt_opc = vecop_list,
2319 .vece = MO_16 },
2320 { .fniv = tcg_gen_ssadd_vec,
2321 .fno = gen_helper_gvec_ssadd32,
2322 .opt_opc = vecop_list,
2323 .vece = MO_32 },
2324 { .fniv = tcg_gen_ssadd_vec,
2325 .fno = gen_helper_gvec_ssadd64,
2326 .opt_opc = vecop_list,
2327 .vece = MO_64 },
2328 };
2329 tcg_debug_assert(vece <= MO_64);
2330 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2331 }
2332
2333 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2334 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2335 {
2336 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2337 static const GVecGen3 g[4] = {
2338 { .fniv = tcg_gen_sssub_vec,
2339 .fno = gen_helper_gvec_sssub8,
2340 .opt_opc = vecop_list,
2341 .vece = MO_8 },
2342 { .fniv = tcg_gen_sssub_vec,
2343 .fno = gen_helper_gvec_sssub16,
2344 .opt_opc = vecop_list,
2345 .vece = MO_16 },
2346 { .fniv = tcg_gen_sssub_vec,
2347 .fno = gen_helper_gvec_sssub32,
2348 .opt_opc = vecop_list,
2349 .vece = MO_32 },
2350 { .fniv = tcg_gen_sssub_vec,
2351 .fno = gen_helper_gvec_sssub64,
2352 .opt_opc = vecop_list,
2353 .vece = MO_64 },
2354 };
2355 tcg_debug_assert(vece <= MO_64);
2356 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2357 }
2358
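/*
 * Added note: an unsigned saturating add overflows exactly when the
 * wrapped sum is smaller than an addend, so the movcond below replaces
 * d with all-ones whenever d < a.
 */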
2359 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2360 {
2361 TCGv_i32 max = tcg_constant_i32(-1);
2362 tcg_gen_add_i32(d, a, b);
2363 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2364 }
2365
2366 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2367 {
2368 TCGv_i64 max = tcg_constant_i64(-1);
2369 tcg_gen_add_i64(d, a, b);
2370 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2371 }
2372
2373 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2374 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2375 {
2376 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2377 static const GVecGen3 g[4] = {
2378 { .fniv = tcg_gen_usadd_vec,
2379 .fno = gen_helper_gvec_usadd8,
2380 .opt_opc = vecop_list,
2381 .vece = MO_8 },
2382 { .fniv = tcg_gen_usadd_vec,
2383 .fno = gen_helper_gvec_usadd16,
2384 .opt_opc = vecop_list,
2385 .vece = MO_16 },
2386 { .fni4 = tcg_gen_usadd_i32,
2387 .fniv = tcg_gen_usadd_vec,
2388 .fno = gen_helper_gvec_usadd32,
2389 .opt_opc = vecop_list,
2390 .vece = MO_32 },
2391 { .fni8 = tcg_gen_usadd_i64,
2392 .fniv = tcg_gen_usadd_vec,
2393 .fno = gen_helper_gvec_usadd64,
2394 .opt_opc = vecop_list,
2395 .vece = MO_64 }
2396 };
2397 tcg_debug_assert(vece <= MO_64);
2398 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2399 }
2400
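/*
 * Added note: an unsigned saturating subtract underflows exactly when
 * a < b, so the movcond below clamps the result to zero in that case.
 */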
2401 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2402 {
2403 TCGv_i32 min = tcg_constant_i32(0);
2404 tcg_gen_sub_i32(d, a, b);
2405 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2406 }
2407
2408 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2409 {
2410 TCGv_i64 min = tcg_constant_i64(0);
2411 tcg_gen_sub_i64(d, a, b);
2412 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2413 }
2414
2415 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2416 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2417 {
2418 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2419 static const GVecGen3 g[4] = {
2420 { .fniv = tcg_gen_ussub_vec,
2421 .fno = gen_helper_gvec_ussub8,
2422 .opt_opc = vecop_list,
2423 .vece = MO_8 },
2424 { .fniv = tcg_gen_ussub_vec,
2425 .fno = gen_helper_gvec_ussub16,
2426 .opt_opc = vecop_list,
2427 .vece = MO_16 },
2428 { .fni4 = tcg_gen_ussub_i32,
2429 .fniv = tcg_gen_ussub_vec,
2430 .fno = gen_helper_gvec_ussub32,
2431 .opt_opc = vecop_list,
2432 .vece = MO_32 },
2433 { .fni8 = tcg_gen_ussub_i64,
2434 .fniv = tcg_gen_ussub_vec,
2435 .fno = gen_helper_gvec_ussub64,
2436 .opt_opc = vecop_list,
2437 .vece = MO_64 }
2438 };
2439 tcg_debug_assert(vece <= MO_64);
2440 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2441 }
2442
2443 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2444 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2445 {
2446 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2447 static const GVecGen3 g[4] = {
2448 { .fniv = tcg_gen_smin_vec,
2449 .fno = gen_helper_gvec_smin8,
2450 .opt_opc = vecop_list,
2451 .vece = MO_8 },
2452 { .fniv = tcg_gen_smin_vec,
2453 .fno = gen_helper_gvec_smin16,
2454 .opt_opc = vecop_list,
2455 .vece = MO_16 },
2456 { .fni4 = tcg_gen_smin_i32,
2457 .fniv = tcg_gen_smin_vec,
2458 .fno = gen_helper_gvec_smin32,
2459 .opt_opc = vecop_list,
2460 .vece = MO_32 },
2461 { .fni8 = tcg_gen_smin_i64,
2462 .fniv = tcg_gen_smin_vec,
2463 .fno = gen_helper_gvec_smin64,
2464 .opt_opc = vecop_list,
2465 .vece = MO_64 }
2466 };
2467 tcg_debug_assert(vece <= MO_64);
2468 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2469 }
2470
2471 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2472 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2473 {
2474 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2475 static const GVecGen3 g[4] = {
2476 { .fniv = tcg_gen_umin_vec,
2477 .fno = gen_helper_gvec_umin8,
2478 .opt_opc = vecop_list,
2479 .vece = MO_8 },
2480 { .fniv = tcg_gen_umin_vec,
2481 .fno = gen_helper_gvec_umin16,
2482 .opt_opc = vecop_list,
2483 .vece = MO_16 },
2484 { .fni4 = tcg_gen_umin_i32,
2485 .fniv = tcg_gen_umin_vec,
2486 .fno = gen_helper_gvec_umin32,
2487 .opt_opc = vecop_list,
2488 .vece = MO_32 },
2489 { .fni8 = tcg_gen_umin_i64,
2490 .fniv = tcg_gen_umin_vec,
2491 .fno = gen_helper_gvec_umin64,
2492 .opt_opc = vecop_list,
2493 .vece = MO_64 }
2494 };
2495 tcg_debug_assert(vece <= MO_64);
2496 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2497 }
2498
2499 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2500 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2501 {
2502 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2503 static const GVecGen3 g[4] = {
2504 { .fniv = tcg_gen_smax_vec,
2505 .fno = gen_helper_gvec_smax8,
2506 .opt_opc = vecop_list,
2507 .vece = MO_8 },
2508 { .fniv = tcg_gen_smax_vec,
2509 .fno = gen_helper_gvec_smax16,
2510 .opt_opc = vecop_list,
2511 .vece = MO_16 },
2512 { .fni4 = tcg_gen_smax_i32,
2513 .fniv = tcg_gen_smax_vec,
2514 .fno = gen_helper_gvec_smax32,
2515 .opt_opc = vecop_list,
2516 .vece = MO_32 },
2517 { .fni8 = tcg_gen_smax_i64,
2518 .fniv = tcg_gen_smax_vec,
2519 .fno = gen_helper_gvec_smax64,
2520 .opt_opc = vecop_list,
2521 .vece = MO_64 }
2522 };
2523 tcg_debug_assert(vece <= MO_64);
2524 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2525 }
2526
2527 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2528 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2529 {
2530 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2531 static const GVecGen3 g[4] = {
2532 { .fniv = tcg_gen_umax_vec,
2533 .fno = gen_helper_gvec_umax8,
2534 .opt_opc = vecop_list,
2535 .vece = MO_8 },
2536 { .fniv = tcg_gen_umax_vec,
2537 .fno = gen_helper_gvec_umax16,
2538 .opt_opc = vecop_list,
2539 .vece = MO_16 },
2540 { .fni4 = tcg_gen_umax_i32,
2541 .fniv = tcg_gen_umax_vec,
2542 .fno = gen_helper_gvec_umax32,
2543 .opt_opc = vecop_list,
2544 .vece = MO_32 },
2545 { .fni8 = tcg_gen_umax_i64,
2546 .fniv = tcg_gen_umax_vec,
2547 .fno = gen_helper_gvec_umax64,
2548 .opt_opc = vecop_list,
2549 .vece = MO_64 }
2550 };
2551 tcg_debug_assert(vece <= MO_64);
2552 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2553 }
2554
2555 /* Perform a vector negation using normal negation and a mask.
2556 Compare gen_subv_mask above. */
2557 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2558 {
2559 TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2560 TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2561
2562 tcg_gen_andc_i64(t3, m, b);
2563 tcg_gen_andc_i64(t2, b, m);
2564 tcg_gen_sub_i64(d, m, t2);
2565 tcg_gen_xor_i64(d, d, t3);
2566
2567 tcg_temp_free_i64(t2);
2568 tcg_temp_free_i64(t3);
2569 }
2570
2571 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2572 {
2573 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2574 gen_negv_mask(d, b, m);
2575 }
2576
2577 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2578 {
2579 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2580 gen_negv_mask(d, b, m);
2581 }
2582
2583 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2584 {
2585 TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2586 TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2587
2588 tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2589 tcg_gen_neg_i64(t2, b);
2590 tcg_gen_neg_i64(t1, t1);
2591 tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2592
2593 tcg_temp_free_i64(t1);
2594 tcg_temp_free_i64(t2);
2595 }
2596
2597 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2598 uint32_t oprsz, uint32_t maxsz)
2599 {
2600 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2601 static const GVecGen2 g[4] = {
2602 { .fni8 = tcg_gen_vec_neg8_i64,
2603 .fniv = tcg_gen_neg_vec,
2604 .fno = gen_helper_gvec_neg8,
2605 .opt_opc = vecop_list,
2606 .vece = MO_8 },
2607 { .fni8 = tcg_gen_vec_neg16_i64,
2608 .fniv = tcg_gen_neg_vec,
2609 .fno = gen_helper_gvec_neg16,
2610 .opt_opc = vecop_list,
2611 .vece = MO_16 },
2612 { .fni4 = tcg_gen_neg_i32,
2613 .fniv = tcg_gen_neg_vec,
2614 .fno = gen_helper_gvec_neg32,
2615 .opt_opc = vecop_list,
2616 .vece = MO_32 },
2617 { .fni8 = tcg_gen_neg_i64,
2618 .fniv = tcg_gen_neg_vec,
2619 .fno = gen_helper_gvec_neg64,
2620 .opt_opc = vecop_list,
2621 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2622 .vece = MO_64 },
2623 };
2624
2625 tcg_debug_assert(vece <= MO_64);
2626 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2627 }
2628
2629 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2630 {
2631 TCGv_i64 t = tcg_temp_ebb_new_i64();
2632 int nbit = 8 << vece;
2633
2634 /* Create -1 for each negative element. */
2635 tcg_gen_shri_i64(t, b, nbit - 1);
2636 tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2637 tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2638
2639 /*
2640 * Invert (via xor -1) and add one.
2641 * Because of the ordering the msb is cleared,
2642 * so we never have carry into the next element.
2643 */
2644 tcg_gen_xor_i64(d, b, t);
2645 tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2646 tcg_gen_add_i64(d, d, t);
2647
2648 tcg_temp_free_i64(t);
2649 }
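
/*
 * Worked example (added commentary): an MO_8 lane b = 0xfe (-2) yields
 * t = 0x01 after the shift and mask, t = 0xff after the multiply,
 * d = b ^ t = 0x01, and adding the low bit of t back gives 0x02 = |-2|.
 * Non-negative lanes leave t == 0 and pass through unchanged.
 */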
2650
2651 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2652 {
2653 gen_absv_mask(d, b, MO_8);
2654 }
2655
2656 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2657 {
2658 gen_absv_mask(d, b, MO_16);
2659 }
2660
2661 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2662 uint32_t oprsz, uint32_t maxsz)
2663 {
2664 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2665 static const GVecGen2 g[4] = {
2666 { .fni8 = tcg_gen_vec_abs8_i64,
2667 .fniv = tcg_gen_abs_vec,
2668 .fno = gen_helper_gvec_abs8,
2669 .opt_opc = vecop_list,
2670 .vece = MO_8 },
2671 { .fni8 = tcg_gen_vec_abs16_i64,
2672 .fniv = tcg_gen_abs_vec,
2673 .fno = gen_helper_gvec_abs16,
2674 .opt_opc = vecop_list,
2675 .vece = MO_16 },
2676 { .fni4 = tcg_gen_abs_i32,
2677 .fniv = tcg_gen_abs_vec,
2678 .fno = gen_helper_gvec_abs32,
2679 .opt_opc = vecop_list,
2680 .vece = MO_32 },
2681 { .fni8 = tcg_gen_abs_i64,
2682 .fniv = tcg_gen_abs_vec,
2683 .fno = gen_helper_gvec_abs64,
2684 .opt_opc = vecop_list,
2685 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2686 .vece = MO_64 },
2687 };
2688
2689 tcg_debug_assert(vece <= MO_64);
2690 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2691 }
2692
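/*
 * Added note: for the logical operations below, x op x has a constant
 * or trivial result, so aofs == bofs is special-cased: AND/OR become a
 * move, XOR/ANDC a zeroing dup, ORC/EQV an all-ones dup, and NAND/NOR
 * a NOT.
 */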
2693 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2694 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2695 {
2696 static const GVecGen3 g = {
2697 .fni8 = tcg_gen_and_i64,
2698 .fniv = tcg_gen_and_vec,
2699 .fno = gen_helper_gvec_and,
2700 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2701 };
2702
2703 if (aofs == bofs) {
2704 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2705 } else {
2706 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2707 }
2708 }
2709
2710 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2711 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2712 {
2713 static const GVecGen3 g = {
2714 .fni8 = tcg_gen_or_i64,
2715 .fniv = tcg_gen_or_vec,
2716 .fno = gen_helper_gvec_or,
2717 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2718 };
2719
2720 if (aofs == bofs) {
2721 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2722 } else {
2723 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2724 }
2725 }
2726
2727 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2728 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2729 {
2730 static const GVecGen3 g = {
2731 .fni8 = tcg_gen_xor_i64,
2732 .fniv = tcg_gen_xor_vec,
2733 .fno = gen_helper_gvec_xor,
2734 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2735 };
2736
2737 if (aofs == bofs) {
2738 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2739 } else {
2740 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2741 }
2742 }
2743
2744 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2745 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2746 {
2747 static const GVecGen3 g = {
2748 .fni8 = tcg_gen_andc_i64,
2749 .fniv = tcg_gen_andc_vec,
2750 .fno = gen_helper_gvec_andc,
2751 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2752 };
2753
2754 if (aofs == bofs) {
2755 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2756 } else {
2757 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2758 }
2759 }
2760
2761 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2762 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2763 {
2764 static const GVecGen3 g = {
2765 .fni8 = tcg_gen_orc_i64,
2766 .fniv = tcg_gen_orc_vec,
2767 .fno = gen_helper_gvec_orc,
2768 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2769 };
2770
2771 if (aofs == bofs) {
2772 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2773 } else {
2774 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2775 }
2776 }
2777
2778 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2779 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2780 {
2781 static const GVecGen3 g = {
2782 .fni8 = tcg_gen_nand_i64,
2783 .fniv = tcg_gen_nand_vec,
2784 .fno = gen_helper_gvec_nand,
2785 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2786 };
2787
2788 if (aofs == bofs) {
2789 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2790 } else {
2791 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2792 }
2793 }
2794
2795 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2796 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2797 {
2798 static const GVecGen3 g = {
2799 .fni8 = tcg_gen_nor_i64,
2800 .fniv = tcg_gen_nor_vec,
2801 .fno = gen_helper_gvec_nor,
2802 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2803 };
2804
2805 if (aofs == bofs) {
2806 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2807 } else {
2808 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2809 }
2810 }
2811
2812 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2813 uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2814 {
2815 static const GVecGen3 g = {
2816 .fni8 = tcg_gen_eqv_i64,
2817 .fniv = tcg_gen_eqv_vec,
2818 .fno = gen_helper_gvec_eqv,
2819 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2820 };
2821
2822 if (aofs == bofs) {
2823 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2824 } else {
2825 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2826 }
2827 }
2828
2829 static const GVecGen2s gop_ands = {
2830 .fni8 = tcg_gen_and_i64,
2831 .fniv = tcg_gen_and_vec,
2832 .fno = gen_helper_gvec_ands,
2833 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2834 .vece = MO_64
2835 };
2836
2837 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2838 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2839 {
2840 TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2841 tcg_gen_dup_i64(vece, tmp, c);
2842 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2843 tcg_temp_free_i64(tmp);
2844 }
2845
2846 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2847 int64_t c, uint32_t oprsz, uint32_t maxsz)
2848 {
2849 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2850 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2851 }
2852
2853 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
2854 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2855 {
2856 static const GVecGen2s g = {
2857 .fni8 = tcg_gen_andc_i64,
2858 .fniv = tcg_gen_andc_vec,
2859 .fno = gen_helper_gvec_andcs,
2860 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2861 .vece = MO_64
2862 };
2863
2864 TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2865 tcg_gen_dup_i64(vece, tmp, c);
2866 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
2867 tcg_temp_free_i64(tmp);
2868 }
2869
2870 static const GVecGen2s gop_xors = {
2871 .fni8 = tcg_gen_xor_i64,
2872 .fniv = tcg_gen_xor_vec,
2873 .fno = gen_helper_gvec_xors,
2874 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2875 .vece = MO_64
2876 };
2877
2878 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2879 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2880 {
2881 TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2882 tcg_gen_dup_i64(vece, tmp, c);
2883 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2884 tcg_temp_free_i64(tmp);
2885 }
2886
2887 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2888 int64_t c, uint32_t oprsz, uint32_t maxsz)
2889 {
2890 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2891 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2892 }
2893
2894 static const GVecGen2s gop_ors = {
2895 .fni8 = tcg_gen_or_i64,
2896 .fniv = tcg_gen_or_vec,
2897 .fno = gen_helper_gvec_ors,
2898 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2899 .vece = MO_64
2900 };
2901
2902 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2903 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2904 {
2905 TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2906 tcg_gen_dup_i64(vece, tmp, c);
2907 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2908 tcg_temp_free_i64(tmp);
2909 }
2910
2911 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2912 int64_t c, uint32_t oprsz, uint32_t maxsz)
2913 {
2914 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2915 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2916 }
2917
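/*
 * Added note: the sub-word immediate shifts below shift the whole
 * 64-bit (or 32-bit) word and then mask off the bits that crossed a
 * lane boundary; e.g. a left shift of MO_8 lanes by c keeps only
 * dup_const(MO_8, 0xff << c).
 */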
2918 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2919 {
2920 uint64_t mask = dup_const(MO_8, 0xff << c);
2921 tcg_gen_shli_i64(d, a, c);
2922 tcg_gen_andi_i64(d, d, mask);
2923 }
2924
2925 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2926 {
2927 uint64_t mask = dup_const(MO_16, 0xffff << c);
2928 tcg_gen_shli_i64(d, a, c);
2929 tcg_gen_andi_i64(d, d, mask);
2930 }
2931
2932 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2933 {
2934 uint32_t mask = dup_const(MO_8, 0xff << c);
2935 tcg_gen_shli_i32(d, a, c);
2936 tcg_gen_andi_i32(d, d, mask);
2937 }
2938
2939 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2940 {
2941 uint32_t mask = dup_const(MO_16, 0xffff << c);
2942 tcg_gen_shli_i32(d, a, c);
2943 tcg_gen_andi_i32(d, d, mask);
2944 }
2945
2946 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2947 int64_t shift, uint32_t oprsz, uint32_t maxsz)
2948 {
2949 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2950 static const GVecGen2i g[4] = {
2951 { .fni8 = tcg_gen_vec_shl8i_i64,
2952 .fniv = tcg_gen_shli_vec,
2953 .fno = gen_helper_gvec_shl8i,
2954 .opt_opc = vecop_list,
2955 .vece = MO_8 },
2956 { .fni8 = tcg_gen_vec_shl16i_i64,
2957 .fniv = tcg_gen_shli_vec,
2958 .fno = gen_helper_gvec_shl16i,
2959 .opt_opc = vecop_list,
2960 .vece = MO_16 },
2961 { .fni4 = tcg_gen_shli_i32,
2962 .fniv = tcg_gen_shli_vec,
2963 .fno = gen_helper_gvec_shl32i,
2964 .opt_opc = vecop_list,
2965 .vece = MO_32 },
2966 { .fni8 = tcg_gen_shli_i64,
2967 .fniv = tcg_gen_shli_vec,
2968 .fno = gen_helper_gvec_shl64i,
2969 .opt_opc = vecop_list,
2970 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2971 .vece = MO_64 },
2972 };
2973
2974 tcg_debug_assert(vece <= MO_64);
2975 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2976 if (shift == 0) {
2977 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2978 } else {
2979 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2980 }
2981 }
2982
2983 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2984 {
2985 uint64_t mask = dup_const(MO_8, 0xff >> c);
2986 tcg_gen_shri_i64(d, a, c);
2987 tcg_gen_andi_i64(d, d, mask);
2988 }
2989
2990 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2991 {
2992 uint64_t mask = dup_const(MO_16, 0xffff >> c);
2993 tcg_gen_shri_i64(d, a, c);
2994 tcg_gen_andi_i64(d, d, mask);
2995 }
2996
2997 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2998 {
2999 uint32_t mask = dup_const(MO_8, 0xff >> c);
3000 tcg_gen_shri_i32(d, a, c);
3001 tcg_gen_andi_i32(d, d, mask);
3002 }
3003
3004 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
3005 {
3006 uint32_t mask = dup_const(MO_16, 0xffff >> c);
3007 tcg_gen_shri_i32(d, a, c);
3008 tcg_gen_andi_i32(d, d, mask);
3009 }
3010
3011 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
3012 int64_t shift, uint32_t oprsz, uint32_t maxsz)
3013 {
3014 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
3015 static const GVecGen2i g[4] = {
3016 { .fni8 = tcg_gen_vec_shr8i_i64,
3017 .fniv = tcg_gen_shri_vec,
3018 .fno = gen_helper_gvec_shr8i,
3019 .opt_opc = vecop_list,
3020 .vece = MO_8 },
3021 { .fni8 = tcg_gen_vec_shr16i_i64,
3022 .fniv = tcg_gen_shri_vec,
3023 .fno = gen_helper_gvec_shr16i,
3024 .opt_opc = vecop_list,
3025 .vece = MO_16 },
3026 { .fni4 = tcg_gen_shri_i32,
3027 .fniv = tcg_gen_shri_vec,
3028 .fno = gen_helper_gvec_shr32i,
3029 .opt_opc = vecop_list,
3030 .vece = MO_32 },
3031 { .fni8 = tcg_gen_shri_i64,
3032 .fniv = tcg_gen_shri_vec,
3033 .fno = gen_helper_gvec_shr64i,
3034 .opt_opc = vecop_list,
3035 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3036 .vece = MO_64 },
3037 };
3038
3039 tcg_debug_assert(vece <= MO_64);
3040 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3041 if (shift == 0) {
3042 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3043 } else {
3044 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3045 }
3046 }
3047
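/*
 * Added note: the sub-word arithmetic shifts below start from a logical
 * shift right, isolate each lane's shifted sign bit, and multiply it by
 * (2 << c) - 2 to smear it across the vacated high bits; e.g. MO_8 with
 * c = 2 turns an isolated 0x20 into the sign extension 0xc0.
 */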
3048 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3049 {
3050 uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
3051 uint64_t c_mask = dup_const(MO_8, 0xff >> c);
3052 TCGv_i64 s = tcg_temp_ebb_new_i64();
3053
3054 tcg_gen_shri_i64(d, a, c);
3055 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
3056 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
3057 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
3058 tcg_gen_or_i64(d, d, s); /* include sign extension */
3059 tcg_temp_free_i64(s);
3060 }
3061
3062 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3063 {
3064 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
3065 uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
3066 TCGv_i64 s = tcg_temp_ebb_new_i64();
3067
3068 tcg_gen_shri_i64(d, a, c);
3069 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
3070 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
3071 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
3072 tcg_gen_or_i64(d, d, s); /* include sign extension */
3073 tcg_temp_free_i64(s);
3074 }
3075
3076 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
3077 {
3078 uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
3079 uint32_t c_mask = dup_const(MO_8, 0xff >> c);
3080 TCGv_i32 s = tcg_temp_ebb_new_i32();
3081
3082 tcg_gen_shri_i32(d, a, c);
3083 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */
3084 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
3085 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */
3086 tcg_gen_or_i32(d, d, s); /* include sign extension */
3087 tcg_temp_free_i32(s);
3088 }
3089
3090 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
3091 {
3092 uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
3093 uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
3094 TCGv_i32 s = tcg_temp_ebb_new_i32();
3095
3096 tcg_gen_shri_i32(d, a, c);
3097 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */
3098 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */
3099 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
3100 tcg_gen_or_i32(d, d, s); /* include sign extension */
3101 tcg_temp_free_i32(s);
3102 }
3103
3104 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
3105 int64_t shift, uint32_t oprsz, uint32_t maxsz)
3106 {
3107 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3108 static const GVecGen2i g[4] = {
3109 { .fni8 = tcg_gen_vec_sar8i_i64,
3110 .fniv = tcg_gen_sari_vec,
3111 .fno = gen_helper_gvec_sar8i,
3112 .opt_opc = vecop_list,
3113 .vece = MO_8 },
3114 { .fni8 = tcg_gen_vec_sar16i_i64,
3115 .fniv = tcg_gen_sari_vec,
3116 .fno = gen_helper_gvec_sar16i,
3117 .opt_opc = vecop_list,
3118 .vece = MO_16 },
3119 { .fni4 = tcg_gen_sari_i32,
3120 .fniv = tcg_gen_sari_vec,
3121 .fno = gen_helper_gvec_sar32i,
3122 .opt_opc = vecop_list,
3123 .vece = MO_32 },
3124 { .fni8 = tcg_gen_sari_i64,
3125 .fniv = tcg_gen_sari_vec,
3126 .fno = gen_helper_gvec_sar64i,
3127 .opt_opc = vecop_list,
3128 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3129 .vece = MO_64 },
3130 };
3131
3132 tcg_debug_assert(vece <= MO_64);
3133 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3134 if (shift == 0) {
3135 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3136 } else {
3137 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3138 }
3139 }
3140
3141 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3142 {
3143 uint64_t mask = dup_const(MO_8, 0xff << c);
3144
3145 tcg_gen_shli_i64(d, a, c);
3146 tcg_gen_shri_i64(a, a, 8 - c);
3147 tcg_gen_andi_i64(d, d, mask);
3148 tcg_gen_andi_i64(a, a, ~mask);
3149 tcg_gen_or_i64(d, d, a);
3150 }
3151
3152 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3153 {
3154 uint64_t mask = dup_const(MO_16, 0xffff << c);
3155
3156 tcg_gen_shli_i64(d, a, c);
3157 tcg_gen_shri_i64(a, a, 16 - c);
3158 tcg_gen_andi_i64(d, d, mask);
3159 tcg_gen_andi_i64(a, a, ~mask);
3160 tcg_gen_or_i64(d, d, a);
3161 }
3162
3163 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3164 int64_t shift, uint32_t oprsz, uint32_t maxsz)
3165 {
3166 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3167 static const GVecGen2i g[4] = {
3168 { .fni8 = tcg_gen_vec_rotl8i_i64,
3169 .fniv = tcg_gen_rotli_vec,
3170 .fno = gen_helper_gvec_rotl8i,
3171 .opt_opc = vecop_list,
3172 .vece = MO_8 },
3173 { .fni8 = tcg_gen_vec_rotl16i_i64,
3174 .fniv = tcg_gen_rotli_vec,
3175 .fno = gen_helper_gvec_rotl16i,
3176 .opt_opc = vecop_list,
3177 .vece = MO_16 },
3178 { .fni4 = tcg_gen_rotli_i32,
3179 .fniv = tcg_gen_rotli_vec,
3180 .fno = gen_helper_gvec_rotl32i,
3181 .opt_opc = vecop_list,
3182 .vece = MO_32 },
3183 { .fni8 = tcg_gen_rotli_i64,
3184 .fniv = tcg_gen_rotli_vec,
3185 .fno = gen_helper_gvec_rotl64i,
3186 .opt_opc = vecop_list,
3187 .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3188 .vece = MO_64 },
3189 };
3190
3191 tcg_debug_assert(vece <= MO_64);
3192 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3193 if (shift == 0) {
3194 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3195 } else {
3196 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3197 }
3198 }
3199
3200 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3201 int64_t shift, uint32_t oprsz, uint32_t maxsz)
3202 {
3203 tcg_debug_assert(vece <= MO_64);
3204 tcg_debug_assert(shift >= 0 && shift < (8 << vece));
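    /* Rotate right by c is rotate left by (lane width - c), mod width. */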
3205 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3206 oprsz, maxsz);
3207 }
3208
3209 /*
3210  * Specialized generation of vector shifts by a non-constant scalar.
3211 */
3212
3213 typedef struct {
3214 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
3215 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
3216 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
3217 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
3218 gen_helper_gvec_2 *fno[4];
3219 TCGOpcode s_list[2];
3220 TCGOpcode v_list[2];
3221 } GVecGen2sh;
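
/*
 * Strategy note (added commentary): do_gvec_shifts below tries, in
 * order, a host vector shift-by-scalar (s_list), a host shift-by-vector
 * with the scalar broadcast into a vector (v_list), and finally the
 * integral expansion or the out-of-line helper from fno[].
 */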
3222
3223 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3224 uint32_t oprsz, uint32_t tysz, TCGType type,
3225 TCGv_i32 shift,
3226 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3227 {
3228 for (uint32_t i = 0; i < oprsz; i += tysz) {
3229 TCGv_vec t0 = tcg_temp_new_vec(type);
3230 TCGv_vec t1 = tcg_temp_new_vec(type);
3231
3232 tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3233 fni(vece, t1, t0, shift);
3234 tcg_gen_st_vec(t1, tcg_env, dofs + i);
3235 }
3236 }
3237
3238 static void
3239 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3240 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3241 {
3242 TCGType type;
3243 uint32_t some;
3244
3245 check_size_align(oprsz, maxsz, dofs | aofs);
3246 check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);
3247
3248 /* If the backend has a scalar expansion, great. */
3249 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3250 if (type) {
3251 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3252 switch (type) {
3253 case TCG_TYPE_V256:
3254 some = QEMU_ALIGN_DOWN(oprsz, 32);
3255 expand_2sh_vec(vece, dofs, aofs, some, 32,
3256 TCG_TYPE_V256, shift, g->fniv_s);
3257 if (some == oprsz) {
3258 break;
3259 }
3260 dofs += some;
3261 aofs += some;
3262 oprsz -= some;
3263 maxsz -= some;
3264 /* fallthru */
3265 case TCG_TYPE_V128:
3266 expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3267 TCG_TYPE_V128, shift, g->fniv_s);
3268 break;
3269 case TCG_TYPE_V64:
3270 expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3271 TCG_TYPE_V64, shift, g->fniv_s);
3272 break;
3273 default:
3274 g_assert_not_reached();
3275 }
3276 tcg_swap_vecop_list(hold_list);
3277 goto clear_tail;
3278 }
3279
3280 /* If the backend supports variable vector shifts, also cool. */
3281 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3282 if (type) {
3283 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3284 TCGv_vec v_shift = tcg_temp_new_vec(type);
3285
3286 if (vece == MO_64) {
3287 TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3288 tcg_gen_extu_i32_i64(sh64, shift);
3289 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3290 tcg_temp_free_i64(sh64);
3291 } else {
3292 tcg_gen_dup_i32_vec(vece, v_shift, shift);
3293 }
3294
3295 switch (type) {
3296 case TCG_TYPE_V256:
3297 some = QEMU_ALIGN_DOWN(oprsz, 32);
3298 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3299 v_shift, false, g->fniv_v);
3300 if (some == oprsz) {
3301 break;
3302 }
3303 dofs += some;
3304 aofs += some;
3305 oprsz -= some;
3306 maxsz -= some;
3307 /* fallthru */
3308 case TCG_TYPE_V128:
3309 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3310 v_shift, false, g->fniv_v);
3311 break;
3312 case TCG_TYPE_V64:
3313 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3314 v_shift, false, g->fniv_v);
3315 break;
3316 default:
3317 g_assert_not_reached();
3318 }
3319 tcg_temp_free_vec(v_shift);
3320 tcg_swap_vecop_list(hold_list);
3321 goto clear_tail;
3322 }
3323
3324 /* Otherwise fall back to integral... */
3325 if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3326 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3327 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3328 TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3329 tcg_gen_extu_i32_i64(sh64, shift);
3330 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3331 tcg_temp_free_i64(sh64);
3332 } else {
3333 TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
3334 TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
3335 TCGv_i32 desc = tcg_temp_ebb_new_i32();
3336
3337 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3338 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3339 tcg_gen_addi_ptr(a0, tcg_env, dofs);
3340 tcg_gen_addi_ptr(a1, tcg_env, aofs);
3341
3342 g->fno[vece](a0, a1, desc);
3343
3344 tcg_temp_free_ptr(a0);
3345 tcg_temp_free_ptr(a1);
3346 tcg_temp_free_i32(desc);
3347 return;
3348 }
3349
3350 clear_tail:
3351 if (oprsz < maxsz) {
3352 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
3353 }
3354 }
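
/*
 * To summarize the ladder above: prefer a native vector-by-scalar
 * shift; failing that, broadcast the scalar and use a vector-by-vector
 * shift; failing that, use per-element integer shifts; and as a last
 * resort call an out-of-line helper whose descriptor is built at
 * runtime with the shift count in the data field.
 */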

void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_shl_i32,
        .fni8 = tcg_gen_shl_i64,
        .fniv_s = tcg_gen_shls_vec,
        .fniv_v = tcg_gen_shlv_vec,
        .fno = {
            gen_helper_gvec_shl8i,
            gen_helper_gvec_shl16i,
            gen_helper_gvec_shl32i,
            gen_helper_gvec_shl64i,
        },
        .s_list = { INDEX_op_shls_vec, 0 },
        .v_list = { INDEX_op_shlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}
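
/*
 * A minimal usage sketch (illustrative): shift each 16-bit lane of a
 * 16-byte vector left by the runtime value in "count", a TCGv_i32 the
 * caller has already reduced into the 0..15 range (unlike the *v
 * expanders below, these do not include the modulo).
 *
 *     tcg_gen_gvec_shls(MO_16, dofs, aofs, count, 16, 16);
 */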

void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_shr_i32,
        .fni8 = tcg_gen_shr_i64,
        .fniv_s = tcg_gen_shrs_vec,
        .fniv_v = tcg_gen_shrv_vec,
        .fno = {
            gen_helper_gvec_shr8i,
            gen_helper_gvec_shr16i,
            gen_helper_gvec_shr32i,
            gen_helper_gvec_shr64i,
        },
        .s_list = { INDEX_op_shrs_vec, 0 },
        .v_list = { INDEX_op_shrv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_sar_i32,
        .fni8 = tcg_gen_sar_i64,
        .fniv_s = tcg_gen_sars_vec,
        .fniv_v = tcg_gen_sarv_vec,
        .fno = {
            gen_helper_gvec_sar8i,
            gen_helper_gvec_sar16i,
            gen_helper_gvec_sar32i,
            gen_helper_gvec_sar64i,
        },
        .s_list = { INDEX_op_sars_vec, 0 },
        .v_list = { INDEX_op_sarv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_rotl_i32,
        .fni8 = tcg_gen_rotl_i64,
        .fniv_s = tcg_gen_rotls_vec,
        .fniv_v = tcg_gen_rotlv_vec,
        .fno = {
            gen_helper_gvec_rotl8i,
            gen_helper_gvec_rotl16i,
            gen_helper_gvec_rotl32i,
            gen_helper_gvec_rotl64i,
        },
        .s_list = { INDEX_op_rotls_vec, 0 },
        .v_list = { INDEX_op_rotlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i32 tmp = tcg_temp_ebb_new_i32();

    tcg_gen_neg_i32(tmp, shift);
    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i32(tmp);
}
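
/*
 * As with the immediate forms, a rotate right by a runtime count is a
 * rotate left by the negated count reduced modulo the lane width,
 * computed above on the TCGv_i32 before handing off to rotls.
 */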

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end to
 * include the modulo as part of the expansion, here we build it into
 * the operation.  If the target naturally includes the modulo as part
 * of the operation, great!  If the target has some other behaviour
 * for out-of-range shifts, then it could not use this function anyway,
 * and would need to do its own expansion with custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
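
/*
 * For example (illustrative): with vece == MO_64, a lane of B holding
 * 65 shifts the corresponding lane of A left by 65 & 63 == 1.
 */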

/*
 * Similarly for logical right shifts.
 */

static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_shrv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shr_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shr_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shr_mod_i32,
          .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shr_mod_i64,
          .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for arithmetic right shifts.
 */

static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_sarv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_sar_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_sar_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sar_mod_i32,
          .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sar_mod_i64,
          .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for rotates.
 */

static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
                                  TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_rotlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_rotl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_rotl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_rotlv_mod_vec,
          .fno = gen_helper_gvec_rotl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_rotlv_mod_vec,
          .fno = gen_helper_gvec_rotl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_rotl_mod_i32,
          .fniv = tcg_gen_rotlv_mod_vec,
          .fno = gen_helper_gvec_rotl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_rotl_mod_i64,
          .fniv = tcg_gen_rotlv_mod_vec,
          .fno = gen_helper_gvec_rotl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
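
/*
 * For rotates, masking the count to (lane bits - 1) is not merely a
 * chosen semantic as it is for shifts: a rotate by the lane width is
 * an identity, so reducing B modulo the width first leaves the result
 * mathematically unchanged while keeping the count in range.
 */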

static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
                                  TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_rotrv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_rotr_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_rotr_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_rotrv_mod_vec,
          .fno = gen_helper_gvec_rotr8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_rotrv_mod_vec,
          .fno = gen_helper_gvec_rotr16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_rotr_mod_i32,
          .fniv = tcg_gen_rotrv_mod_vec,
          .fno = gen_helper_gvec_rotr32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_rotr_mod_i64,
          .fniv = tcg_gen_rotrv_mod_vec,
          .fno = gen_helper_gvec_rotr64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Expand OPSZ bytes worth of three-operand comparisons using i32 elements. */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_ebb_new_i32();
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        tcg_gen_ld_i32(t1, tcg_env, bofs + i);
        tcg_gen_negsetcond_i32(cond, t0, t0, t1);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_ebb_new_i64();
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        tcg_gen_ld_i64(t1, tcg_env, bofs + i);
        tcg_gen_negsetcond_i64(cond, t0, t0, t1);
        tcg_gen_st_i64(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        tcg_gen_ld_vec(t1, tcg_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
        tcg_gen_st_vec(t2, tcg_env, dofs + i);
    }
}

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    const TCGOpcode *hold_list;
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, tcg_env, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    hold_list = tcg_swap_vecop_list(cmp_list);
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}
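
/*
 * A minimal usage sketch (illustrative): set each byte of D to all
 * ones where the corresponding byte of A is unsigned-less-than that
 * of B, and to zero elsewhere, over 16-byte operands.
 *
 *     tcg_gen_gvec_cmp(TCG_COND_LTU, MO_8, dofs, aofs, bofs, 16, 16);
 */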

static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                            uint32_t oprsz, uint32_t tysz, TCGType type,
                            TCGCond cond, TCGv_vec c)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, tcg_env, aofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t1, c);
        tcg_gen_st_vec(t0, tcg_env, dofs + i);
    }
}

void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
                       uint32_t aofs, TCGv_i64 c,
                       uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_2i * const eq_fn[4] = {
        gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
        gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
    };
    static gen_helper_gvec_2i * const lt_fn[4] = {
        gen_helper_gvec_lts8, gen_helper_gvec_lts16,
        gen_helper_gvec_lts32, gen_helper_gvec_lts64
    };
    static gen_helper_gvec_2i * const le_fn[4] = {
        gen_helper_gvec_les8, gen_helper_gvec_les16,
        gen_helper_gvec_les32, gen_helper_gvec_les64
    };
    static gen_helper_gvec_2i * const ltu_fn[4] = {
        gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
        gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
    };
    static gen_helper_gvec_2i * const leu_fn[4] = {
        gen_helper_gvec_leus8, gen_helper_gvec_leus16,
        gen_helper_gvec_leus32, gen_helper_gvec_leus64
    };
    static gen_helper_gvec_2i * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, tcg_env, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    if (type != 0) {
        const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(vece, t_vec, c);
        switch (type) {
        case TCG_TYPE_V256:
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_cmps_vec(vece, dofs, aofs, some, 32,
                            TCG_TYPE_V256, cond, t_vec);
            aofs += some;
            dofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            some = QEMU_ALIGN_DOWN(oprsz, 16);
            expand_cmps_vec(vece, dofs, aofs, some, 16,
                            TCG_TYPE_V128, cond, t_vec);
            break;

        case TCG_TYPE_V64:
            some = QEMU_ALIGN_DOWN(oprsz, 8);
            expand_cmps_vec(vece, dofs, aofs, some, 8,
                            TCG_TYPE_V64, cond, t_vec);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
        uint32_t i;

        for (i = 0; i < oprsz; i += 8) {
            tcg_gen_ld_i64(t0, tcg_env, aofs + i);
            tcg_gen_negsetcond_i64(cond, t0, t0, c);
            tcg_gen_st_i64(t0, tcg_env, dofs + i);
        }
        tcg_temp_free_i64(t0);
    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
        uint32_t i;

        tcg_gen_extrl_i64_i32(t1, c);
        for (i = 0; i < oprsz; i += 4) {
            tcg_gen_ld_i32(t0, tcg_env, aofs + i);
            tcg_gen_negsetcond_i32(cond, t0, t0, t1);
            tcg_gen_st_i32(t0, tcg_env, dofs + i);
        }
        tcg_temp_free_i32(t0);
        tcg_temp_free_i32(t1);
    } else {
        gen_helper_gvec_2i * const *fn = fns[cond];
        bool inv = false;

        if (fn == NULL) {
            cond = tcg_invert_cond(cond);
            fn = fns[cond];
            assert(fn != NULL);
            inv = true;
        }
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}
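
/*
 * Only EQ/LT/LE/LTU/LEU helpers exist out of line; the remaining
 * conditions are reached by inverting the condition and passing
 * inv == true in the descriptor data, so that the helper can invert
 * its result.
 */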

void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
                       uint32_t aofs, int64_t c,
                       uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_constant_i64(c);
    tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
}
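
/*
 * A minimal usage sketch (illustrative): mark the zero lanes of a
 * 16-byte vector of 32-bit elements.
 *
 *     tcg_gen_gvec_cmpi(TCG_COND_EQ, MO_32, dofs, aofs, 0, 16, 16);
 */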

static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_and_i64(t, b, a);
    tcg_gen_andc_i64(d, c, a);
    tcg_gen_or_i64(d, d, t);
    tcg_temp_free_i64(t);
}
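
/*
 * This computes the bitwise select d = (b & a) | (c & ~a): each result
 * bit is taken from B where the selector A has a one bit, and from C
 * where it has a zero bit.
 */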
4093
tcg_gen_gvec_bitsel(unsigned vece,uint32_t dofs,uint32_t aofs,uint32_t bofs,uint32_t cofs,uint32_t oprsz,uint32_t maxsz)4094 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
4095 uint32_t bofs, uint32_t cofs,
4096 uint32_t oprsz, uint32_t maxsz)
4097 {
4098 static const GVecGen4 g = {
4099 .fni8 = tcg_gen_bitsel_i64,
4100 .fniv = tcg_gen_bitsel_vec,
4101 .fno = gen_helper_gvec_bitsel,
4102 };
4103
4104 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
4105 }
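
/*
 * Because the selection is purely bitwise, the element size is
 * irrelevant: vece is unused above and a single GVecGen4 covers all
 * element sizes.
 */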