xref: /qemu/tcg/tcg-op-gvec.c (revision 513823e7521a09ed7ad1e32e6454bac3b2cbf52d)
1 /*
2  * Generic vector operation expansion
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "tcg/tcg.h"
22 #include "tcg/tcg-temp-internal.h"
23 #include "tcg/tcg-op-common.h"
24 #include "tcg/tcg-op-gvec-common.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "tcg-has.h"
27 
28 #define MAX_UNROLL  4
29 
30 #ifdef CONFIG_DEBUG_TCG
31 static const TCGOpcode vecop_list_empty[1] = { 0 };
32 #else
33 #define vecop_list_empty NULL
34 #endif
35 
36 
37 /* Verify vector size and alignment rules.  OFS should be the OR of all
38    of the operand offsets so that we can check them all at once.  */
39 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
40 {
41     uint32_t max_align;
42 
43     switch (oprsz) {
44     case 8:
45     case 16:
46     case 32:
47         tcg_debug_assert(oprsz <= maxsz);
48         break;
49     default:
50         tcg_debug_assert(oprsz == maxsz);
51         break;
52     }
53     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
54 
55     max_align = maxsz >= 16 ? 15 : 7;
56     tcg_debug_assert((maxsz & max_align) == 0);
57     tcg_debug_assert((ofs & max_align) == 0);
58 }
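
/*
 * For example: (oprsz = 16, maxsz = 64) is accepted, since 16 is one of
 * the sizes allowed to be smaller than maxsz; (oprsz = 24, maxsz = 32)
 * asserts, since any other oprsz must equal maxsz.  Once maxsz >= 16,
 * both maxsz and every operand offset must be 16-byte aligned.
 */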
59 
60 /* Verify vector overlap rules for two operands.  */
61 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
62 {
63     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
64 }
65 
66 /* Verify vector overlap rules for three operands.  */
67 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
68 {
69     check_overlap_2(d, a, s);
70     check_overlap_2(d, b, s);
71     check_overlap_2(a, b, s);
72 }
73 
74 /* Verify vector overlap rules for four operands.  */
75 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
76                             uint32_t c, uint32_t s)
77 {
78     check_overlap_2(d, a, s);
79     check_overlap_2(d, b, s);
80     check_overlap_2(d, c, s);
81     check_overlap_2(a, b, s);
82     check_overlap_2(a, c, s);
83     check_overlap_2(b, c, s);
84 }
85 
86 /* Create a descriptor from components.  */
87 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
88 {
89     uint32_t desc = 0;
90 
91     check_size_align(oprsz, maxsz, 0);
92 
93     /*
94      * We want to check that 'data' will fit into SIMD_DATA_BITS.
95      * However, some callers want to treat the data as a signed
96      * value (which they can later get back with simd_data())
97      * and some want to treat it as an unsigned value.
98      * So here we assert only that the data will fit into the
99      * field in at least one way. This means that some invalid
100      * values from the caller will not be detected, e.g. if the
101      * caller wants to handle the value as a signed integer but
102      * incorrectly passes us 1 << (SIMD_DATA_BITS - 1).
103      */
104     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) ||
105                      data == extract32(data, 0, SIMD_DATA_BITS));
106 
107     oprsz = (oprsz / 8) - 1;
108     maxsz = (maxsz / 8) - 1;
109 
110     /*
111      * We have just asserted in check_size_align that either
112      * oprsz is {8,16,32} or matches maxsz.  Encode the final
113      * case with '2', as that would otherwise map to 24.
114      */
115     if (oprsz == maxsz) {
116         oprsz = 2;
117     }
118 
119     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
120     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
121     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
122 
123     return desc;
124 }
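
/*
 * For example, simd_desc(8, 64, 3) encodes the OPRSZ field as 0 (8 bytes),
 * MAXSZ as 7 ((64 / 8) - 1) and DATA as 3, while simd_desc(80, 80, 0)
 * encodes MAXSZ as 9 and OPRSZ as the special value 2, "equal to MAXSZ".
 */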
125 
126 /* Generate a call to a gvec-style helper with two vector operands.  */
127 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
128                         uint32_t oprsz, uint32_t maxsz, int32_t data,
129                         gen_helper_gvec_2 *fn)
130 {
131     TCGv_ptr a0, a1;
132     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
133 
134     a0 = tcg_temp_ebb_new_ptr();
135     a1 = tcg_temp_ebb_new_ptr();
136 
137     tcg_gen_addi_ptr(a0, tcg_env, dofs);
138     tcg_gen_addi_ptr(a1, tcg_env, aofs);
139 
140     fn(a0, a1, desc);
141 
142     tcg_temp_free_ptr(a0);
143     tcg_temp_free_ptr(a1);
144 }
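
/*
 * Illustrative use only: a front end with a 16-byte vector register file
 * in env might emit
 *     tcg_gen_gvec_2_ool(dofs, aofs, 16, 16, 0, gen_helper_foo);
 * where gen_helper_foo stands for any target-provided gen_helper_gvec_2.
 */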
145 
146 /* Generate a call to a gvec-style helper with two vector operands
147    and one scalar operand.  */
148 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
149                          uint32_t oprsz, uint32_t maxsz, int32_t data,
150                          gen_helper_gvec_2i *fn)
151 {
152     TCGv_ptr a0, a1;
153     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
154 
155     a0 = tcg_temp_ebb_new_ptr();
156     a1 = tcg_temp_ebb_new_ptr();
157 
158     tcg_gen_addi_ptr(a0, tcg_env, dofs);
159     tcg_gen_addi_ptr(a1, tcg_env, aofs);
160 
161     fn(a0, a1, c, desc);
162 
163     tcg_temp_free_ptr(a0);
164     tcg_temp_free_ptr(a1);
165 }
166 
167 /* Generate a call to a gvec-style helper with three vector operands.  */
168 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
169                         uint32_t oprsz, uint32_t maxsz, int32_t data,
170                         gen_helper_gvec_3 *fn)
171 {
172     TCGv_ptr a0, a1, a2;
173     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
174 
175     a0 = tcg_temp_ebb_new_ptr();
176     a1 = tcg_temp_ebb_new_ptr();
177     a2 = tcg_temp_ebb_new_ptr();
178 
179     tcg_gen_addi_ptr(a0, tcg_env, dofs);
180     tcg_gen_addi_ptr(a1, tcg_env, aofs);
181     tcg_gen_addi_ptr(a2, tcg_env, bofs);
182 
183     fn(a0, a1, a2, desc);
184 
185     tcg_temp_free_ptr(a0);
186     tcg_temp_free_ptr(a1);
187     tcg_temp_free_ptr(a2);
188 }
189 
190 /* Generate a call to a gvec-style helper with four vector operands.  */
191 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
192                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
193                         int32_t data, gen_helper_gvec_4 *fn)
194 {
195     TCGv_ptr a0, a1, a2, a3;
196     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
197 
198     a0 = tcg_temp_ebb_new_ptr();
199     a1 = tcg_temp_ebb_new_ptr();
200     a2 = tcg_temp_ebb_new_ptr();
201     a3 = tcg_temp_ebb_new_ptr();
202 
203     tcg_gen_addi_ptr(a0, tcg_env, dofs);
204     tcg_gen_addi_ptr(a1, tcg_env, aofs);
205     tcg_gen_addi_ptr(a2, tcg_env, bofs);
206     tcg_gen_addi_ptr(a3, tcg_env, cofs);
207 
208     fn(a0, a1, a2, a3, desc);
209 
210     tcg_temp_free_ptr(a0);
211     tcg_temp_free_ptr(a1);
212     tcg_temp_free_ptr(a2);
213     tcg_temp_free_ptr(a3);
214 }
215 
216 /* Generate a call to a gvec-style helper with five vector operands.  */
217 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
218                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
219                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
220 {
221     TCGv_ptr a0, a1, a2, a3, a4;
222     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
223 
224     a0 = tcg_temp_ebb_new_ptr();
225     a1 = tcg_temp_ebb_new_ptr();
226     a2 = tcg_temp_ebb_new_ptr();
227     a3 = tcg_temp_ebb_new_ptr();
228     a4 = tcg_temp_ebb_new_ptr();
229 
230     tcg_gen_addi_ptr(a0, tcg_env, dofs);
231     tcg_gen_addi_ptr(a1, tcg_env, aofs);
232     tcg_gen_addi_ptr(a2, tcg_env, bofs);
233     tcg_gen_addi_ptr(a3, tcg_env, cofs);
234     tcg_gen_addi_ptr(a4, tcg_env, xofs);
235 
236     fn(a0, a1, a2, a3, a4, desc);
237 
238     tcg_temp_free_ptr(a0);
239     tcg_temp_free_ptr(a1);
240     tcg_temp_free_ptr(a2);
241     tcg_temp_free_ptr(a3);
242     tcg_temp_free_ptr(a4);
243 }
244 
245 /* Generate a call to a gvec-style helper with two vector operands
246    and an extra pointer operand.  */
247 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
248                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
249                         int32_t data, gen_helper_gvec_2_ptr *fn)
250 {
251     TCGv_ptr a0, a1;
252     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
253 
254     a0 = tcg_temp_ebb_new_ptr();
255     a1 = tcg_temp_ebb_new_ptr();
256 
257     tcg_gen_addi_ptr(a0, tcg_env, dofs);
258     tcg_gen_addi_ptr(a1, tcg_env, aofs);
259 
260     fn(a0, a1, ptr, desc);
261 
262     tcg_temp_free_ptr(a0);
263     tcg_temp_free_ptr(a1);
264 }
265 
266 /* Generate a call to a gvec-style helper with three vector operands
267    and an extra pointer operand.  */
268 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
269                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
270                         int32_t data, gen_helper_gvec_3_ptr *fn)
271 {
272     TCGv_ptr a0, a1, a2;
273     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
274 
275     a0 = tcg_temp_ebb_new_ptr();
276     a1 = tcg_temp_ebb_new_ptr();
277     a2 = tcg_temp_ebb_new_ptr();
278 
279     tcg_gen_addi_ptr(a0, tcg_env, dofs);
280     tcg_gen_addi_ptr(a1, tcg_env, aofs);
281     tcg_gen_addi_ptr(a2, tcg_env, bofs);
282 
283     fn(a0, a1, a2, ptr, desc);
284 
285     tcg_temp_free_ptr(a0);
286     tcg_temp_free_ptr(a1);
287     tcg_temp_free_ptr(a2);
288 }
289 
290 /* Generate a call to a gvec-style helper with four vector operands
291    and an extra pointer operand.  */
292 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
293                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
294                         uint32_t maxsz, int32_t data,
295                         gen_helper_gvec_4_ptr *fn)
296 {
297     TCGv_ptr a0, a1, a2, a3;
298     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
299 
300     a0 = tcg_temp_ebb_new_ptr();
301     a1 = tcg_temp_ebb_new_ptr();
302     a2 = tcg_temp_ebb_new_ptr();
303     a3 = tcg_temp_ebb_new_ptr();
304 
305     tcg_gen_addi_ptr(a0, tcg_env, dofs);
306     tcg_gen_addi_ptr(a1, tcg_env, aofs);
307     tcg_gen_addi_ptr(a2, tcg_env, bofs);
308     tcg_gen_addi_ptr(a3, tcg_env, cofs);
309 
310     fn(a0, a1, a2, a3, ptr, desc);
311 
312     tcg_temp_free_ptr(a0);
313     tcg_temp_free_ptr(a1);
314     tcg_temp_free_ptr(a2);
315     tcg_temp_free_ptr(a3);
316 }
317 
318 /* Generate a call to a gvec-style helper with five vector operands
319    and an extra pointer operand.  */
320 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
321                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
322                         uint32_t oprsz, uint32_t maxsz, int32_t data,
323                         gen_helper_gvec_5_ptr *fn)
324 {
325     TCGv_ptr a0, a1, a2, a3, a4;
326     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
327 
328     a0 = tcg_temp_ebb_new_ptr();
329     a1 = tcg_temp_ebb_new_ptr();
330     a2 = tcg_temp_ebb_new_ptr();
331     a3 = tcg_temp_ebb_new_ptr();
332     a4 = tcg_temp_ebb_new_ptr();
333 
334     tcg_gen_addi_ptr(a0, tcg_env, dofs);
335     tcg_gen_addi_ptr(a1, tcg_env, aofs);
336     tcg_gen_addi_ptr(a2, tcg_env, bofs);
337     tcg_gen_addi_ptr(a3, tcg_env, cofs);
338     tcg_gen_addi_ptr(a4, tcg_env, eofs);
339 
340     fn(a0, a1, a2, a3, a4, ptr, desc);
341 
342     tcg_temp_free_ptr(a0);
343     tcg_temp_free_ptr(a1);
344     tcg_temp_free_ptr(a2);
345     tcg_temp_free_ptr(a3);
346     tcg_temp_free_ptr(a4);
347 }
348 
349 /* Return true if we want to expand an operation on OPRSZ bytes inline,
350    in units of LNSZ bytes.  This limits the expansion of inline code.  */
351 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
352 {
353     uint32_t q, r;
354 
355     if (oprsz < lnsz) {
356         return false;
357     }
358 
359     q = oprsz / lnsz;
360     r = oprsz % lnsz;
361     tcg_debug_assert((r & 7) == 0);
362 
363     if (lnsz < 16) {
364         /* For sizes below 16, accept no remainder. */
365         if (r != 0) {
366             return false;
367         }
368     } else {
369         /*
370          * Recall that ARM SVE allows vector sizes that are not a
371          * power of 2, but always a multiple of 16.  The intent is
372          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
373          * In addition, expand_clr needs to handle a multiple of 8.
374          * Thus we can handle the tail with one more operation per
375          * diminishing power of 2.
376          */
377         q += ctpop32(r);
378     }
379 
380     return q <= MAX_UNROLL;
381 }
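
/*
 * For example, check_size_impl(80, 32) computes q = 2, r = 16, so three
 * operations (2x32 + 1x16) and returns true; check_size_impl(48, 8) would
 * need six 8-byte operations, which exceeds MAX_UNROLL, so it returns false.
 */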
382 
383 static void expand_clr(uint32_t dofs, uint32_t maxsz);
384 
385 /* Duplicate C as per VECE.  */
386 uint64_t (dup_const)(unsigned vece, uint64_t c)
387 {
388     switch (vece) {
389     case MO_8:
390         return 0x0101010101010101ull * (uint8_t)c;
391     case MO_16:
392         return 0x0001000100010001ull * (uint16_t)c;
393     case MO_32:
394         return 0x0000000100000001ull * (uint32_t)c;
395     case MO_64:
396         return c;
397     default:
398         g_assert_not_reached();
399     }
400 }
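
/*
 * For example, dup_const(MO_16, 0xdead1234) is 0x1234123412341234.  The
 * parentheses around the function name above keep the like-named macro,
 * which folds compile-time-constant arguments, from expanding here.
 */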
401 
402 /* Duplicate IN into OUT as per VECE.  */
403 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
404 {
405     switch (vece) {
406     case MO_8:
407         tcg_gen_ext8u_i32(out, in);
408         tcg_gen_muli_i32(out, out, 0x01010101);
409         break;
410     case MO_16:
411         tcg_gen_deposit_i32(out, in, in, 16, 16);
412         break;
413     case MO_32:
414         tcg_gen_mov_i32(out, in);
415         break;
416     default:
417         g_assert_not_reached();
418     }
419 }
420 
421 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
422 {
423     switch (vece) {
424     case MO_8:
425         tcg_gen_ext8u_i64(out, in);
426         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
427         break;
428     case MO_16:
429         tcg_gen_ext16u_i64(out, in);
430         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
431         break;
432     case MO_32:
433         tcg_gen_deposit_i64(out, in, in, 32, 32);
434         break;
435     case MO_64:
436         tcg_gen_mov_i64(out, in);
437         break;
438     default:
439         g_assert_not_reached();
440     }
441 }
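
/*
 * E.g. with VECE == MO_16 and the low 16 bits of IN equal to 0x1234,
 * OUT becomes 0x1234123412341234, matching dup_const() above.
 */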
442 
443 /* Select a supported vector type for implementing an operation on SIZE
444  * bytes.  If LIST is null, assume that whatever operations are required are
445  * supported by all backends.  Otherwise, make sure that all opcodes in LIST
446  * can be performed on elements of size VECE in the selected type.  Do not
447  * select V64 if PREFER_I64 is true.  Return 0 if no vector type is selected.
448  */
449 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
450                                   uint32_t size, bool prefer_i64)
451 {
452     /*
453      * Recall that ARM SVE allows vector sizes that are not a
454      * power of 2, but always a multiple of 16.  The intent is
455      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
456      * It is hard to imagine a case in which v256 is supported
457      * but v128 is not, but check anyway.
458      * In addition, expand_clr needs to handle a multiple of 8.
459      */
460     if (TCG_TARGET_HAS_v256 &&
461         check_size_impl(size, 32) &&
462         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
463         (!(size & 16) ||
464          (TCG_TARGET_HAS_v128 &&
465           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
466         (!(size & 8) ||
467          (TCG_TARGET_HAS_v64 &&
468           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
469         return TCG_TYPE_V256;
470     }
471     if (TCG_TARGET_HAS_v128 &&
472         check_size_impl(size, 16) &&
473         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
474         (!(size & 8) ||
475          (TCG_TARGET_HAS_v64 &&
476           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
477         return TCG_TYPE_V128;
478     }
479     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
480         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
481         return TCG_TYPE_V64;
482     }
483     return 0;
484 }
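
/*
 * For example, with size == 80: if the host supports the required opcodes
 * on both V256 and V128, V256 is returned and the caller expands with
 * 2x32-byte + 1x16-byte operations.  With only V128 available,
 * check_size_impl(80, 16) would need five operations, exceeding MAX_UNROLL,
 * so 0 is returned and the caller falls back to integer expansion or an
 * out-of-line helper.
 */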
485 
486 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
487                          uint32_t maxsz, TCGv_vec t_vec)
488 {
489     uint32_t i = 0;
490 
491     tcg_debug_assert(oprsz >= 8);
492 
493     /*
494      * This may be expand_clr for the tail of an operation, e.g.
495      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
496      * are misaligned wrt the maximum vector size, so do that first.
497      */
498     if (dofs & 8) {
499         tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
500         i += 8;
501     }
502 
503     switch (type) {
504     case TCG_TYPE_V256:
505         /*
506          * Recall that ARM SVE allows vector sizes that are not a
507          * power of 2, but always a multiple of 16.  The intent is
508          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
509          */
510         for (; i + 32 <= oprsz; i += 32) {
511             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V256);
512         }
513         /* fallthru */
514     case TCG_TYPE_V128:
515         for (; i + 16 <= oprsz; i += 16) {
516             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V128);
517         }
518         break;
519     case TCG_TYPE_V64:
520         for (; i < oprsz; i += 8) {
521             tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
522         }
523         break;
524     default:
525         g_assert_not_reached();
526     }
527 
528     if (oprsz < maxsz) {
529         expand_clr(dofs + oprsz, maxsz - oprsz);
530     }
531 }
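
/*
 * For instance, storing a 56-byte tail whose offset has bit 3 set with
 * TCG_TYPE_V256 emits one 8-byte V64 store, one 32-byte store and one
 * 16-byte store (8 + 32 + 16 == 56).
 */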
532 
533 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
534  * Only one of IN_32 or IN_64 may be set;
535  * IN_C is used if IN_32 and IN_64 are unset.
536  */
537 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
538                    uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
539                    uint64_t in_c)
540 {
541     TCGType type;
542     TCGv_i64 t_64;
543     TCGv_i32 t_32, t_desc;
544     TCGv_ptr t_ptr;
545     uint32_t i;
546 
547     assert(vece <= (in_32 ? MO_32 : MO_64));
548     assert(in_32 == NULL || in_64 == NULL);
549 
550     /* If we're storing 0, expand oprsz to maxsz.  */
551     if (in_32 == NULL && in_64 == NULL) {
552         in_c = dup_const(vece, in_c);
553         if (in_c == 0) {
554             oprsz = maxsz;
555             vece = MO_8;
556         } else if (in_c == dup_const(MO_8, in_c)) {
557             vece = MO_8;
558         }
559     }
560 
561     /* Implement inline with a vector type, if possible.
562      * Prefer integer when 64-bit host and no variable dup.
563      */
564     type = choose_vector_type(NULL, vece, oprsz,
565                               (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
566                                && (in_64 == NULL || vece == MO_64)));
567     if (type != 0) {
568         TCGv_vec t_vec = tcg_temp_new_vec(type);
569 
570         if (in_32) {
571             tcg_gen_dup_i32_vec(vece, t_vec, in_32);
572         } else if (in_64) {
573             tcg_gen_dup_i64_vec(vece, t_vec, in_64);
574         } else {
575             tcg_gen_dupi_vec(vece, t_vec, in_c);
576         }
577         do_dup_store(type, dofs, oprsz, maxsz, t_vec);
578         return;
579     }
580 
581     /* Otherwise, inline with an integer type, unless "large".  */
582     if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
583         t_64 = NULL;
584         t_32 = NULL;
585 
586         if (in_32) {
587             /* We are given a 32-bit variable input.  For a 64-bit host,
588                use a 64-bit operation unless the 32-bit operation would
589                be simple enough.  */
590             if (TCG_TARGET_REG_BITS == 64
591                 && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
592                 t_64 = tcg_temp_ebb_new_i64();
593                 tcg_gen_extu_i32_i64(t_64, in_32);
594                 tcg_gen_dup_i64(vece, t_64, t_64);
595             } else {
596                 t_32 = tcg_temp_ebb_new_i32();
597                 tcg_gen_dup_i32(vece, t_32, in_32);
598             }
599         } else if (in_64) {
600             /* We are given a 64-bit variable input.  */
601             t_64 = tcg_temp_ebb_new_i64();
602             tcg_gen_dup_i64(vece, t_64, in_64);
603         } else {
604             /* We are given a constant input.  */
605             /* For 64-bit hosts, use 64-bit constants for "simple" constants
606                or when we'd need too many 32-bit stores, or when a 64-bit
607                constant is really required.  */
608             if (vece == MO_64
609                 || (TCG_TARGET_REG_BITS == 64
610                     && (in_c == 0 || in_c == -1
611                         || !check_size_impl(oprsz, 4)))) {
612                 t_64 = tcg_constant_i64(in_c);
613             } else {
614                 t_32 = tcg_constant_i32(in_c);
615             }
616         }
617 
618         /* Implement inline if we picked an implementation size above.  */
619         if (t_32) {
620             for (i = 0; i < oprsz; i += 4) {
621                 tcg_gen_st_i32(t_32, tcg_env, dofs + i);
622             }
623             tcg_temp_free_i32(t_32);
624             goto done;
625         }
626         if (t_64) {
627             for (i = 0; i < oprsz; i += 8) {
628                 tcg_gen_st_i64(t_64, tcg_env, dofs + i);
629             }
630             tcg_temp_free_i64(t_64);
631             goto done;
632         }
633     }
634 
635     /* Otherwise implement out of line.  */
636     t_ptr = tcg_temp_ebb_new_ptr();
637     tcg_gen_addi_ptr(t_ptr, tcg_env, dofs);
638 
639     /*
640      * This may be expand_clr for the tail of an operation, e.g.
641      * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
642      * wrt simd_desc and will assert.  Simply pass all replicated byte
643      * stores through to memset.
644      */
645     if (oprsz == maxsz && vece == MO_8) {
646         TCGv_ptr t_size = tcg_constant_ptr(oprsz);
647         TCGv_i32 t_val;
648 
649         if (in_32) {
650             t_val = in_32;
651         } else if (in_64) {
652             t_val = tcg_temp_ebb_new_i32();
653             tcg_gen_extrl_i64_i32(t_val, in_64);
654         } else {
655             t_val = tcg_constant_i32(in_c);
656         }
657         gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
658 
659         if (in_64) {
660             tcg_temp_free_i32(t_val);
661         }
662         tcg_temp_free_ptr(t_ptr);
663         return;
664     }
665 
666     t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
667 
668     if (vece == MO_64) {
669         if (in_64) {
670             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
671         } else {
672             t_64 = tcg_constant_i64(in_c);
673             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
674         }
675     } else {
676         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
677         static dup_fn * const fns[3] = {
678             gen_helper_gvec_dup8,
679             gen_helper_gvec_dup16,
680             gen_helper_gvec_dup32
681         };
682 
683         if (in_32) {
684             fns[vece](t_ptr, t_desc, in_32);
685         } else if (in_64) {
686             t_32 = tcg_temp_ebb_new_i32();
687             tcg_gen_extrl_i64_i32(t_32, in_64);
688             fns[vece](t_ptr, t_desc, t_32);
689             tcg_temp_free_i32(t_32);
690         } else {
691             if (vece == MO_8) {
692                 in_c &= 0xff;
693             } else if (vece == MO_16) {
694                 in_c &= 0xffff;
695             }
696             t_32 = tcg_constant_i32(in_c);
697             fns[vece](t_ptr, t_desc, t_32);
698         }
699     }
700 
701     tcg_temp_free_ptr(t_ptr);
702     return;
703 
704  done:
705     if (oprsz < maxsz) {
706         expand_clr(dofs + oprsz, maxsz - oprsz);
707     }
708 }
709 
710 /* Likewise, but with zero.  */
711 static void expand_clr(uint32_t dofs, uint32_t maxsz)
712 {
713     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
714 }
715 
716 /* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
717 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
718                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
719 {
720     TCGv_i32 t0 = tcg_temp_new_i32();
721     TCGv_i32 t1 = tcg_temp_new_i32();
722     uint32_t i;
723 
724     for (i = 0; i < oprsz; i += 4) {
725         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
726         if (load_dest) {
727             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
728         }
729         fni(t1, t0);
730         tcg_gen_st_i32(t1, tcg_env, dofs + i);
731     }
732     tcg_temp_free_i32(t0);
733     tcg_temp_free_i32(t1);
734 }
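
/*
 * Illustrative use only: expand_2_i32(dofs, aofs, oprsz, false,
 * tcg_gen_not_i32) emits one load/not/store sequence per 4 bytes.
 */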
735 
736 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
737                           int32_t c, bool load_dest,
738                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
739 {
740     TCGv_i32 t0 = tcg_temp_new_i32();
741     TCGv_i32 t1 = tcg_temp_new_i32();
742     uint32_t i;
743 
744     for (i = 0; i < oprsz; i += 4) {
745         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
746         if (load_dest) {
747             tcg_gen_ld_i32(t1, tcg_env, dofs + i);
748         }
749         fni(t1, t0, c);
750         tcg_gen_st_i32(t1, tcg_env, dofs + i);
751     }
752     tcg_temp_free_i32(t0);
753     tcg_temp_free_i32(t1);
754 }
755 
756 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
757                           TCGv_i32 c, bool scalar_first,
758                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
759 {
760     TCGv_i32 t0 = tcg_temp_new_i32();
761     TCGv_i32 t1 = tcg_temp_new_i32();
762     uint32_t i;
763 
764     for (i = 0; i < oprsz; i += 4) {
765         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
766         if (scalar_first) {
767             fni(t1, c, t0);
768         } else {
769             fni(t1, t0, c);
770         }
771         tcg_gen_st_i32(t1, tcg_env, dofs + i);
772     }
773     tcg_temp_free_i32(t0);
774     tcg_temp_free_i32(t1);
775 }
776 
777 /* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
778 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
779                          uint32_t bofs, uint32_t oprsz, bool load_dest,
780                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
781 {
782     TCGv_i32 t0 = tcg_temp_new_i32();
783     TCGv_i32 t1 = tcg_temp_new_i32();
784     TCGv_i32 t2 = tcg_temp_new_i32();
785     uint32_t i;
786 
787     for (i = 0; i < oprsz; i += 4) {
788         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
789         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
790         if (load_dest) {
791             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
792         }
793         fni(t2, t0, t1);
794         tcg_gen_st_i32(t2, tcg_env, dofs + i);
795     }
796     tcg_temp_free_i32(t2);
797     tcg_temp_free_i32(t1);
798     tcg_temp_free_i32(t0);
799 }
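
/*
 * Illustrative use only: expand_3_i32(dofs, aofs, bofs, oprsz, false,
 * tcg_gen_add_i32) emits one load/load/add/store sequence per 4 bytes.
 */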
800 
801 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
802                           uint32_t oprsz, int32_t c,
803                           bool load_dest, bool write_aofs,
804                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
805 {
806     TCGv_i32 t0 = tcg_temp_new_i32();
807     TCGv_i32 t1 = tcg_temp_new_i32();
808     TCGv_i32 t2 = tcg_temp_new_i32();
809     uint32_t i;
810 
811     for (i = 0; i < oprsz; i += 4) {
812         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
813         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
814         if (load_dest) {
815             tcg_gen_ld_i32(t2, tcg_env, dofs + i);
816         }
817         fni(t2, t0, t1, c);
818         tcg_gen_st_i32(t2, tcg_env, dofs + i);
819         if (write_aofs) {
820             tcg_gen_st_i32(t0, tcg_env, aofs + i);
821         }
822     }
823     tcg_temp_free_i32(t0);
824     tcg_temp_free_i32(t1);
825     tcg_temp_free_i32(t2);
826 }
827 
828 /* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
829 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
830                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
831                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
832 {
833     TCGv_i32 t0 = tcg_temp_new_i32();
834     TCGv_i32 t1 = tcg_temp_new_i32();
835     TCGv_i32 t2 = tcg_temp_new_i32();
836     TCGv_i32 t3 = tcg_temp_new_i32();
837     uint32_t i;
838 
839     for (i = 0; i < oprsz; i += 4) {
840         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
841         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
842         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
843         fni(t0, t1, t2, t3);
844         tcg_gen_st_i32(t0, tcg_env, dofs + i);
845         if (write_aofs) {
846             tcg_gen_st_i32(t1, tcg_env, aofs + i);
847         }
848     }
849     tcg_temp_free_i32(t3);
850     tcg_temp_free_i32(t2);
851     tcg_temp_free_i32(t1);
852     tcg_temp_free_i32(t0);
853 }
854 
855 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
856                           uint32_t cofs, uint32_t oprsz, int32_t c,
857                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
858                                       int32_t))
859 {
860     TCGv_i32 t0 = tcg_temp_new_i32();
861     TCGv_i32 t1 = tcg_temp_new_i32();
862     TCGv_i32 t2 = tcg_temp_new_i32();
863     TCGv_i32 t3 = tcg_temp_new_i32();
864     uint32_t i;
865 
866     for (i = 0; i < oprsz; i += 4) {
867         tcg_gen_ld_i32(t1, tcg_env, aofs + i);
868         tcg_gen_ld_i32(t2, tcg_env, bofs + i);
869         tcg_gen_ld_i32(t3, tcg_env, cofs + i);
870         fni(t0, t1, t2, t3, c);
871         tcg_gen_st_i32(t0, tcg_env, dofs + i);
872     }
873     tcg_temp_free_i32(t3);
874     tcg_temp_free_i32(t2);
875     tcg_temp_free_i32(t1);
876     tcg_temp_free_i32(t0);
877 }
878 
879 /* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
880 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
881                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
882 {
883     TCGv_i64 t0 = tcg_temp_new_i64();
884     TCGv_i64 t1 = tcg_temp_new_i64();
885     uint32_t i;
886 
887     for (i = 0; i < oprsz; i += 8) {
888         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
889         if (load_dest) {
890             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
891         }
892         fni(t1, t0);
893         tcg_gen_st_i64(t1, tcg_env, dofs + i);
894     }
895     tcg_temp_free_i64(t0);
896     tcg_temp_free_i64(t1);
897 }
898 
899 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
900                           int64_t c, bool load_dest,
901                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
902 {
903     TCGv_i64 t0 = tcg_temp_new_i64();
904     TCGv_i64 t1 = tcg_temp_new_i64();
905     uint32_t i;
906 
907     for (i = 0; i < oprsz; i += 8) {
908         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
909         if (load_dest) {
910             tcg_gen_ld_i64(t1, tcg_env, dofs + i);
911         }
912         fni(t1, t0, c);
913         tcg_gen_st_i64(t1, tcg_env, dofs + i);
914     }
915     tcg_temp_free_i64(t0);
916     tcg_temp_free_i64(t1);
917 }
918 
919 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
920                           TCGv_i64 c, bool scalar_first,
921                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
922 {
923     TCGv_i64 t0 = tcg_temp_new_i64();
924     TCGv_i64 t1 = tcg_temp_new_i64();
925     uint32_t i;
926 
927     for (i = 0; i < oprsz; i += 8) {
928         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
929         if (scalar_first) {
930             fni(t1, c, t0);
931         } else {
932             fni(t1, t0, c);
933         }
934         tcg_gen_st_i64(t1, tcg_env, dofs + i);
935     }
936     tcg_temp_free_i64(t0);
937     tcg_temp_free_i64(t1);
938 }
939 
940 /* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
941 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
942                          uint32_t bofs, uint32_t oprsz, bool load_dest,
943                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
944 {
945     TCGv_i64 t0 = tcg_temp_new_i64();
946     TCGv_i64 t1 = tcg_temp_new_i64();
947     TCGv_i64 t2 = tcg_temp_new_i64();
948     uint32_t i;
949 
950     for (i = 0; i < oprsz; i += 8) {
951         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
952         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
953         if (load_dest) {
954             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
955         }
956         fni(t2, t0, t1);
957         tcg_gen_st_i64(t2, tcg_env, dofs + i);
958     }
959     tcg_temp_free_i64(t2);
960     tcg_temp_free_i64(t1);
961     tcg_temp_free_i64(t0);
962 }
963 
964 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
965                           uint32_t oprsz, int64_t c,
966                           bool load_dest, bool write_aofs,
967                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
968 {
969     TCGv_i64 t0 = tcg_temp_new_i64();
970     TCGv_i64 t1 = tcg_temp_new_i64();
971     TCGv_i64 t2 = tcg_temp_new_i64();
972     uint32_t i;
973 
974     for (i = 0; i < oprsz; i += 8) {
975         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
976         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
977         if (load_dest) {
978             tcg_gen_ld_i64(t2, tcg_env, dofs + i);
979         }
980         fni(t2, t0, t1, c);
981         tcg_gen_st_i64(t2, tcg_env, dofs + i);
982         if (write_aofs) {
983             tcg_gen_st_i64(t0, tcg_env, aofs + i);
984         }
985     }
986     tcg_temp_free_i64(t0);
987     tcg_temp_free_i64(t1);
988     tcg_temp_free_i64(t2);
989 }
990 
991 /* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
992 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
993                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
994                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
995 {
996     TCGv_i64 t0 = tcg_temp_new_i64();
997     TCGv_i64 t1 = tcg_temp_new_i64();
998     TCGv_i64 t2 = tcg_temp_new_i64();
999     TCGv_i64 t3 = tcg_temp_new_i64();
1000     uint32_t i;
1001 
1002     for (i = 0; i < oprsz; i += 8) {
1003         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1004         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1005         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1006         fni(t0, t1, t2, t3);
1007         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1008         if (write_aofs) {
1009             tcg_gen_st_i64(t1, tcg_env, aofs + i);
1010         }
1011     }
1012     tcg_temp_free_i64(t3);
1013     tcg_temp_free_i64(t2);
1014     tcg_temp_free_i64(t1);
1015     tcg_temp_free_i64(t0);
1016 }
1017 
1018 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1019                           uint32_t cofs, uint32_t oprsz, int64_t c,
1020                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
1021                                       int64_t))
1022 {
1023     TCGv_i64 t0 = tcg_temp_new_i64();
1024     TCGv_i64 t1 = tcg_temp_new_i64();
1025     TCGv_i64 t2 = tcg_temp_new_i64();
1026     TCGv_i64 t3 = tcg_temp_new_i64();
1027     uint32_t i;
1028 
1029     for (i = 0; i < oprsz; i += 8) {
1030         tcg_gen_ld_i64(t1, tcg_env, aofs + i);
1031         tcg_gen_ld_i64(t2, tcg_env, bofs + i);
1032         tcg_gen_ld_i64(t3, tcg_env, cofs + i);
1033         fni(t0, t1, t2, t3, c);
1034         tcg_gen_st_i64(t0, tcg_env, dofs + i);
1035     }
1036     tcg_temp_free_i64(t3);
1037     tcg_temp_free_i64(t2);
1038     tcg_temp_free_i64(t1);
1039     tcg_temp_free_i64(t0);
1040 }
1041 
1042 /* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
1043 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1044                          uint32_t oprsz, uint32_t tysz, TCGType type,
1045                          bool load_dest,
1046                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1047 {
1048     for (uint32_t i = 0; i < oprsz; i += tysz) {
1049         TCGv_vec t0 = tcg_temp_new_vec(type);
1050         TCGv_vec t1 = tcg_temp_new_vec(type);
1051 
1052         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1053         if (load_dest) {
1054             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1055         }
1056         fni(vece, t1, t0);
1057         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1058     }
1059 }
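
/*
 * TYSZ is the byte width of TYPE (8 for V64, 16 for V128, 32 for V256);
 * e.g. oprsz == 32 expanded with TCG_TYPE_V128 performs two
 * load/op/store rounds.
 */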
1060 
1061 /* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
1062    using host vectors.  */
1063 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1064                           uint32_t oprsz, uint32_t tysz, TCGType type,
1065                           int64_t c, bool load_dest,
1066                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1067 {
1068     for (uint32_t i = 0; i < oprsz; i += tysz) {
1069         TCGv_vec t0 = tcg_temp_new_vec(type);
1070         TCGv_vec t1 = tcg_temp_new_vec(type);
1071 
1072         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1073         if (load_dest) {
1074             tcg_gen_ld_vec(t1, tcg_env, dofs + i);
1075         }
1076         fni(vece, t1, t0, c);
1077         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1078     }
1079 }
1080 
1081 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1082                           uint32_t oprsz, uint32_t tysz, TCGType type,
1083                           TCGv_vec c, bool scalar_first,
1084                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1085 {
1086     for (uint32_t i = 0; i < oprsz; i += tysz) {
1087         TCGv_vec t0 = tcg_temp_new_vec(type);
1088         TCGv_vec t1 = tcg_temp_new_vec(type);
1089 
1090         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1091         if (scalar_first) {
1092             fni(vece, t1, c, t0);
1093         } else {
1094             fni(vece, t1, t0, c);
1095         }
1096         tcg_gen_st_vec(t1, tcg_env, dofs + i);
1097     }
1098 }
1099 
1100 /* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
1101 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1102                          uint32_t bofs, uint32_t oprsz,
1103                          uint32_t tysz, TCGType type, bool load_dest,
1104                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1105 {
1106     for (uint32_t i = 0; i < oprsz; i += tysz) {
1107         TCGv_vec t0 = tcg_temp_new_vec(type);
1108         TCGv_vec t1 = tcg_temp_new_vec(type);
1109         TCGv_vec t2 = tcg_temp_new_vec(type);
1110 
1111         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1112         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1113         if (load_dest) {
1114             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1115         }
1116         fni(vece, t2, t0, t1);
1117         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1118     }
1119 }
1120 
1121 /*
1122  * Expand OPRSZ bytes worth of three-vector operands and an immediate operand
1123  * using host vectors.
1124  */
1125 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1126                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1127                           TCGType type, int64_t c,
1128                           bool load_dest, bool write_aofs,
1129                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1130                                       int64_t))
1131 {
1132     for (uint32_t i = 0; i < oprsz; i += tysz) {
1133         TCGv_vec t0 = tcg_temp_new_vec(type);
1134         TCGv_vec t1 = tcg_temp_new_vec(type);
1135         TCGv_vec t2 = tcg_temp_new_vec(type);
1136 
1137         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
1138         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
1139         if (load_dest) {
1140             tcg_gen_ld_vec(t2, tcg_env, dofs + i);
1141         }
1142         fni(vece, t2, t0, t1, c);
1143         tcg_gen_st_vec(t2, tcg_env, dofs + i);
1144         if (write_aofs) {
1145             tcg_gen_st_vec(t0, tcg_env, aofs + i);
1146         }
1147     }
1148 }
1149 
1150 /* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
1151 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1152                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1153                          uint32_t tysz, TCGType type, bool write_aofs,
1154                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1155                                      TCGv_vec, TCGv_vec))
1156 {
1157     for (uint32_t i = 0; i < oprsz; i += tysz) {
1158         TCGv_vec t0 = tcg_temp_new_vec(type);
1159         TCGv_vec t1 = tcg_temp_new_vec(type);
1160         TCGv_vec t2 = tcg_temp_new_vec(type);
1161         TCGv_vec t3 = tcg_temp_new_vec(type);
1162 
1163         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1164         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1165         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1166         fni(vece, t0, t1, t2, t3);
1167         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1168         if (write_aofs) {
1169             tcg_gen_st_vec(t1, tcg_env, aofs + i);
1170         }
1171     }
1172 }
1173 
1174 /*
1175  * Expand OPRSZ bytes worth of four-vector operands and an immediate operand
1176  * using host vectors.
1177  */
1178 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1179                           uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1180                           uint32_t tysz, TCGType type, int64_t c,
1181                           void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1182                                      TCGv_vec, TCGv_vec, int64_t))
1183 {
1184     for (uint32_t i = 0; i < oprsz; i += tysz) {
1185         TCGv_vec t0 = tcg_temp_new_vec(type);
1186         TCGv_vec t1 = tcg_temp_new_vec(type);
1187         TCGv_vec t2 = tcg_temp_new_vec(type);
1188         TCGv_vec t3 = tcg_temp_new_vec(type);
1189 
1190         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
1191         tcg_gen_ld_vec(t2, tcg_env, bofs + i);
1192         tcg_gen_ld_vec(t3, tcg_env, cofs + i);
1193         fni(vece, t0, t1, t2, t3, c);
1194         tcg_gen_st_vec(t0, tcg_env, dofs + i);
1195     }
1196 }
1197 
1198 /* Expand a vector two-operand operation.  */
1199 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1200                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1201 {
1202     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1203     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1204     TCGType type;
1205     uint32_t some;
1206 
1207     check_size_align(oprsz, maxsz, dofs | aofs);
1208     check_overlap_2(dofs, aofs, maxsz);
1209 
1210     type = 0;
1211     if (g->fniv) {
1212         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1213     }
1214     switch (type) {
1215     case TCG_TYPE_V256:
1216         /* Recall that ARM SVE allows vector sizes that are not a
1217          * power of 2, but always a multiple of 16.  The intent is
1218          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1219          */
1220         some = QEMU_ALIGN_DOWN(oprsz, 32);
1221         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1222                      g->load_dest, g->fniv);
1223         if (some == oprsz) {
1224             break;
1225         }
1226         dofs += some;
1227         aofs += some;
1228         oprsz -= some;
1229         maxsz -= some;
1230         /* fallthru */
1231     case TCG_TYPE_V128:
1232         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1233                      g->load_dest, g->fniv);
1234         break;
1235     case TCG_TYPE_V64:
1236         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1237                      g->load_dest, g->fniv);
1238         break;
1239 
1240     case 0:
1241         if (g->fni8 && check_size_impl(oprsz, 8)) {
1242             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1243         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1244             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1245         } else {
1246             assert(g->fno != NULL);
1247             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1248             oprsz = maxsz;
1249         }
1250         break;
1251 
1252     default:
1253         g_assert_not_reached();
1254     }
1255     tcg_swap_vecop_list(hold_list);
1256 
1257     if (oprsz < maxsz) {
1258         expand_clr(dofs + oprsz, maxsz - oprsz);
1259     }
1260 }
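
/*
 * A GVecGen2 normally supplies several of the expanders used above.  As an
 * illustrative sketch only (the opcode list and out-of-line helper are
 * whatever the caller declares), a byte-wise NOT could be described as:
 *
 *     static const GVecGen2 g = {
 *         .fni8 = tcg_gen_not_i64,
 *         .fniv = tcg_gen_not_vec,
 *         .fno = gen_helper_gvec_not,
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
 */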
1261 
1262 /* Expand a vector operation with two vectors and an immediate.  */
1263 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1264                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1265 {
1266     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1267     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1268     TCGType type;
1269     uint32_t some;
1270 
1271     check_size_align(oprsz, maxsz, dofs | aofs);
1272     check_overlap_2(dofs, aofs, maxsz);
1273 
1274     type = 0;
1275     if (g->fniv) {
1276         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1277     }
1278     switch (type) {
1279     case TCG_TYPE_V256:
1280         /* Recall that ARM SVE allows vector sizes that are not a
1281          * power of 2, but always a multiple of 16.  The intent is
1282          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1283          */
1284         some = QEMU_ALIGN_DOWN(oprsz, 32);
1285         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1286                       c, g->load_dest, g->fniv);
1287         if (some == oprsz) {
1288             break;
1289         }
1290         dofs += some;
1291         aofs += some;
1292         oprsz -= some;
1293         maxsz -= some;
1294         /* fallthru */
1295     case TCG_TYPE_V128:
1296         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1297                       c, g->load_dest, g->fniv);
1298         break;
1299     case TCG_TYPE_V64:
1300         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1301                       c, g->load_dest, g->fniv);
1302         break;
1303 
1304     case 0:
1305         if (g->fni8 && check_size_impl(oprsz, 8)) {
1306             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1307         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1308             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1309         } else {
1310             if (g->fno) {
1311                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1312             } else {
1313                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1314                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1315                                     maxsz, c, g->fnoi);
1316             }
1317             oprsz = maxsz;
1318         }
1319         break;
1320 
1321     default:
1322         g_assert_not_reached();
1323     }
1324     tcg_swap_vecop_list(hold_list);
1325 
1326     if (oprsz < maxsz) {
1327         expand_clr(dofs + oprsz, maxsz - oprsz);
1328     }
1329 }
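
/*
 * Note the two out-of-line fallbacks above: g->fno receives the immediate
 * through the descriptor's DATA field, while g->fnoi receives it as an
 * explicit TCGv_i64 argument; a GVecGen2i need provide only one of them.
 */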
1330 
1331 /* Expand a vector operation with two vectors and a scalar.  */
1332 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1333                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1334 {
1335     TCGType type;
1336 
1337     check_size_align(oprsz, maxsz, dofs | aofs);
1338     check_overlap_2(dofs, aofs, maxsz);
1339 
1340     type = 0;
1341     if (g->fniv) {
1342         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1343     }
1344     if (type != 0) {
1345         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1346         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1347         TCGv_vec t_vec = tcg_temp_new_vec(type);
1348         uint32_t some;
1349 
1350         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1351 
1352         switch (type) {
1353         case TCG_TYPE_V256:
1354             /* Recall that ARM SVE allows vector sizes that are not a
1355              * power of 2, but always a multiple of 16.  The intent is
1356              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1357              */
1358             some = QEMU_ALIGN_DOWN(oprsz, 32);
1359             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1360                           t_vec, g->scalar_first, g->fniv);
1361             if (some == oprsz) {
1362                 break;
1363             }
1364             dofs += some;
1365             aofs += some;
1366             oprsz -= some;
1367             maxsz -= some;
1368             /* fallthru */
1369 
1370         case TCG_TYPE_V128:
1371             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1372                           t_vec, g->scalar_first, g->fniv);
1373             break;
1374 
1375         case TCG_TYPE_V64:
1376             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1377                           t_vec, g->scalar_first, g->fniv);
1378             break;
1379 
1380         default:
1381             g_assert_not_reached();
1382         }
1383         tcg_temp_free_vec(t_vec);
1384         tcg_swap_vecop_list(hold_list);
1385     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1386         TCGv_i64 t64 = tcg_temp_new_i64();
1387 
1388         tcg_gen_dup_i64(g->vece, t64, c);
1389         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1390         tcg_temp_free_i64(t64);
1391     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1392         TCGv_i32 t32 = tcg_temp_new_i32();
1393 
1394         tcg_gen_extrl_i64_i32(t32, c);
1395         tcg_gen_dup_i32(g->vece, t32, t32);
1396         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1397         tcg_temp_free_i32(t32);
1398     } else {
1399         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1400         return;
1401     }
1402 
1403     if (oprsz < maxsz) {
1404         expand_clr(dofs + oprsz, maxsz - oprsz);
1405     }
1406 }
1407 
1408 /* Expand a vector three-operand operation.  */
1409 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1410                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1411 {
1412     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1413     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1414     TCGType type;
1415     uint32_t some;
1416 
1417     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1418     check_overlap_3(dofs, aofs, bofs, maxsz);
1419 
1420     type = 0;
1421     if (g->fniv) {
1422         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1423     }
1424     switch (type) {
1425     case TCG_TYPE_V256:
1426         /* Recall that ARM SVE allows vector sizes that are not a
1427          * power of 2, but always a multiple of 16.  The intent is
1428          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1429          */
1430         some = QEMU_ALIGN_DOWN(oprsz, 32);
1431         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1432                      g->load_dest, g->fniv);
1433         if (some == oprsz) {
1434             break;
1435         }
1436         dofs += some;
1437         aofs += some;
1438         bofs += some;
1439         oprsz -= some;
1440         maxsz -= some;
1441         /* fallthru */
1442     case TCG_TYPE_V128:
1443         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1444                      g->load_dest, g->fniv);
1445         break;
1446     case TCG_TYPE_V64:
1447         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1448                      g->load_dest, g->fniv);
1449         break;
1450 
1451     case 0:
1452         if (g->fni8 && check_size_impl(oprsz, 8)) {
1453             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1454         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1455             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1456         } else {
1457             assert(g->fno != NULL);
1458             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1459                                maxsz, g->data, g->fno);
1460             oprsz = maxsz;
1461         }
1462         break;
1463 
1464     default:
1465         g_assert_not_reached();
1466     }
1467     tcg_swap_vecop_list(hold_list);
1468 
1469     if (oprsz < maxsz) {
1470         expand_clr(dofs + oprsz, maxsz - oprsz);
1471     }
1472 }
1473 
1474 /* Expand a vector operation with three vectors and an immediate.  */
1475 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1476                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1477                      const GVecGen3i *g)
1478 {
1479     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1480     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1481     TCGType type;
1482     uint32_t some;
1483 
1484     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1485     check_overlap_3(dofs, aofs, bofs, maxsz);
1486 
1487     type = 0;
1488     if (g->fniv) {
1489         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1490     }
1491     switch (type) {
1492     case TCG_TYPE_V256:
1493         /*
1494          * Recall that ARM SVE allows vector sizes that are not a
1495          * power of 2, but always a multiple of 16.  The intent is
1496          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1497          */
1498         some = QEMU_ALIGN_DOWN(oprsz, 32);
1499         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1500                       c, g->load_dest, g->write_aofs, g->fniv);
1501         if (some == oprsz) {
1502             break;
1503         }
1504         dofs += some;
1505         aofs += some;
1506         bofs += some;
1507         oprsz -= some;
1508         maxsz -= some;
1509         /* fallthru */
1510     case TCG_TYPE_V128:
1511         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1512                       c, g->load_dest, g->write_aofs, g->fniv);
1513         break;
1514     case TCG_TYPE_V64:
1515         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1516                       c, g->load_dest, g->write_aofs, g->fniv);
1517         break;
1518 
1519     case 0:
1520         if (g->fni8 && check_size_impl(oprsz, 8)) {
1521             expand_3i_i64(dofs, aofs, bofs, oprsz, c,
1522                           g->load_dest, g->write_aofs, g->fni8);
1523         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1524             expand_3i_i32(dofs, aofs, bofs, oprsz, c,
1525                           g->load_dest, g->write_aofs, g->fni4);
1526         } else {
1527             assert(g->fno != NULL);
1528             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1529             oprsz = maxsz;
1530         }
1531         break;
1532 
1533     default:
1534         g_assert_not_reached();
1535     }
1536     tcg_swap_vecop_list(hold_list);
1537 
1538     if (oprsz < maxsz) {
1539         expand_clr(dofs + oprsz, maxsz - oprsz);
1540     }
1541 }
1542 
1543 /* Expand a vector four-operand operation.  */
1544 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1545                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1546 {
1547     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1548     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1549     TCGType type;
1550     uint32_t some;
1551 
1552     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1553     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1554 
1555     type = 0;
1556     if (g->fniv) {
1557         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1558     }
1559     switch (type) {
1560     case TCG_TYPE_V256:
1561         /* Recall that ARM SVE allows vector sizes that are not a
1562          * power of 2, but always a multiple of 16.  The intent is
1563          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1564          */
1565         some = QEMU_ALIGN_DOWN(oprsz, 32);
1566         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1567                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1568         if (some == oprsz) {
1569             break;
1570         }
1571         dofs += some;
1572         aofs += some;
1573         bofs += some;
1574         cofs += some;
1575         oprsz -= some;
1576         maxsz -= some;
1577         /* fallthru */
1578     case TCG_TYPE_V128:
1579         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1580                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1581         break;
1582     case TCG_TYPE_V64:
1583         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1584                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1585         break;
1586 
1587     case 0:
1588         if (g->fni8 && check_size_impl(oprsz, 8)) {
1589             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1590                          g->write_aofs, g->fni8);
1591         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1592             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1593                          g->write_aofs, g->fni4);
1594         } else {
1595             assert(g->fno != NULL);
1596             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1597                                oprsz, maxsz, g->data, g->fno);
1598             oprsz = maxsz;
1599         }
1600         break;
1601 
1602     default:
1603         g_assert_not_reached();
1604     }
1605     tcg_swap_vecop_list(hold_list);
1606 
1607     if (oprsz < maxsz) {
1608         expand_clr(dofs + oprsz, maxsz - oprsz);
1609     }
1610 }
1611 
1612 /* Expand a vector four-operand operation with an immediate.  */
1613 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1614                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1615                      const GVecGen4i *g)
1616 {
1617     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1618     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1619     TCGType type;
1620     uint32_t some;
1621 
1622     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1623     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1624 
1625     type = 0;
1626     if (g->fniv) {
1627         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1628     }
1629     switch (type) {
1630     case TCG_TYPE_V256:
1631         /*
1632          * Recall that ARM SVE allows vector sizes that are not a
1633          * power of 2, but always a multiple of 16.  The intent is
1634          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1635          */
1636         some = QEMU_ALIGN_DOWN(oprsz, 32);
1637         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1638                       32, TCG_TYPE_V256, c, g->fniv);
1639         if (some == oprsz) {
1640             break;
1641         }
1642         dofs += some;
1643         aofs += some;
1644         bofs += some;
1645         cofs += some;
1646         oprsz -= some;
1647         maxsz -= some;
1648         /* fallthru */
1649     case TCG_TYPE_V128:
1650         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1651                       16, TCG_TYPE_V128, c, g->fniv);
1652         break;
1653     case TCG_TYPE_V64:
1654         expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1655                       8, TCG_TYPE_V64, c, g->fniv);
1656         break;
1657 
1658     case 0:
1659         if (g->fni8 && check_size_impl(oprsz, 8)) {
1660             expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1661         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1662             expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1663         } else {
1664             assert(g->fno != NULL);
1665             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1666                                oprsz, maxsz, c, g->fno);
1667             oprsz = maxsz;
1668         }
1669         break;
1670 
1671     default:
1672         g_assert_not_reached();
1673     }
1674     tcg_swap_vecop_list(hold_list);
1675 
1676     if (oprsz < maxsz) {
1677         expand_clr(dofs + oprsz, maxsz - oprsz);
1678     }
1679 }
1680 
1681 /*
1682  * Expand specific vector operations.
1683  */
1684 
1685 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1686 {
1687     tcg_gen_mov_vec(a, b);
1688 }
1689 
1690 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1691                       uint32_t oprsz, uint32_t maxsz)
1692 {
1693     static const GVecGen2 g = {
1694         .fni8 = tcg_gen_mov_i64,
1695         .fniv = vec_mov2,
1696         .fno = gen_helper_gvec_mov,
1697         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1698     };
1699     if (dofs != aofs) {
1700         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1701     } else {
1702         check_size_align(oprsz, maxsz, dofs);
1703         if (oprsz < maxsz) {
1704             expand_clr(dofs + oprsz, maxsz - oprsz);
1705         }
1706     }
1707 }
1708 
1709 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1710                           uint32_t maxsz, TCGv_i32 in)
1711 {
1712     check_size_align(oprsz, maxsz, dofs);
1713     tcg_debug_assert(vece <= MO_32);
1714     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1715 }
1716 
1717 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1718                           uint32_t maxsz, TCGv_i64 in)
1719 {
1720     check_size_align(oprsz, maxsz, dofs);
1721     tcg_debug_assert(vece <= MO_64);
1722     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1723 }
1724 
1725 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1726                           uint32_t oprsz, uint32_t maxsz)
1727 {
1728     check_size_align(oprsz, maxsz, dofs);
1729     if (vece <= MO_64) {
1730         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1731         if (type != 0) {
1732             TCGv_vec t_vec = tcg_temp_new_vec(type);
1733             tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
1734             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1735         } else if (vece <= MO_32) {
1736             TCGv_i32 in = tcg_temp_ebb_new_i32();
1737             switch (vece) {
1738             case MO_8:
1739                 tcg_gen_ld8u_i32(in, tcg_env, aofs);
1740                 break;
1741             case MO_16:
1742                 tcg_gen_ld16u_i32(in, tcg_env, aofs);
1743                 break;
1744             default:
1745                 tcg_gen_ld_i32(in, tcg_env, aofs);
1746                 break;
1747             }
1748             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1749             tcg_temp_free_i32(in);
1750         } else {
1751             TCGv_i64 in = tcg_temp_ebb_new_i64();
1752             tcg_gen_ld_i64(in, tcg_env, aofs);
1753             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1754             tcg_temp_free_i64(in);
1755         }
1756     } else if (vece == 4) {
1757         /* 128-bit duplicate.  */
1758         int i;
1759 
1760         tcg_debug_assert(oprsz >= 16);
1761         if (TCG_TARGET_HAS_v128) {
1762             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1763 
1764             tcg_gen_ld_vec(in, tcg_env, aofs);
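            /* If the input aliases the output, lane 0 already holds the
               value; start the stores at the second lane.  */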
1765             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1766                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1767             }
1768         } else {
1769             TCGv_i64 in0 = tcg_temp_ebb_new_i64();
1770             TCGv_i64 in1 = tcg_temp_ebb_new_i64();
1771 
1772             tcg_gen_ld_i64(in0, tcg_env, aofs);
1773             tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
1774             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1775                 tcg_gen_st_i64(in0, tcg_env, dofs + i);
1776                 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
1777             }
1778             tcg_temp_free_i64(in0);
1779             tcg_temp_free_i64(in1);
1780         }
1781         if (oprsz < maxsz) {
1782             expand_clr(dofs + oprsz, maxsz - oprsz);
1783         }
1784     } else if (vece == 5) {
1785         /* 256-bit duplicate.  */
1786         int i;
1787 
1788         tcg_debug_assert(oprsz >= 32);
1789         tcg_debug_assert(oprsz % 32 == 0);
1790         if (TCG_TARGET_HAS_v256) {
1791             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1792 
1793             tcg_gen_ld_vec(in, tcg_env, aofs);
1794             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1795                 tcg_gen_st_vec(in, tcg_env, dofs + i);
1796             }
1797         } else if (TCG_TARGET_HAS_v128) {
1798             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1799             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1800 
1801             tcg_gen_ld_vec(in0, tcg_env, aofs);
1802             tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
1803             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1804                 tcg_gen_st_vec(in0, tcg_env, dofs + i);
1805                 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
1806             }
1807         } else {
1808             TCGv_i64 in[4];
1809             int j;
1810 
1811             for (j = 0; j < 4; ++j) {
1812                 in[j] = tcg_temp_ebb_new_i64();
1813                 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
1814             }
1815             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1816                 for (j = 0; j < 4; ++j) {
1817                     tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
1818                 }
1819             }
1820             for (j = 0; j < 4; ++j) {
1821                 tcg_temp_free_i64(in[j]);
1822             }
1823         }
1824         if (oprsz < maxsz) {
1825             expand_clr(dofs + oprsz, maxsz - oprsz);
1826         }
1827     } else {
1828         g_assert_not_reached();
1829     }
1830 }
1831 
1832 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1833                           uint32_t maxsz, uint64_t x)
1834 {
1835     check_size_align(oprsz, maxsz, dofs);
1836     do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1837 }
1838 
1839 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1840                       uint32_t oprsz, uint32_t maxsz)
1841 {
1842     static const GVecGen2 g = {
1843         .fni8 = tcg_gen_not_i64,
1844         .fniv = tcg_gen_not_vec,
1845         .fno = gen_helper_gvec_not,
1846         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1847     };
1848     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1849 }
1850 
1851 /* Perform a vector addition using normal addition and a mask.  The mask
1852    should be the sign bit of each lane.  This 6-operation form is more
1853    efficient than separate additions when there are 4 or more lanes in
1854    the 64-bit operation.  */
1855 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1856 {
1857     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1858     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1859     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
1860 
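    /*
     * With the per-lane sign bits cleared from both inputs, the single
     * wide addition below cannot carry across a lane boundary; the
     * correct sign bit of each lane is then restored by xoring in
     * (a ^ b) & m.
     */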
1861     tcg_gen_andc_i64(t1, a, m);
1862     tcg_gen_andc_i64(t2, b, m);
1863     tcg_gen_xor_i64(t3, a, b);
1864     tcg_gen_add_i64(d, t1, t2);
1865     tcg_gen_and_i64(t3, t3, m);
1866     tcg_gen_xor_i64(d, d, t3);
1867 
1868     tcg_temp_free_i64(t1);
1869     tcg_temp_free_i64(t2);
1870     tcg_temp_free_i64(t3);
1871 }
1872 
1873 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1874 {
1875     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1876     gen_addv_mask(d, a, b, m);
1877 }
1878 
1879 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1880 {
1881     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1882     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1883     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1884     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
1885 
1886     tcg_gen_andc_i32(t1, a, m);
1887     tcg_gen_andc_i32(t2, b, m);
1888     tcg_gen_xor_i32(t3, a, b);
1889     tcg_gen_add_i32(d, t1, t2);
1890     tcg_gen_and_i32(t3, t3, m);
1891     tcg_gen_xor_i32(d, d, t3);
1892 
1893     tcg_temp_free_i32(t1);
1894     tcg_temp_free_i32(t2);
1895     tcg_temp_free_i32(t3);
1896 }
1897 
1898 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1899 {
1900     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1901     gen_addv_mask(d, a, b, m);
1902 }
1903 
1904 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1905 {
1906     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
1907     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
1908 
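    /*
     * t2 carries the correct low half-word sum; t1 redoes the high
     * half-word sum with the low half of 'a' cleared, so that no carry
     * can cross the half-word boundary, and deposit merges the two.
     */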
1909     tcg_gen_andi_i32(t1, a, ~0xffff);
1910     tcg_gen_add_i32(t2, a, b);
1911     tcg_gen_add_i32(t1, t1, b);
1912     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1913 
1914     tcg_temp_free_i32(t1);
1915     tcg_temp_free_i32(t2);
1916 }
1917 
1918 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1919 {
1920     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
1921     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
1922 
1923     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1924     tcg_gen_add_i64(t2, a, b);
1925     tcg_gen_add_i64(t1, t1, b);
1926     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1927 
1928     tcg_temp_free_i64(t1);
1929     tcg_temp_free_i64(t2);
1930 }
1931 
1932 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1933 
1934 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1935                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1936 {
1937     static const GVecGen3 g[4] = {
1938         { .fni8 = tcg_gen_vec_add8_i64,
1939           .fniv = tcg_gen_add_vec,
1940           .fno = gen_helper_gvec_add8,
1941           .opt_opc = vecop_list_add,
1942           .vece = MO_8 },
1943         { .fni8 = tcg_gen_vec_add16_i64,
1944           .fniv = tcg_gen_add_vec,
1945           .fno = gen_helper_gvec_add16,
1946           .opt_opc = vecop_list_add,
1947           .vece = MO_16 },
1948         { .fni4 = tcg_gen_add_i32,
1949           .fniv = tcg_gen_add_vec,
1950           .fno = gen_helper_gvec_add32,
1951           .opt_opc = vecop_list_add,
1952           .vece = MO_32 },
1953         { .fni8 = tcg_gen_add_i64,
1954           .fniv = tcg_gen_add_vec,
1955           .fno = gen_helper_gvec_add64,
1956           .opt_opc = vecop_list_add,
1957           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1958           .vece = MO_64 },
1959     };
1960 
1961     tcg_debug_assert(vece <= MO_64);
1962     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1963 }
1964 
1965 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1966                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1967 {
1968     static const GVecGen2s g[4] = {
1969         { .fni8 = tcg_gen_vec_add8_i64,
1970           .fniv = tcg_gen_add_vec,
1971           .fno = gen_helper_gvec_adds8,
1972           .opt_opc = vecop_list_add,
1973           .vece = MO_8 },
1974         { .fni8 = tcg_gen_vec_add16_i64,
1975           .fniv = tcg_gen_add_vec,
1976           .fno = gen_helper_gvec_adds16,
1977           .opt_opc = vecop_list_add,
1978           .vece = MO_16 },
1979         { .fni4 = tcg_gen_add_i32,
1980           .fniv = tcg_gen_add_vec,
1981           .fno = gen_helper_gvec_adds32,
1982           .opt_opc = vecop_list_add,
1983           .vece = MO_32 },
1984         { .fni8 = tcg_gen_add_i64,
1985           .fniv = tcg_gen_add_vec,
1986           .fno = gen_helper_gvec_adds64,
1987           .opt_opc = vecop_list_add,
1988           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1989           .vece = MO_64 },
1990     };
1991 
1992     tcg_debug_assert(vece <= MO_64);
1993     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1994 }
1995 
1996 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1997                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1998 {
1999     TCGv_i64 tmp = tcg_constant_i64(c);
2000     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
2001 }
2002 
2003 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2004 
2005 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2006                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2007 {
2008     static const GVecGen2s g[4] = {
2009         { .fni8 = tcg_gen_vec_sub8_i64,
2010           .fniv = tcg_gen_sub_vec,
2011           .fno = gen_helper_gvec_subs8,
2012           .opt_opc = vecop_list_sub,
2013           .vece = MO_8 },
2014         { .fni8 = tcg_gen_vec_sub16_i64,
2015           .fniv = tcg_gen_sub_vec,
2016           .fno = gen_helper_gvec_subs16,
2017           .opt_opc = vecop_list_sub,
2018           .vece = MO_16 },
2019         { .fni4 = tcg_gen_sub_i32,
2020           .fniv = tcg_gen_sub_vec,
2021           .fno = gen_helper_gvec_subs32,
2022           .opt_opc = vecop_list_sub,
2023           .vece = MO_32 },
2024         { .fni8 = tcg_gen_sub_i64,
2025           .fniv = tcg_gen_sub_vec,
2026           .fno = gen_helper_gvec_subs64,
2027           .opt_opc = vecop_list_sub,
2028           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2029           .vece = MO_64 },
2030     };
2031 
2032     tcg_debug_assert(vece <= MO_64);
2033     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2034 }
2035 
2036 /* Perform a vector subtraction using normal subtraction and a mask.
2037    Compare gen_addv_mask above.  */
2038 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2039 {
2040     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2041     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2042     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2043 
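    /*
     * Setting the per-lane sign bit in the minuend and clearing it in
     * the subtrahend ensures the single wide subtraction below cannot
     * borrow across a lane boundary; the correct sign bit of each lane
     * is then restored by xoring in ~(a ^ b) & m.
     */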
2044     tcg_gen_or_i64(t1, a, m);
2045     tcg_gen_andc_i64(t2, b, m);
2046     tcg_gen_eqv_i64(t3, a, b);
2047     tcg_gen_sub_i64(d, t1, t2);
2048     tcg_gen_and_i64(t3, t3, m);
2049     tcg_gen_xor_i64(d, d, t3);
2050 
2051     tcg_temp_free_i64(t1);
2052     tcg_temp_free_i64(t2);
2053     tcg_temp_free_i64(t3);
2054 }
2055 
2056 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2057 {
2058     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2059     gen_subv_mask(d, a, b, m);
2060 }
2061 
2062 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2063 {
2064     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2065     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2066     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2067     TCGv_i32 t3 = tcg_temp_ebb_new_i32();
2068 
2069     tcg_gen_or_i32(t1, a, m);
2070     tcg_gen_andc_i32(t2, b, m);
2071     tcg_gen_eqv_i32(t3, a, b);
2072     tcg_gen_sub_i32(d, t1, t2);
2073     tcg_gen_and_i32(t3, t3, m);
2074     tcg_gen_xor_i32(d, d, t3);
2075 
2076     tcg_temp_free_i32(t1);
2077     tcg_temp_free_i32(t2);
2078     tcg_temp_free_i32(t3);
2079 }
2080 
2081 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2082 {
2083     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2084     gen_subv_mask(d, a, b, m);
2085 }
2086 
2087 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2088 {
2089     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
2090     TCGv_i32 t2 = tcg_temp_ebb_new_i32();
2091 
2092     tcg_gen_andi_i32(t1, b, ~0xffff);
2093     tcg_gen_sub_i32(t2, a, b);
2094     tcg_gen_sub_i32(t1, a, t1);
2095     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2096 
2097     tcg_temp_free_i32(t1);
2098     tcg_temp_free_i32(t2);
2099 }
2100 
2101 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2102 {
2103     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2104     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2105 
2106     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2107     tcg_gen_sub_i64(t2, a, b);
2108     tcg_gen_sub_i64(t1, a, t1);
2109     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2110 
2111     tcg_temp_free_i64(t1);
2112     tcg_temp_free_i64(t2);
2113 }
2114 
2115 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2116                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2117 {
2118     static const GVecGen3 g[4] = {
2119         { .fni8 = tcg_gen_vec_sub8_i64,
2120           .fniv = tcg_gen_sub_vec,
2121           .fno = gen_helper_gvec_sub8,
2122           .opt_opc = vecop_list_sub,
2123           .vece = MO_8 },
2124         { .fni8 = tcg_gen_vec_sub16_i64,
2125           .fniv = tcg_gen_sub_vec,
2126           .fno = gen_helper_gvec_sub16,
2127           .opt_opc = vecop_list_sub,
2128           .vece = MO_16 },
2129         { .fni4 = tcg_gen_sub_i32,
2130           .fniv = tcg_gen_sub_vec,
2131           .fno = gen_helper_gvec_sub32,
2132           .opt_opc = vecop_list_sub,
2133           .vece = MO_32 },
2134         { .fni8 = tcg_gen_sub_i64,
2135           .fniv = tcg_gen_sub_vec,
2136           .fno = gen_helper_gvec_sub64,
2137           .opt_opc = vecop_list_sub,
2138           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2139           .vece = MO_64 },
2140     };
2141 
2142     tcg_debug_assert(vece <= MO_64);
2143     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2144 }
2145 
2146 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2147 
2148 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2149                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2150 {
2151     static const GVecGen3 g[4] = {
2152         { .fniv = tcg_gen_mul_vec,
2153           .fno = gen_helper_gvec_mul8,
2154           .opt_opc = vecop_list_mul,
2155           .vece = MO_8 },
2156         { .fniv = tcg_gen_mul_vec,
2157           .fno = gen_helper_gvec_mul16,
2158           .opt_opc = vecop_list_mul,
2159           .vece = MO_16 },
2160         { .fni4 = tcg_gen_mul_i32,
2161           .fniv = tcg_gen_mul_vec,
2162           .fno = gen_helper_gvec_mul32,
2163           .opt_opc = vecop_list_mul,
2164           .vece = MO_32 },
2165         { .fni8 = tcg_gen_mul_i64,
2166           .fniv = tcg_gen_mul_vec,
2167           .fno = gen_helper_gvec_mul64,
2168           .opt_opc = vecop_list_mul,
2169           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2170           .vece = MO_64 },
2171     };
2172 
2173     tcg_debug_assert(vece <= MO_64);
2174     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2175 }
2176 
2177 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2178                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2179 {
2180     static const GVecGen2s g[4] = {
2181         { .fniv = tcg_gen_mul_vec,
2182           .fno = gen_helper_gvec_muls8,
2183           .opt_opc = vecop_list_mul,
2184           .vece = MO_8 },
2185         { .fniv = tcg_gen_mul_vec,
2186           .fno = gen_helper_gvec_muls16,
2187           .opt_opc = vecop_list_mul,
2188           .vece = MO_16 },
2189         { .fni4 = tcg_gen_mul_i32,
2190           .fniv = tcg_gen_mul_vec,
2191           .fno = gen_helper_gvec_muls32,
2192           .opt_opc = vecop_list_mul,
2193           .vece = MO_32 },
2194         { .fni8 = tcg_gen_mul_i64,
2195           .fniv = tcg_gen_mul_vec,
2196           .fno = gen_helper_gvec_muls64,
2197           .opt_opc = vecop_list_mul,
2198           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2199           .vece = MO_64 },
2200     };
2201 
2202     tcg_debug_assert(vece <= MO_64);
2203     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2204 }
2205 
2206 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2207                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2208 {
2209     TCGv_i64 tmp = tcg_constant_i64(c);
2210     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2211 }
2212 
2213 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2214                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2215 {
2216     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2217     static const GVecGen3 g[4] = {
2218         { .fniv = tcg_gen_ssadd_vec,
2219           .fno = gen_helper_gvec_ssadd8,
2220           .opt_opc = vecop_list,
2221           .vece = MO_8 },
2222         { .fniv = tcg_gen_ssadd_vec,
2223           .fno = gen_helper_gvec_ssadd16,
2224           .opt_opc = vecop_list,
2225           .vece = MO_16 },
2226         { .fniv = tcg_gen_ssadd_vec,
2227           .fno = gen_helper_gvec_ssadd32,
2228           .opt_opc = vecop_list,
2229           .vece = MO_32 },
2230         { .fniv = tcg_gen_ssadd_vec,
2231           .fno = gen_helper_gvec_ssadd64,
2232           .opt_opc = vecop_list,
2233           .vece = MO_64 },
2234     };
2235     tcg_debug_assert(vece <= MO_64);
2236     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2237 }
2238 
2239 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2240                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2241 {
2242     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2243     static const GVecGen3 g[4] = {
2244         { .fniv = tcg_gen_sssub_vec,
2245           .fno = gen_helper_gvec_sssub8,
2246           .opt_opc = vecop_list,
2247           .vece = MO_8 },
2248         { .fniv = tcg_gen_sssub_vec,
2249           .fno = gen_helper_gvec_sssub16,
2250           .opt_opc = vecop_list,
2251           .vece = MO_16 },
2252         { .fniv = tcg_gen_sssub_vec,
2253           .fno = gen_helper_gvec_sssub32,
2254           .opt_opc = vecop_list,
2255           .vece = MO_32 },
2256         { .fniv = tcg_gen_sssub_vec,
2257           .fno = gen_helper_gvec_sssub64,
2258           .opt_opc = vecop_list,
2259           .vece = MO_64 },
2260     };
2261     tcg_debug_assert(vece <= MO_64);
2262     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2263 }
2264 
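/*
 * Unsigned saturating addition using plain integer ops: if the addition
 * wraps, the result is less than either operand, so compare against one
 * operand and clamp to all-ones on overflow.
 */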
2265 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2266 {
2267     TCGv_i32 max = tcg_constant_i32(-1);
2268     tcg_gen_add_i32(d, a, b);
2269     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2270 }
2271 
2272 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2273 {
2274     TCGv_i64 max = tcg_constant_i64(-1);
2275     tcg_gen_add_i64(d, a, b);
2276     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2277 }
2278 
2279 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2280                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2281 {
2282     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2283     static const GVecGen3 g[4] = {
2284         { .fniv = tcg_gen_usadd_vec,
2285           .fno = gen_helper_gvec_usadd8,
2286           .opt_opc = vecop_list,
2287           .vece = MO_8 },
2288         { .fniv = tcg_gen_usadd_vec,
2289           .fno = gen_helper_gvec_usadd16,
2290           .opt_opc = vecop_list,
2291           .vece = MO_16 },
2292         { .fni4 = tcg_gen_usadd_i32,
2293           .fniv = tcg_gen_usadd_vec,
2294           .fno = gen_helper_gvec_usadd32,
2295           .opt_opc = vecop_list,
2296           .vece = MO_32 },
2297         { .fni8 = tcg_gen_usadd_i64,
2298           .fniv = tcg_gen_usadd_vec,
2299           .fno = gen_helper_gvec_usadd64,
2300           .opt_opc = vecop_list,
2301           .vece = MO_64 }
2302     };
2303     tcg_debug_assert(vece <= MO_64);
2304     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2305 }
2306 
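/*
 * Unsigned saturating subtraction using plain integer ops: if a < b the
 * true result would be negative, so clamp to zero.
 */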
2307 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2308 {
2309     TCGv_i32 min = tcg_constant_i32(0);
2310     tcg_gen_sub_i32(d, a, b);
2311     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2312 }
2313 
2314 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2315 {
2316     TCGv_i64 min = tcg_constant_i64(0);
2317     tcg_gen_sub_i64(d, a, b);
2318     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2319 }
2320 
2321 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2322                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2323 {
2324     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2325     static const GVecGen3 g[4] = {
2326         { .fniv = tcg_gen_ussub_vec,
2327           .fno = gen_helper_gvec_ussub8,
2328           .opt_opc = vecop_list,
2329           .vece = MO_8 },
2330         { .fniv = tcg_gen_ussub_vec,
2331           .fno = gen_helper_gvec_ussub16,
2332           .opt_opc = vecop_list,
2333           .vece = MO_16 },
2334         { .fni4 = tcg_gen_ussub_i32,
2335           .fniv = tcg_gen_ussub_vec,
2336           .fno = gen_helper_gvec_ussub32,
2337           .opt_opc = vecop_list,
2338           .vece = MO_32 },
2339         { .fni8 = tcg_gen_ussub_i64,
2340           .fniv = tcg_gen_ussub_vec,
2341           .fno = gen_helper_gvec_ussub64,
2342           .opt_opc = vecop_list,
2343           .vece = MO_64 }
2344     };
2345     tcg_debug_assert(vece <= MO_64);
2346     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2347 }
2348 
2349 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2350                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2351 {
2352     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2353     static const GVecGen3 g[4] = {
2354         { .fniv = tcg_gen_smin_vec,
2355           .fno = gen_helper_gvec_smin8,
2356           .opt_opc = vecop_list,
2357           .vece = MO_8 },
2358         { .fniv = tcg_gen_smin_vec,
2359           .fno = gen_helper_gvec_smin16,
2360           .opt_opc = vecop_list,
2361           .vece = MO_16 },
2362         { .fni4 = tcg_gen_smin_i32,
2363           .fniv = tcg_gen_smin_vec,
2364           .fno = gen_helper_gvec_smin32,
2365           .opt_opc = vecop_list,
2366           .vece = MO_32 },
2367         { .fni8 = tcg_gen_smin_i64,
2368           .fniv = tcg_gen_smin_vec,
2369           .fno = gen_helper_gvec_smin64,
2370           .opt_opc = vecop_list,
2371           .vece = MO_64 }
2372     };
2373     tcg_debug_assert(vece <= MO_64);
2374     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2375 }
2376 
2377 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2378                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2379 {
2380     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2381     static const GVecGen3 g[4] = {
2382         { .fniv = tcg_gen_umin_vec,
2383           .fno = gen_helper_gvec_umin8,
2384           .opt_opc = vecop_list,
2385           .vece = MO_8 },
2386         { .fniv = tcg_gen_umin_vec,
2387           .fno = gen_helper_gvec_umin16,
2388           .opt_opc = vecop_list,
2389           .vece = MO_16 },
2390         { .fni4 = tcg_gen_umin_i32,
2391           .fniv = tcg_gen_umin_vec,
2392           .fno = gen_helper_gvec_umin32,
2393           .opt_opc = vecop_list,
2394           .vece = MO_32 },
2395         { .fni8 = tcg_gen_umin_i64,
2396           .fniv = tcg_gen_umin_vec,
2397           .fno = gen_helper_gvec_umin64,
2398           .opt_opc = vecop_list,
2399           .vece = MO_64 }
2400     };
2401     tcg_debug_assert(vece <= MO_64);
2402     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2403 }
2404 
2405 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2406                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2407 {
2408     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2409     static const GVecGen3 g[4] = {
2410         { .fniv = tcg_gen_smax_vec,
2411           .fno = gen_helper_gvec_smax8,
2412           .opt_opc = vecop_list,
2413           .vece = MO_8 },
2414         { .fniv = tcg_gen_smax_vec,
2415           .fno = gen_helper_gvec_smax16,
2416           .opt_opc = vecop_list,
2417           .vece = MO_16 },
2418         { .fni4 = tcg_gen_smax_i32,
2419           .fniv = tcg_gen_smax_vec,
2420           .fno = gen_helper_gvec_smax32,
2421           .opt_opc = vecop_list,
2422           .vece = MO_32 },
2423         { .fni8 = tcg_gen_smax_i64,
2424           .fniv = tcg_gen_smax_vec,
2425           .fno = gen_helper_gvec_smax64,
2426           .opt_opc = vecop_list,
2427           .vece = MO_64 }
2428     };
2429     tcg_debug_assert(vece <= MO_64);
2430     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2431 }
2432 
2433 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2434                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2435 {
2436     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2437     static const GVecGen3 g[4] = {
2438         { .fniv = tcg_gen_umax_vec,
2439           .fno = gen_helper_gvec_umax8,
2440           .opt_opc = vecop_list,
2441           .vece = MO_8 },
2442         { .fniv = tcg_gen_umax_vec,
2443           .fno = gen_helper_gvec_umax16,
2444           .opt_opc = vecop_list,
2445           .vece = MO_16 },
2446         { .fni4 = tcg_gen_umax_i32,
2447           .fniv = tcg_gen_umax_vec,
2448           .fno = gen_helper_gvec_umax32,
2449           .opt_opc = vecop_list,
2450           .vece = MO_32 },
2451         { .fni8 = tcg_gen_umax_i64,
2452           .fniv = tcg_gen_umax_vec,
2453           .fno = gen_helper_gvec_umax64,
2454           .opt_opc = vecop_list,
2455           .vece = MO_64 }
2456     };
2457     tcg_debug_assert(vece <= MO_64);
2458     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2459 }
2460 
2461 /* Perform a vector negation using normal negation and a mask.
2462    Compare gen_subv_mask above.  */
2463 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2464 {
2465     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2466     TCGv_i64 t3 = tcg_temp_ebb_new_i64();
2467 
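    /*
     * Each lane is computed as m - (b & ~m), which cannot borrow across
     * lane boundaries; the sign bit is then fixed up by xoring in
     * m & ~b, yielding 0 - b per lane.
     */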
2468     tcg_gen_andc_i64(t3, m, b);
2469     tcg_gen_andc_i64(t2, b, m);
2470     tcg_gen_sub_i64(d, m, t2);
2471     tcg_gen_xor_i64(d, d, t3);
2472 
2473     tcg_temp_free_i64(t2);
2474     tcg_temp_free_i64(t3);
2475 }
2476 
2477 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2478 {
2479     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2480     gen_negv_mask(d, b, m);
2481 }
2482 
2483 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2484 {
2485     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2486     gen_negv_mask(d, b, m);
2487 }
2488 
2489 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2490 {
2491     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
2492     TCGv_i64 t2 = tcg_temp_ebb_new_i64();
2493 
2494     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2495     tcg_gen_neg_i64(t2, b);
2496     tcg_gen_neg_i64(t1, t1);
2497     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2498 
2499     tcg_temp_free_i64(t1);
2500     tcg_temp_free_i64(t2);
2501 }
2502 
2503 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2504                       uint32_t oprsz, uint32_t maxsz)
2505 {
2506     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2507     static const GVecGen2 g[4] = {
2508         { .fni8 = tcg_gen_vec_neg8_i64,
2509           .fniv = tcg_gen_neg_vec,
2510           .fno = gen_helper_gvec_neg8,
2511           .opt_opc = vecop_list,
2512           .vece = MO_8 },
2513         { .fni8 = tcg_gen_vec_neg16_i64,
2514           .fniv = tcg_gen_neg_vec,
2515           .fno = gen_helper_gvec_neg16,
2516           .opt_opc = vecop_list,
2517           .vece = MO_16 },
2518         { .fni4 = tcg_gen_neg_i32,
2519           .fniv = tcg_gen_neg_vec,
2520           .fno = gen_helper_gvec_neg32,
2521           .opt_opc = vecop_list,
2522           .vece = MO_32 },
2523         { .fni8 = tcg_gen_neg_i64,
2524           .fniv = tcg_gen_neg_vec,
2525           .fno = gen_helper_gvec_neg64,
2526           .opt_opc = vecop_list,
2527           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2528           .vece = MO_64 },
2529     };
2530 
2531     tcg_debug_assert(vece <= MO_64);
2532     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2533 }
2534 
2535 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2536 {
2537     TCGv_i64 t = tcg_temp_ebb_new_i64();
2538     int nbit = 8 << vece;
2539 
2540     /* Create -1 for each negative element.  */
2541     tcg_gen_shri_i64(t, b, nbit - 1);
2542     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2543     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2544 
2545     /*
2546      * Invert (via xor -1) and add one.
2547      * Because of the ordering, the msb is cleared,
2548      * so we never have carry into the next element.
2549      */
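    /* E.g. for MO_8, a lane holding 0xfe (-2) becomes (0xfe ^ 0xff) + 1 = 0x02.  */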
2550     tcg_gen_xor_i64(d, b, t);
2551     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2552     tcg_gen_add_i64(d, d, t);
2553 
2554     tcg_temp_free_i64(t);
2555 }
2556 
2557 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2558 {
2559     gen_absv_mask(d, b, MO_8);
2560 }
2561 
2562 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2563 {
2564     gen_absv_mask(d, b, MO_16);
2565 }
2566 
2567 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2568                       uint32_t oprsz, uint32_t maxsz)
2569 {
2570     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2571     static const GVecGen2 g[4] = {
2572         { .fni8 = tcg_gen_vec_abs8_i64,
2573           .fniv = tcg_gen_abs_vec,
2574           .fno = gen_helper_gvec_abs8,
2575           .opt_opc = vecop_list,
2576           .vece = MO_8 },
2577         { .fni8 = tcg_gen_vec_abs16_i64,
2578           .fniv = tcg_gen_abs_vec,
2579           .fno = gen_helper_gvec_abs16,
2580           .opt_opc = vecop_list,
2581           .vece = MO_16 },
2582         { .fni4 = tcg_gen_abs_i32,
2583           .fniv = tcg_gen_abs_vec,
2584           .fno = gen_helper_gvec_abs32,
2585           .opt_opc = vecop_list,
2586           .vece = MO_32 },
2587         { .fni8 = tcg_gen_abs_i64,
2588           .fniv = tcg_gen_abs_vec,
2589           .fno = gen_helper_gvec_abs64,
2590           .opt_opc = vecop_list,
2591           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2592           .vece = MO_64 },
2593     };
2594 
2595     tcg_debug_assert(vece <= MO_64);
2596     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2597 }
2598 
2599 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2600                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2601 {
2602     static const GVecGen3 g = {
2603         .fni8 = tcg_gen_and_i64,
2604         .fniv = tcg_gen_and_vec,
2605         .fno = gen_helper_gvec_and,
2606         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2607     };
2608 
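    /* With identical operands, a & a == a reduces to a simple move.  */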
2609     if (aofs == bofs) {
2610         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2611     } else {
2612         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2613     }
2614 }
2615 
2616 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2617                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2618 {
2619     static const GVecGen3 g = {
2620         .fni8 = tcg_gen_or_i64,
2621         .fniv = tcg_gen_or_vec,
2622         .fno = gen_helper_gvec_or,
2623         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2624     };
2625 
2626     if (aofs == bofs) {
2627         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2628     } else {
2629         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2630     }
2631 }
2632 
2633 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2634                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2635 {
2636     static const GVecGen3 g = {
2637         .fni8 = tcg_gen_xor_i64,
2638         .fniv = tcg_gen_xor_vec,
2639         .fno = gen_helper_gvec_xor,
2640         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2641     };
2642 
2643     if (aofs == bofs) {
2644         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2645     } else {
2646         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2647     }
2648 }
2649 
2650 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2651                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2652 {
2653     static const GVecGen3 g = {
2654         .fni8 = tcg_gen_andc_i64,
2655         .fniv = tcg_gen_andc_vec,
2656         .fno = gen_helper_gvec_andc,
2657         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2658     };
2659 
2660     if (aofs == bofs) {
2661         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2662     } else {
2663         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2664     }
2665 }
2666 
2667 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2668                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2669 {
2670     static const GVecGen3 g = {
2671         .fni8 = tcg_gen_orc_i64,
2672         .fniv = tcg_gen_orc_vec,
2673         .fno = gen_helper_gvec_orc,
2674         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2675     };
2676 
2677     if (aofs == bofs) {
2678         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2679     } else {
2680         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2681     }
2682 }
2683 
2684 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2685                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2686 {
2687     static const GVecGen3 g = {
2688         .fni8 = tcg_gen_nand_i64,
2689         .fniv = tcg_gen_nand_vec,
2690         .fno = gen_helper_gvec_nand,
2691         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2692     };
2693 
2694     if (aofs == bofs) {
2695         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2696     } else {
2697         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2698     }
2699 }
2700 
2701 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2702                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2703 {
2704     static const GVecGen3 g = {
2705         .fni8 = tcg_gen_nor_i64,
2706         .fniv = tcg_gen_nor_vec,
2707         .fno = gen_helper_gvec_nor,
2708         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2709     };
2710 
2711     if (aofs == bofs) {
2712         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2713     } else {
2714         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2715     }
2716 }
2717 
2718 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2719                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2720 {
2721     static const GVecGen3 g = {
2722         .fni8 = tcg_gen_eqv_i64,
2723         .fniv = tcg_gen_eqv_vec,
2724         .fno = gen_helper_gvec_eqv,
2725         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2726     };
2727 
2728     if (aofs == bofs) {
2729         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2730     } else {
2731         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2732     }
2733 }
2734 
2735 static const GVecGen2s gop_ands = {
2736     .fni8 = tcg_gen_and_i64,
2737     .fniv = tcg_gen_and_vec,
2738     .fno = gen_helper_gvec_ands,
2739     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2740     .vece = MO_64
2741 };
2742 
2743 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2744                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2745 {
2746     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2747     tcg_gen_dup_i64(vece, tmp, c);
2748     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2749     tcg_temp_free_i64(tmp);
2750 }
2751 
2752 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2753                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2754 {
2755     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2756     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2757 }
2758 
2759 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
2760                         TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2761 {
2762     static const GVecGen2s g = {
2763         .fni8 = tcg_gen_andc_i64,
2764         .fniv = tcg_gen_andc_vec,
2765         .fno = gen_helper_gvec_andcs,
2766         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2767         .vece = MO_64
2768     };
2769 
2770     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2771     tcg_gen_dup_i64(vece, tmp, c);
2772     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g);
2773     tcg_temp_free_i64(tmp);
2774 }
2775 
2776 static const GVecGen2s gop_xors = {
2777     .fni8 = tcg_gen_xor_i64,
2778     .fniv = tcg_gen_xor_vec,
2779     .fno = gen_helper_gvec_xors,
2780     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2781     .vece = MO_64
2782 };
2783 
2784 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2785                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2786 {
2787     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2788     tcg_gen_dup_i64(vece, tmp, c);
2789     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2790     tcg_temp_free_i64(tmp);
2791 }
2792 
2793 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2794                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2795 {
2796     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2797     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2798 }
2799 
2800 static const GVecGen2s gop_ors = {
2801     .fni8 = tcg_gen_or_i64,
2802     .fniv = tcg_gen_or_vec,
2803     .fno = gen_helper_gvec_ors,
2804     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2805     .vece = MO_64
2806 };
2807 
2808 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2809                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2810 {
2811     TCGv_i64 tmp = tcg_temp_ebb_new_i64();
2812     tcg_gen_dup_i64(vece, tmp, c);
2813     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2814     tcg_temp_free_i64(tmp);
2815 }
2816 
2817 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2818                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2819 {
2820     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2821     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2822 }
2823 
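/*
 * Per-lane shifts by a constant, done on a full integer register: shift
 * the whole word, then mask away the bits that crossed a lane boundary.
 */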
2824 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2825 {
2826     uint64_t mask = dup_const(MO_8, 0xff << c);
2827     tcg_gen_shli_i64(d, a, c);
2828     tcg_gen_andi_i64(d, d, mask);
2829 }
2830 
2831 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2832 {
2833     uint64_t mask = dup_const(MO_16, 0xffff << c);
2834     tcg_gen_shli_i64(d, a, c);
2835     tcg_gen_andi_i64(d, d, mask);
2836 }
2837 
2838 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2839 {
2840     uint32_t mask = dup_const(MO_8, 0xff << c);
2841     tcg_gen_shli_i32(d, a, c);
2842     tcg_gen_andi_i32(d, d, mask);
2843 }
2844 
2845 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2846 {
2847     uint32_t mask = dup_const(MO_16, 0xffff << c);
2848     tcg_gen_shli_i32(d, a, c);
2849     tcg_gen_andi_i32(d, d, mask);
2850 }
2851 
2852 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2853                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2854 {
2855     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2856     static const GVecGen2i g[4] = {
2857         { .fni8 = tcg_gen_vec_shl8i_i64,
2858           .fniv = tcg_gen_shli_vec,
2859           .fno = gen_helper_gvec_shl8i,
2860           .opt_opc = vecop_list,
2861           .vece = MO_8 },
2862         { .fni8 = tcg_gen_vec_shl16i_i64,
2863           .fniv = tcg_gen_shli_vec,
2864           .fno = gen_helper_gvec_shl16i,
2865           .opt_opc = vecop_list,
2866           .vece = MO_16 },
2867         { .fni4 = tcg_gen_shli_i32,
2868           .fniv = tcg_gen_shli_vec,
2869           .fno = gen_helper_gvec_shl32i,
2870           .opt_opc = vecop_list,
2871           .vece = MO_32 },
2872         { .fni8 = tcg_gen_shli_i64,
2873           .fniv = tcg_gen_shli_vec,
2874           .fno = gen_helper_gvec_shl64i,
2875           .opt_opc = vecop_list,
2876           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2877           .vece = MO_64 },
2878     };
2879 
2880     tcg_debug_assert(vece <= MO_64);
2881     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2882     if (shift == 0) {
2883         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2884     } else {
2885         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2886     }
2887 }
2888 
2889 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2890 {
2891     uint64_t mask = dup_const(MO_8, 0xff >> c);
2892     tcg_gen_shri_i64(d, a, c);
2893     tcg_gen_andi_i64(d, d, mask);
2894 }
2895 
2896 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2897 {
2898     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2899     tcg_gen_shri_i64(d, a, c);
2900     tcg_gen_andi_i64(d, d, mask);
2901 }
2902 
2903 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2904 {
2905     uint32_t mask = dup_const(MO_8, 0xff >> c);
2906     tcg_gen_shri_i32(d, a, c);
2907     tcg_gen_andi_i32(d, d, mask);
2908 }
2909 
2910 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2911 {
2912     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2913     tcg_gen_shri_i32(d, a, c);
2914     tcg_gen_andi_i32(d, d, mask);
2915 }
2916 
2917 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2918                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2919 {
2920     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2921     static const GVecGen2i g[4] = {
2922         { .fni8 = tcg_gen_vec_shr8i_i64,
2923           .fniv = tcg_gen_shri_vec,
2924           .fno = gen_helper_gvec_shr8i,
2925           .opt_opc = vecop_list,
2926           .vece = MO_8 },
2927         { .fni8 = tcg_gen_vec_shr16i_i64,
2928           .fniv = tcg_gen_shri_vec,
2929           .fno = gen_helper_gvec_shr16i,
2930           .opt_opc = vecop_list,
2931           .vece = MO_16 },
2932         { .fni4 = tcg_gen_shri_i32,
2933           .fniv = tcg_gen_shri_vec,
2934           .fno = gen_helper_gvec_shr32i,
2935           .opt_opc = vecop_list,
2936           .vece = MO_32 },
2937         { .fni8 = tcg_gen_shri_i64,
2938           .fniv = tcg_gen_shri_vec,
2939           .fno = gen_helper_gvec_shr64i,
2940           .opt_opc = vecop_list,
2941           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2942           .vece = MO_64 },
2943     };
2944 
2945     tcg_debug_assert(vece <= MO_64);
2946     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2947     if (shift == 0) {
2948         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2949     } else {
2950         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2951     }
2952 }
2953 
2954 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2955 {
2956     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2957     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2958     TCGv_i64 s = tcg_temp_ebb_new_i64();
2959 
2960     tcg_gen_shri_i64(d, a, c);
2961     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2962     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2963     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2964     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2965     tcg_temp_free_i64(s);
2966 }
2967 
2968 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2969 {
2970     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2971     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2972     TCGv_i64 s = tcg_temp_ebb_new_i64();
2973 
2974     tcg_gen_shri_i64(d, a, c);
2975     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2976     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2977     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2978     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2979     tcg_temp_free_i64(s);
2980 }
2981 
2982 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2983 {
2984     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2985     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2986     TCGv_i32 s = tcg_temp_ebb_new_i32();
2987 
2988     tcg_gen_shri_i32(d, a, c);
2989     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2990     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2991     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2992     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2993     tcg_temp_free_i32(s);
2994 }
2995 
2996 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2997 {
2998     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2999     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
3000     TCGv_i32 s = tcg_temp_ebb_new_i32();
3001 
3002     tcg_gen_shri_i32(d, a, c);
3003     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
3004     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
3005     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
3006     tcg_gen_or_i32(d, d, s);         /* include sign extension */
3007     tcg_temp_free_i32(s);
3008 }
3009 
3010 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
3011                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
3012 {
3013     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3014     static const GVecGen2i g[4] = {
3015         { .fni8 = tcg_gen_vec_sar8i_i64,
3016           .fniv = tcg_gen_sari_vec,
3017           .fno = gen_helper_gvec_sar8i,
3018           .opt_opc = vecop_list,
3019           .vece = MO_8 },
3020         { .fni8 = tcg_gen_vec_sar16i_i64,
3021           .fniv = tcg_gen_sari_vec,
3022           .fno = gen_helper_gvec_sar16i,
3023           .opt_opc = vecop_list,
3024           .vece = MO_16 },
3025         { .fni4 = tcg_gen_sari_i32,
3026           .fniv = tcg_gen_sari_vec,
3027           .fno = gen_helper_gvec_sar32i,
3028           .opt_opc = vecop_list,
3029           .vece = MO_32 },
3030         { .fni8 = tcg_gen_sari_i64,
3031           .fniv = tcg_gen_sari_vec,
3032           .fno = gen_helper_gvec_sar64i,
3033           .opt_opc = vecop_list,
3034           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3035           .vece = MO_64 },
3036     };
3037 
3038     tcg_debug_assert(vece <= MO_64);
3039     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3040     if (shift == 0) {
3041         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3042     } else {
3043         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3044     }
3045 }
3046 
3047 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3048 {
3049     uint64_t mask = dup_const(MO_8, 0xff << c);
3050 
3051     tcg_gen_shli_i64(d, a, c);
3052     tcg_gen_shri_i64(a, a, 8 - c);
3053     tcg_gen_andi_i64(d, d, mask);
3054     tcg_gen_andi_i64(a, a, ~mask);
3055     tcg_gen_or_i64(d, d, a);
3056 }
3057 
3058 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3059 {
3060     uint64_t mask = dup_const(MO_16, 0xffff << c);
3061 
3062     tcg_gen_shli_i64(d, a, c);
3063     tcg_gen_shri_i64(a, a, 16 - c);
3064     tcg_gen_andi_i64(d, d, mask);
3065     tcg_gen_andi_i64(a, a, ~mask);
3066     tcg_gen_or_i64(d, d, a);
3067 }
3068 
3069 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3070                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3071 {
3072     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3073     static const GVecGen2i g[4] = {
3074         { .fni8 = tcg_gen_vec_rotl8i_i64,
3075           .fniv = tcg_gen_rotli_vec,
3076           .fno = gen_helper_gvec_rotl8i,
3077           .opt_opc = vecop_list,
3078           .vece = MO_8 },
3079         { .fni8 = tcg_gen_vec_rotl16i_i64,
3080           .fniv = tcg_gen_rotli_vec,
3081           .fno = gen_helper_gvec_rotl16i,
3082           .opt_opc = vecop_list,
3083           .vece = MO_16 },
3084         { .fni4 = tcg_gen_rotli_i32,
3085           .fniv = tcg_gen_rotli_vec,
3086           .fno = gen_helper_gvec_rotl32i,
3087           .opt_opc = vecop_list,
3088           .vece = MO_32 },
3089         { .fni8 = tcg_gen_rotli_i64,
3090           .fniv = tcg_gen_rotli_vec,
3091           .fno = gen_helper_gvec_rotl64i,
3092           .opt_opc = vecop_list,
3093           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3094           .vece = MO_64 },
3095     };
3096 
3097     tcg_debug_assert(vece <= MO_64);
3098     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3099     if (shift == 0) {
3100         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3101     } else {
3102         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3103     }
3104 }
3105 
3106 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3107                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
3108 {
3109     tcg_debug_assert(vece <= MO_64);
3110     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
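         /*
          * A right rotate by SHIFT is implemented as a left rotate by
          * (-SHIFT) reduced modulo the element width, i.e. by
          * (width - SHIFT), or by 0 when SHIFT is 0.
          */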
3111     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3112                        oprsz, maxsz);
3113 }
3114 
3115 /*
3116  * Specialized generation of vector shifts by a non-constant scalar.
3117  */
3118 
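     /*
      * Editor's note: the GVecGen2sh descriptor collects, per element size,
      * the expanders tried by do_gvec_shifts() in order of preference: a
      * vector shift by an i32 scalar (fniv_s, selected via s_list), a
      * vector shift by a vector (fniv_v, selected via v_list, after
      * duplicating the scalar across a vector), a plain i32/i64 expansion
      * (fni4/fni8), and finally an out-of-line helper (fno, indexed by
      * vece).
      */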
3119 typedef struct {
3120     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
3121     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
3122     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
3123     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
3124     gen_helper_gvec_2 *fno[4];
3125     TCGOpcode s_list[2];
3126     TCGOpcode v_list[2];
3127 } GVecGen2sh;
3128 
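     /*
      * Expand OPRSZ bytes worth of two-operand shifts by the i32 scalar
      * SHIFT, using host vectors of TYSZ bytes of the given TYPE.
      */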
3129 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3130                            uint32_t oprsz, uint32_t tysz, TCGType type,
3131                            TCGv_i32 shift,
3132                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3133 {
3134     for (uint32_t i = 0; i < oprsz; i += tysz) {
3135         TCGv_vec t0 = tcg_temp_new_vec(type);
3136         TCGv_vec t1 = tcg_temp_new_vec(type);
3137 
3138         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3139         fni(vece, t1, t0, shift);
3140         tcg_gen_st_vec(t1, tcg_env, dofs + i);
3141     }
3142 }
3143 
3144 static void
3145 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3146                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3147 {
3148     TCGType type;
3149     uint32_t some;
3150 
3151     check_size_align(oprsz, maxsz, dofs | aofs);
3152     check_overlap_2(dofs, aofs, maxsz);
3153 
3154     /* If the backend has a scalar expansion, great.  */
3155     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3156     if (type) {
3157         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3158         switch (type) {
3159         case TCG_TYPE_V256:
3160             some = QEMU_ALIGN_DOWN(oprsz, 32);
3161             expand_2sh_vec(vece, dofs, aofs, some, 32,
3162                            TCG_TYPE_V256, shift, g->fniv_s);
3163             if (some == oprsz) {
3164                 break;
3165             }
3166             dofs += some;
3167             aofs += some;
3168             oprsz -= some;
3169             maxsz -= some;
3170             /* fallthru */
3171         case TCG_TYPE_V128:
3172             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3173                            TCG_TYPE_V128, shift, g->fniv_s);
3174             break;
3175         case TCG_TYPE_V64:
3176             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3177                            TCG_TYPE_V64, shift, g->fniv_s);
3178             break;
3179         default:
3180             g_assert_not_reached();
3181         }
3182         tcg_swap_vecop_list(hold_list);
3183         goto clear_tail;
3184     }
3185 
3186     /* If the backend supports variable vector shifts, also cool.  */
3187     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3188     if (type) {
3189         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3190         TCGv_vec v_shift = tcg_temp_new_vec(type);
3191 
3192         if (vece == MO_64) {
3193             TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3194             tcg_gen_extu_i32_i64(sh64, shift);
3195             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3196             tcg_temp_free_i64(sh64);
3197         } else {
3198             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3199         }
3200 
3201         switch (type) {
3202         case TCG_TYPE_V256:
3203             some = QEMU_ALIGN_DOWN(oprsz, 32);
3204             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3205                           v_shift, false, g->fniv_v);
3206             if (some == oprsz) {
3207                 break;
3208             }
3209             dofs += some;
3210             aofs += some;
3211             oprsz -= some;
3212             maxsz -= some;
3213             /* fallthru */
3214         case TCG_TYPE_V128:
3215             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3216                           v_shift, false, g->fniv_v);
3217             break;
3218         case TCG_TYPE_V64:
3219             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3220                           v_shift, false, g->fniv_v);
3221             break;
3222         default:
3223             g_assert_not_reached();
3224         }
3225         tcg_temp_free_vec(v_shift);
3226         tcg_swap_vecop_list(hold_list);
3227         goto clear_tail;
3228     }
3229 
3230     /* Otherwise fall back to integer expansion or an out-of-line helper.  */
3231     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3232         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3233     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3234         TCGv_i64 sh64 = tcg_temp_ebb_new_i64();
3235         tcg_gen_extu_i32_i64(sh64, shift);
3236         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3237         tcg_temp_free_i64(sh64);
3238     } else {
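             /*
              * Editor's note: the helper call is built by hand rather than
              * through tcg_gen_gvec_2_ool(), presumably because the shift
              * count is only known at runtime; it is OR'd into the data
              * field of the descriptor at SIMD_DATA_SHIFT.
              */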
3239         TCGv_ptr a0 = tcg_temp_ebb_new_ptr();
3240         TCGv_ptr a1 = tcg_temp_ebb_new_ptr();
3241         TCGv_i32 desc = tcg_temp_ebb_new_i32();
3242 
3243         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3244         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3245         tcg_gen_addi_ptr(a0, tcg_env, dofs);
3246         tcg_gen_addi_ptr(a1, tcg_env, aofs);
3247 
3248         g->fno[vece](a0, a1, desc);
3249 
3250         tcg_temp_free_ptr(a0);
3251         tcg_temp_free_ptr(a1);
3252         tcg_temp_free_i32(desc);
3253         return;
3254     }
3255 
3256  clear_tail:
3257     if (oprsz < maxsz) {
3258         expand_clr(dofs + oprsz, maxsz - oprsz);
3259     }
3260 }
3261 
3262 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3263                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3264 {
3265     static const GVecGen2sh g = {
3266         .fni4 = tcg_gen_shl_i32,
3267         .fni8 = tcg_gen_shl_i64,
3268         .fniv_s = tcg_gen_shls_vec,
3269         .fniv_v = tcg_gen_shlv_vec,
3270         .fno = {
3271             gen_helper_gvec_shl8i,
3272             gen_helper_gvec_shl16i,
3273             gen_helper_gvec_shl32i,
3274             gen_helper_gvec_shl64i,
3275         },
3276         .s_list = { INDEX_op_shls_vec, 0 },
3277         .v_list = { INDEX_op_shlv_vec, 0 },
3278     };
3279 
3280     tcg_debug_assert(vece <= MO_64);
3281     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3282 }
3283 
3284 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3285                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3286 {
3287     static const GVecGen2sh g = {
3288         .fni4 = tcg_gen_shr_i32,
3289         .fni8 = tcg_gen_shr_i64,
3290         .fniv_s = tcg_gen_shrs_vec,
3291         .fniv_v = tcg_gen_shrv_vec,
3292         .fno = {
3293             gen_helper_gvec_shr8i,
3294             gen_helper_gvec_shr16i,
3295             gen_helper_gvec_shr32i,
3296             gen_helper_gvec_shr64i,
3297         },
3298         .s_list = { INDEX_op_shrs_vec, 0 },
3299         .v_list = { INDEX_op_shrv_vec, 0 },
3300     };
3301 
3302     tcg_debug_assert(vece <= MO_64);
3303     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3304 }
3305 
3306 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3307                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3308 {
3309     static const GVecGen2sh g = {
3310         .fni4 = tcg_gen_sar_i32,
3311         .fni8 = tcg_gen_sar_i64,
3312         .fniv_s = tcg_gen_sars_vec,
3313         .fniv_v = tcg_gen_sarv_vec,
3314         .fno = {
3315             gen_helper_gvec_sar8i,
3316             gen_helper_gvec_sar16i,
3317             gen_helper_gvec_sar32i,
3318             gen_helper_gvec_sar64i,
3319         },
3320         .s_list = { INDEX_op_sars_vec, 0 },
3321         .v_list = { INDEX_op_sarv_vec, 0 },
3322     };
3323 
3324     tcg_debug_assert(vece <= MO_64);
3325     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3326 }
3327 
3328 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3329                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3330 {
3331     static const GVecGen2sh g = {
3332         .fni4 = tcg_gen_rotl_i32,
3333         .fni8 = tcg_gen_rotl_i64,
3334         .fniv_s = tcg_gen_rotls_vec,
3335         .fniv_v = tcg_gen_rotlv_vec,
3336         .fno = {
3337             gen_helper_gvec_rotl8i,
3338             gen_helper_gvec_rotl16i,
3339             gen_helper_gvec_rotl32i,
3340             gen_helper_gvec_rotl64i,
3341         },
3342         .s_list = { INDEX_op_rotls_vec, 0 },
3343         .v_list = { INDEX_op_rotlv_vec, 0 },
3344     };
3345 
3346     tcg_debug_assert(vece <= MO_64);
3347     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3348 }
3349 
3350 void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3351                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3352 {
3353     TCGv_i32 tmp = tcg_temp_ebb_new_i32();
3354 
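         /*
          * As with tcg_gen_gvec_rotri(), a right rotate by a runtime
          * amount is expanded as a left rotate by the negated amount,
          * reduced modulo the element width.
          */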
3355     tcg_gen_neg_i32(tmp, shift);
3356     tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
3357     tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
3358     tcg_temp_free_i32(tmp);
3359 }
3360 
3361 /*
3362  * Expand D = A << (B % element bits)
3363  *
3364  * Unlike scalar shifts, where the target front end can easily fold
3365  * the modulo into its own expansion, here the modulo is applied as
3366  * part of the generic operation.  If the target's shift naturally
3367  * includes the modulo, great!  If the target has some other behaviour
3368  * for out-of-range shifts, it could not use this function anyway and
3369  * would need its own expansion with custom functions.
3370  */
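     /*
      * For example, with MO_8 elements a per-element shift count of 9
      * is reduced to 9 & 7 == 1 before the shift is applied.
      */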
3371 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3372                                  TCGv_vec a, TCGv_vec b)
3373 {
3374     TCGv_vec t = tcg_temp_new_vec_matching(d);
3375     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3376 
3377     tcg_gen_and_vec(vece, t, b, m);
3378     tcg_gen_shlv_vec(vece, d, a, t);
3379     tcg_temp_free_vec(t);
3380 }
3381 
3382 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3383 {
3384     TCGv_i32 t = tcg_temp_ebb_new_i32();
3385 
3386     tcg_gen_andi_i32(t, b, 31);
3387     tcg_gen_shl_i32(d, a, t);
3388     tcg_temp_free_i32(t);
3389 }
3390 
3391 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3392 {
3393     TCGv_i64 t = tcg_temp_ebb_new_i64();
3394 
3395     tcg_gen_andi_i64(t, b, 63);
3396     tcg_gen_shl_i64(d, a, t);
3397     tcg_temp_free_i64(t);
3398 }
3399 
3400 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3401                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3402 {
3403     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3404     static const GVecGen3 g[4] = {
3405         { .fniv = tcg_gen_shlv_mod_vec,
3406           .fno = gen_helper_gvec_shl8v,
3407           .opt_opc = vecop_list,
3408           .vece = MO_8 },
3409         { .fniv = tcg_gen_shlv_mod_vec,
3410           .fno = gen_helper_gvec_shl16v,
3411           .opt_opc = vecop_list,
3412           .vece = MO_16 },
3413         { .fni4 = tcg_gen_shl_mod_i32,
3414           .fniv = tcg_gen_shlv_mod_vec,
3415           .fno = gen_helper_gvec_shl32v,
3416           .opt_opc = vecop_list,
3417           .vece = MO_32 },
3418         { .fni8 = tcg_gen_shl_mod_i64,
3419           .fniv = tcg_gen_shlv_mod_vec,
3420           .fno = gen_helper_gvec_shl64v,
3421           .opt_opc = vecop_list,
3422           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3423           .vece = MO_64 },
3424     };
3425 
3426     tcg_debug_assert(vece <= MO_64);
3427     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3428 }
3429 
3430 /*
3431  * Similarly for logical right shifts.
3432  */
3433 
3434 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3435                                  TCGv_vec a, TCGv_vec b)
3436 {
3437     TCGv_vec t = tcg_temp_new_vec_matching(d);
3438     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3439 
3440     tcg_gen_and_vec(vece, t, b, m);
3441     tcg_gen_shrv_vec(vece, d, a, t);
3442     tcg_temp_free_vec(t);
3443 }
3444 
3445 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3446 {
3447     TCGv_i32 t = tcg_temp_ebb_new_i32();
3448 
3449     tcg_gen_andi_i32(t, b, 31);
3450     tcg_gen_shr_i32(d, a, t);
3451     tcg_temp_free_i32(t);
3452 }
3453 
3454 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3455 {
3456     TCGv_i64 t = tcg_temp_ebb_new_i64();
3457 
3458     tcg_gen_andi_i64(t, b, 63);
3459     tcg_gen_shr_i64(d, a, t);
3460     tcg_temp_free_i64(t);
3461 }
3462 
3463 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3464                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3465 {
3466     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3467     static const GVecGen3 g[4] = {
3468         { .fniv = tcg_gen_shrv_mod_vec,
3469           .fno = gen_helper_gvec_shr8v,
3470           .opt_opc = vecop_list,
3471           .vece = MO_8 },
3472         { .fniv = tcg_gen_shrv_mod_vec,
3473           .fno = gen_helper_gvec_shr16v,
3474           .opt_opc = vecop_list,
3475           .vece = MO_16 },
3476         { .fni4 = tcg_gen_shr_mod_i32,
3477           .fniv = tcg_gen_shrv_mod_vec,
3478           .fno = gen_helper_gvec_shr32v,
3479           .opt_opc = vecop_list,
3480           .vece = MO_32 },
3481         { .fni8 = tcg_gen_shr_mod_i64,
3482           .fniv = tcg_gen_shrv_mod_vec,
3483           .fno = gen_helper_gvec_shr64v,
3484           .opt_opc = vecop_list,
3485           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3486           .vece = MO_64 },
3487     };
3488 
3489     tcg_debug_assert(vece <= MO_64);
3490     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3491 }
3492 
3493 /*
3494  * Similarly for arithmetic right shifts.
3495  */
3496 
3497 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3498                                  TCGv_vec a, TCGv_vec b)
3499 {
3500     TCGv_vec t = tcg_temp_new_vec_matching(d);
3501     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3502 
3503     tcg_gen_and_vec(vece, t, b, m);
3504     tcg_gen_sarv_vec(vece, d, a, t);
3505     tcg_temp_free_vec(t);
3506 }
3507 
3508 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3509 {
3510     TCGv_i32 t = tcg_temp_ebb_new_i32();
3511 
3512     tcg_gen_andi_i32(t, b, 31);
3513     tcg_gen_sar_i32(d, a, t);
3514     tcg_temp_free_i32(t);
3515 }
3516 
3517 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3518 {
3519     TCGv_i64 t = tcg_temp_ebb_new_i64();
3520 
3521     tcg_gen_andi_i64(t, b, 63);
3522     tcg_gen_sar_i64(d, a, t);
3523     tcg_temp_free_i64(t);
3524 }
3525 
3526 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3527                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3528 {
3529     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3530     static const GVecGen3 g[4] = {
3531         { .fniv = tcg_gen_sarv_mod_vec,
3532           .fno = gen_helper_gvec_sar8v,
3533           .opt_opc = vecop_list,
3534           .vece = MO_8 },
3535         { .fniv = tcg_gen_sarv_mod_vec,
3536           .fno = gen_helper_gvec_sar16v,
3537           .opt_opc = vecop_list,
3538           .vece = MO_16 },
3539         { .fni4 = tcg_gen_sar_mod_i32,
3540           .fniv = tcg_gen_sarv_mod_vec,
3541           .fno = gen_helper_gvec_sar32v,
3542           .opt_opc = vecop_list,
3543           .vece = MO_32 },
3544         { .fni8 = tcg_gen_sar_mod_i64,
3545           .fniv = tcg_gen_sarv_mod_vec,
3546           .fno = gen_helper_gvec_sar64v,
3547           .opt_opc = vecop_list,
3548           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3549           .vece = MO_64 },
3550     };
3551 
3552     tcg_debug_assert(vece <= MO_64);
3553     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3554 }
3555 
3556 /*
3557  * Similarly for rotates.
3558  */
3559 
3560 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3561                                   TCGv_vec a, TCGv_vec b)
3562 {
3563     TCGv_vec t = tcg_temp_new_vec_matching(d);
3564     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3565 
3566     tcg_gen_and_vec(vece, t, b, m);
3567     tcg_gen_rotlv_vec(vece, d, a, t);
3568     tcg_temp_free_vec(t);
3569 }
3570 
3571 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3572 {
3573     TCGv_i32 t = tcg_temp_ebb_new_i32();
3574 
3575     tcg_gen_andi_i32(t, b, 31);
3576     tcg_gen_rotl_i32(d, a, t);
3577     tcg_temp_free_i32(t);
3578 }
3579 
3580 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3581 {
3582     TCGv_i64 t = tcg_temp_ebb_new_i64();
3583 
3584     tcg_gen_andi_i64(t, b, 63);
3585     tcg_gen_rotl_i64(d, a, t);
3586     tcg_temp_free_i64(t);
3587 }
3588 
3589 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3590                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3591 {
3592     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3593     static const GVecGen3 g[4] = {
3594         { .fniv = tcg_gen_rotlv_mod_vec,
3595           .fno = gen_helper_gvec_rotl8v,
3596           .opt_opc = vecop_list,
3597           .vece = MO_8 },
3598         { .fniv = tcg_gen_rotlv_mod_vec,
3599           .fno = gen_helper_gvec_rotl16v,
3600           .opt_opc = vecop_list,
3601           .vece = MO_16 },
3602         { .fni4 = tcg_gen_rotl_mod_i32,
3603           .fniv = tcg_gen_rotlv_mod_vec,
3604           .fno = gen_helper_gvec_rotl32v,
3605           .opt_opc = vecop_list,
3606           .vece = MO_32 },
3607         { .fni8 = tcg_gen_rotl_mod_i64,
3608           .fniv = tcg_gen_rotlv_mod_vec,
3609           .fno = gen_helper_gvec_rotl64v,
3610           .opt_opc = vecop_list,
3611           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3612           .vece = MO_64 },
3613     };
3614 
3615     tcg_debug_assert(vece <= MO_64);
3616     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3617 }
3618 
3619 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3620                                   TCGv_vec a, TCGv_vec b)
3621 {
3622     TCGv_vec t = tcg_temp_new_vec_matching(d);
3623     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3624 
3625     tcg_gen_and_vec(vece, t, b, m);
3626     tcg_gen_rotrv_vec(vece, d, a, t);
3627     tcg_temp_free_vec(t);
3628 }
3629 
3630 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3631 {
3632     TCGv_i32 t = tcg_temp_ebb_new_i32();
3633 
3634     tcg_gen_andi_i32(t, b, 31);
3635     tcg_gen_rotr_i32(d, a, t);
3636     tcg_temp_free_i32(t);
3637 }
3638 
3639 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3640 {
3641     TCGv_i64 t = tcg_temp_ebb_new_i64();
3642 
3643     tcg_gen_andi_i64(t, b, 63);
3644     tcg_gen_rotr_i64(d, a, t);
3645     tcg_temp_free_i64(t);
3646 }
3647 
3648 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3649                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3650 {
3651     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3652     static const GVecGen3 g[4] = {
3653         { .fniv = tcg_gen_rotrv_mod_vec,
3654           .fno = gen_helper_gvec_rotr8v,
3655           .opt_opc = vecop_list,
3656           .vece = MO_8 },
3657         { .fniv = tcg_gen_rotrv_mod_vec,
3658           .fno = gen_helper_gvec_rotr16v,
3659           .opt_opc = vecop_list,
3660           .vece = MO_16 },
3661         { .fni4 = tcg_gen_rotr_mod_i32,
3662           .fniv = tcg_gen_rotrv_mod_vec,
3663           .fno = gen_helper_gvec_rotr32v,
3664           .opt_opc = vecop_list,
3665           .vece = MO_32 },
3666         { .fni8 = tcg_gen_rotr_mod_i64,
3667           .fniv = tcg_gen_rotrv_mod_vec,
3668           .fno = gen_helper_gvec_rotr64v,
3669           .opt_opc = vecop_list,
3670           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3671           .vece = MO_64 },
3672     };
3673 
3674     tcg_debug_assert(vece <= MO_64);
3675     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3676 }
3677 
3678 /* Expand OPRSZ bytes worth of comparisons using i32 elements.  */
3679 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3680                            uint32_t oprsz, TCGCond cond)
3681 {
3682     TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3683     TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3684     uint32_t i;
3685 
3686     for (i = 0; i < oprsz; i += 4) {
3687         tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3688         tcg_gen_ld_i32(t1, tcg_env, bofs + i);
3689         tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3690         tcg_gen_st_i32(t0, tcg_env, dofs + i);
3691     }
3692     tcg_temp_free_i32(t1);
3693     tcg_temp_free_i32(t0);
3694 }
3695 
3696 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3697                            uint32_t oprsz, TCGCond cond)
3698 {
3699     TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3700     TCGv_i64 t1 = tcg_temp_ebb_new_i64();
3701     uint32_t i;
3702 
3703     for (i = 0; i < oprsz; i += 8) {
3704         tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3705         tcg_gen_ld_i64(t1, tcg_env, bofs + i);
3706         tcg_gen_negsetcond_i64(cond, t0, t0, t1);
3707         tcg_gen_st_i64(t0, tcg_env, dofs + i);
3708     }
3709     tcg_temp_free_i64(t1);
3710     tcg_temp_free_i64(t0);
3711 }
3712 
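     /* As above, but using host vector types of TYSZ bytes.  */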
3713 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3714                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3715                            TCGType type, TCGCond cond)
3716 {
3717     for (uint32_t i = 0; i < oprsz; i += tysz) {
3718         TCGv_vec t0 = tcg_temp_new_vec(type);
3719         TCGv_vec t1 = tcg_temp_new_vec(type);
3720         TCGv_vec t2 = tcg_temp_new_vec(type);
3721 
3722         tcg_gen_ld_vec(t0, tcg_env, aofs + i);
3723         tcg_gen_ld_vec(t1, tcg_env, bofs + i);
3724         tcg_gen_cmp_vec(cond, vece, t2, t0, t1);
3725         tcg_gen_st_vec(t2, tcg_env, dofs + i);
3726     }
3727 }
3728 
3729 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3730                       uint32_t aofs, uint32_t bofs,
3731                       uint32_t oprsz, uint32_t maxsz)
3732 {
3733     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3734     static gen_helper_gvec_3 * const eq_fn[4] = {
3735         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3736         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3737     };
3738     static gen_helper_gvec_3 * const ne_fn[4] = {
3739         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3740         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3741     };
3742     static gen_helper_gvec_3 * const lt_fn[4] = {
3743         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3744         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3745     };
3746     static gen_helper_gvec_3 * const le_fn[4] = {
3747         gen_helper_gvec_le8, gen_helper_gvec_le16,
3748         gen_helper_gvec_le32, gen_helper_gvec_le64
3749     };
3750     static gen_helper_gvec_3 * const ltu_fn[4] = {
3751         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3752         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3753     };
3754     static gen_helper_gvec_3 * const leu_fn[4] = {
3755         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3756         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3757     };
3758     static gen_helper_gvec_3 * const * const fns[16] = {
3759         [TCG_COND_EQ] = eq_fn,
3760         [TCG_COND_NE] = ne_fn,
3761         [TCG_COND_LT] = lt_fn,
3762         [TCG_COND_LE] = le_fn,
3763         [TCG_COND_LTU] = ltu_fn,
3764         [TCG_COND_LEU] = leu_fn,
3765     };
3766 
3767     const TCGOpcode *hold_list;
3768     TCGType type;
3769     uint32_t some;
3770 
3771     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3772     check_overlap_3(dofs, aofs, bofs, maxsz);
3773 
3774     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3775         do_dup(MO_8, dofs, oprsz, maxsz,
3776                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3777         return;
3778     }
3779 
3780     /*
3781      * Implement inline with a vector type, if possible.
3782      * Prefer integer when 64-bit host and 64-bit comparison.
3783      */
3784     hold_list = tcg_swap_vecop_list(cmp_list);
3785     type = choose_vector_type(cmp_list, vece, oprsz,
3786                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3787     switch (type) {
3788     case TCG_TYPE_V256:
3789         /* Recall that ARM SVE allows vector sizes that are not a
3790          * power of 2, but always a multiple of 16.  The intent is
3791          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3792          */
3793         some = QEMU_ALIGN_DOWN(oprsz, 32);
3794         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3795         if (some == oprsz) {
3796             break;
3797         }
3798         dofs += some;
3799         aofs += some;
3800         bofs += some;
3801         oprsz -= some;
3802         maxsz -= some;
3803         /* fallthru */
3804     case TCG_TYPE_V128:
3805         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3806         break;
3807     case TCG_TYPE_V64:
3808         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3809         break;
3810 
3811     case 0:
3812         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3813             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3814         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3815             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3816         } else {
3817             gen_helper_gvec_3 * const *fn = fns[cond];
3818 
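                 /*
                  * Helpers exist only for EQ, NE, LT, LE, LTU and LEU;
                  * GT, GE, GTU and GEU are handled by swapping the
                  * operands and the condition.
                  */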
3819             if (fn == NULL) {
3820                 uint32_t tmp;
3821                 tmp = aofs, aofs = bofs, bofs = tmp;
3822                 cond = tcg_swap_cond(cond);
3823                 fn = fns[cond];
3824                 assert(fn != NULL);
3825             }
3826             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3827             oprsz = maxsz;
3828         }
3829         break;
3830 
3831     default:
3832         g_assert_not_reached();
3833     }
3834     tcg_swap_vecop_list(hold_list);
3835 
3836     if (oprsz < maxsz) {
3837         expand_clr(dofs + oprsz, maxsz - oprsz);
3838     }
3839 }
3840 
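     /*
      * Expand OPRSZ bytes worth of comparisons of vector A against C,
      * where C holds the comparison scalar duplicated across each element.
      */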
3841 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3842                             uint32_t oprsz, uint32_t tysz, TCGType type,
3843                             TCGCond cond, TCGv_vec c)
3844 {
3845     TCGv_vec t0 = tcg_temp_new_vec(type);
3846     TCGv_vec t1 = tcg_temp_new_vec(type);
3847     uint32_t i;
3848 
3849     for (i = 0; i < oprsz; i += tysz) {
3850         tcg_gen_ld_vec(t1, tcg_env, aofs + i);
3851         tcg_gen_cmp_vec(cond, vece, t0, t1, c);
3852         tcg_gen_st_vec(t0, tcg_env, dofs + i);
3853     }
3854 }
3855 
3856 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
3857                        uint32_t aofs, TCGv_i64 c,
3858                        uint32_t oprsz, uint32_t maxsz)
3859 {
3860     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3861     static gen_helper_gvec_2i * const eq_fn[4] = {
3862         gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
3863         gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
3864     };
3865     static gen_helper_gvec_2i * const lt_fn[4] = {
3866         gen_helper_gvec_lts8, gen_helper_gvec_lts16,
3867         gen_helper_gvec_lts32, gen_helper_gvec_lts64
3868     };
3869     static gen_helper_gvec_2i * const le_fn[4] = {
3870         gen_helper_gvec_les8, gen_helper_gvec_les16,
3871         gen_helper_gvec_les32, gen_helper_gvec_les64
3872     };
3873     static gen_helper_gvec_2i * const ltu_fn[4] = {
3874         gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
3875         gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
3876     };
3877     static gen_helper_gvec_2i * const leu_fn[4] = {
3878         gen_helper_gvec_leus8, gen_helper_gvec_leus16,
3879         gen_helper_gvec_leus32, gen_helper_gvec_leus64
3880     };
3881     static gen_helper_gvec_2i * const * const fns[16] = {
3882         [TCG_COND_EQ] = eq_fn,
3883         [TCG_COND_LT] = lt_fn,
3884         [TCG_COND_LE] = le_fn,
3885         [TCG_COND_LTU] = ltu_fn,
3886         [TCG_COND_LEU] = leu_fn,
3887     };
3888 
3889     TCGType type;
3890 
3891     check_size_align(oprsz, maxsz, dofs | aofs);
3892     check_overlap_2(dofs, aofs, maxsz);
3893 
3894     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3895         do_dup(MO_8, dofs, oprsz, maxsz,
3896                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3897         return;
3898     }
3899 
3900     /*
3901      * Implement inline with a vector type, if possible.
3902      * Prefer integer when 64-bit host and 64-bit comparison.
3903      */
3904     type = choose_vector_type(cmp_list, vece, oprsz,
3905                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3906     if (type != 0) {
3907         const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
3908         TCGv_vec t_vec = tcg_temp_new_vec(type);
3909         uint32_t some;
3910 
3911         tcg_gen_dup_i64_vec(vece, t_vec, c);
3912         switch (type) {
3913         case TCG_TYPE_V256:
3914             some = QEMU_ALIGN_DOWN(oprsz, 32);
3915             expand_cmps_vec(vece, dofs, aofs, some, 32,
3916                             TCG_TYPE_V256, cond, t_vec);
3917             aofs += some;
3918             dofs += some;
3919             oprsz -= some;
3920             maxsz -= some;
3921             /* fallthru */
3922 
3923         case TCG_TYPE_V128:
3924             some = QEMU_ALIGN_DOWN(oprsz, 16);
3925             expand_cmps_vec(vece, dofs, aofs, some, 16,
3926                             TCG_TYPE_V128, cond, t_vec);
3927             break;
3928 
3929         case TCG_TYPE_V64:
3930             some = QEMU_ALIGN_DOWN(oprsz, 8);
3931             expand_cmps_vec(vece, dofs, aofs, some, 8,
3932                             TCG_TYPE_V64, cond, t_vec);
3933             break;
3934 
3935         default:
3936             g_assert_not_reached();
3937         }
3938         tcg_temp_free_vec(t_vec);
3939         tcg_swap_vecop_list(hold_list);
3940     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3941         TCGv_i64 t0 = tcg_temp_ebb_new_i64();
3942         uint32_t i;
3943 
3944         for (i = 0; i < oprsz; i += 8) {
3945             tcg_gen_ld_i64(t0, tcg_env, aofs + i);
3946             tcg_gen_negsetcond_i64(cond, t0, t0, c);
3947             tcg_gen_st_i64(t0, tcg_env, dofs + i);
3948         }
3949         tcg_temp_free_i64(t0);
3950     } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3951         TCGv_i32 t0 = tcg_temp_ebb_new_i32();
3952         TCGv_i32 t1 = tcg_temp_ebb_new_i32();
3953         uint32_t i;
3954 
3955         tcg_gen_extrl_i64_i32(t1, c);
3956         for (i = 0; i < oprsz; i += 4) {
3957             tcg_gen_ld_i32(t0, tcg_env, aofs + i);
3958             tcg_gen_negsetcond_i32(cond, t0, t0, t1);
3959             tcg_gen_st_i32(t0, tcg_env, dofs + i);
3960         }
3961         tcg_temp_free_i32(t0);
3962         tcg_temp_free_i32(t1);
3963     } else {
3964         gen_helper_gvec_2i * const *fn = fns[cond];
3965         bool inv = false;
3966 
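             /*
              * Editor's note: there are no helpers for the inverted
              * conditions; those are handled by inverting the condition
              * and passing INV in the descriptor data, which the helper
              * presumably uses to flip its result.
              */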
3967         if (fn == NULL) {
3968             cond = tcg_invert_cond(cond);
3969             fn = fns[cond];
3970             assert(fn != NULL);
3971             inv = true;
3972         }
3973         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
3974         return;
3975     }
3976 
3977     if (oprsz < maxsz) {
3978         expand_clr(dofs + oprsz, maxsz - oprsz);
3979     }
3980 }
3981 
3982 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
3983                        uint32_t aofs, int64_t c,
3984                        uint32_t oprsz, uint32_t maxsz)
3985 {
3986     TCGv_i64 tmp = tcg_constant_i64(c);
3987     tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
3988 }
3989 
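     /* Bitwise select with A as the mask: D = (B & A) | (C & ~A).  */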
3990 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3991 {
3992     TCGv_i64 t = tcg_temp_ebb_new_i64();
3993 
3994     tcg_gen_and_i64(t, b, a);
3995     tcg_gen_andc_i64(d, c, a);
3996     tcg_gen_or_i64(d, d, t);
3997     tcg_temp_free_i64(t);
3998 }
3999 
4000 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
4001                          uint32_t bofs, uint32_t cofs,
4002                          uint32_t oprsz, uint32_t maxsz)
4003 {
4004     static const GVecGen4 g = {
4005         .fni8 = tcg_gen_bitsel_i64,
4006         .fniv = tcg_gen_bitsel_vec,
4007         .fno = gen_helper_gvec_bitsel,
4008     };
4009 
4010     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
4011 }
4012