xref: /qemu/target/riscv/vector_helper.c (revision 513823e7521a09ed7ad1e32e6454bac3b2cbf52d)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg-gvec-desc.h"
30 #include "internals.h"
31 #include "vector_internals.h"
32 #include <math.h>
33 
34 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                             target_ulong s2)
36 {
37     int vlmax, vl;
38     RISCVCPU *cpu = env_archcpu(env);
39     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41     uint16_t sew = 8 << vsew;
42     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43     int xlen = riscv_cpu_xlen(env);
44     bool vill = (s2 >> (xlen - 1)) & 0x1;
45     target_ulong reserved = s2 &
46                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48     uint16_t vlen = cpu->cfg.vlenb << 3;
49     int8_t lmul;
50 
51     if (vlmul & 4) {
52         /*
53          * Fractional LMUL, check:
54          *
55          * VLEN * LMUL >= SEW
56          * VLEN >> (8 - lmul) >= sew
57          * (vlenb << 3) >> (8 - lmul) >= sew
58          */
59         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* only set vill bit. */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
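    /*
     * VLMAX = LMUL * VLEN / SEW.  Per the spec, when VLMAX < AVL < 2 * VLMAX
     * an implementation may set vl = ceil(AVL / 2); that is the behaviour
     * selected by the rvv_vl_half_avl option below.  Otherwise
     * vl = min(AVL, VLMAX).
     */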
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
79         vl = (s1 + 1) >> 1;
80     } else {
81         vl = vlmax;
82     }
83     env->vl = vl;
84     env->vtype = s2;
85     env->vstart = 0;
86     env->vill = 0;
87     return vl;
88 }
89 
90 /*
91  * Get the maximum number of elements that can be operated on.
92  *
93  * log2_esz: log2 of element size in bytes.
94  */
95 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
96 {
97     /*
98      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
99      * so vlen in bytes (vlenb) is encoded as maxsz.
100      */
101     uint32_t vlenb = simd_maxsz(desc);
102 
103     /* Return VLMAX */
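    /*
     * e.g. with vlenb = 16 (VLEN = 128), LMUL = 1 (vext_lmul(desc) = 0) and
     * SEW = 32 (log2_esz = 2): scale = -2 and VLMAX = 16 >> 2 = 4 elements.
     */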
104     int scale = vext_lmul(desc) - log2_esz;
105     return scale < 0 ? vlenb >> -scale : vlenb << scale;
106 }
107 
108 /*
109  * This function checks the watchpoint before the real load operation.
110  *
111  * In system mode, the TLB API probe_access is enough for the watchpoint check.
112  * In user mode, there is no watchpoint support for now.
113  *
114  * It will trigger an exception if there is no mapping in the TLB
115  * and the page table walk can't fill the TLB entry. Then the guest
116  * software can return here after processing the exception, or never return.
117  */
118 static void probe_pages(CPURISCVState *env, target_ulong addr,
119                         target_ulong len, uintptr_t ra,
120                         MMUAccessType access_type)
121 {
122     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
123     target_ulong curlen = MIN(pagelen, len);
124     int mmu_index = riscv_env_mmu_index(env, false);
125 
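    /*
     * The access may span two pages: probe the bytes that fit on the first
     * page, then probe the remainder on the following page.
     */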
126     probe_access(env, adjust_addr(env, addr), curlen, access_type,
127                  mmu_index, ra);
128     if (len > curlen) {
129         addr += curlen;
130         curlen = len - curlen;
131         probe_access(env, adjust_addr(env, addr), curlen, access_type,
132                      mmu_index, ra);
133     }
134 }
135 
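/*
 * Mask registers hold one bit per element, packed into host uint64_t words;
 * this stores @value into bit @index of the mask register @v0.
 */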
136 static inline void vext_set_elem_mask(void *v0, int index,
137                                       uint8_t value)
138 {
139     int idx = index / 64;
140     int pos = index % 64;
141     uint64_t old = ((uint64_t *)v0)[idx];
142     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
143 }
144 
145 /* element operations for load and store */
146 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
147                                    uint32_t idx, void *vd, uintptr_t retaddr);
148 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
149 
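/*
 * Each element accessor comes in two flavours: a *_tlb version that goes
 * through the full softmmu path (cpu_ld/st*_data_ra), and a *_host version
 * that reads/writes directly through a host pointer already validated by
 * probe_access_flags().  H() adjusts the element index for the host byte
 * order (it is a no-op on little-endian hosts).
 */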
150 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
151 static inline QEMU_ALWAYS_INLINE                            \
152 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
153                 uint32_t idx, void *vd, uintptr_t retaddr)  \
154 {                                                           \
155     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
156     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
157 }                                                           \
158                                                             \
159 static inline QEMU_ALWAYS_INLINE                            \
160 void NAME##_host(void *vd, uint32_t idx, void *host)        \
161 {                                                           \
162     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
163     *cur = (ETYPE)LDSUF##_p(host);                          \
164 }
165 
166 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
167 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
168 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
169 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
170 
171 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
172 static inline QEMU_ALWAYS_INLINE                            \
173 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
174                 uint32_t idx, void *vd, uintptr_t retaddr)  \
175 {                                                           \
176     ETYPE data = *((ETYPE *)vd + H(idx));                   \
177     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
178 }                                                           \
179                                                             \
180 static inline QEMU_ALWAYS_INLINE                            \
181 void NAME##_host(void *vd, uint32_t idx, void *host)        \
182 {                                                           \
183     ETYPE data = *((ETYPE *)vd + H(idx));                   \
184     STSUF##_p(host, data);                                  \
185 }
186 
187 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
188 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
189 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
190 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
191 
192 static inline QEMU_ALWAYS_INLINE void
193 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
194                        void *vd, uint32_t evl, target_ulong addr,
195                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
196                        bool is_load)
197 {
198     uint32_t i;
199     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
200         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
201     }
202 }
203 
204 static inline QEMU_ALWAYS_INLINE void
205 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
206                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
207                         uint32_t esz, bool is_load)
208 {
209 #if HOST_BIG_ENDIAN
210     for (; reg_start < evl; reg_start++, host += esz) {
211         ldst_host(vd, reg_start, host);
212     }
213 #else
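    /*
     * Little-endian host: single-byte elements need no per-element byte
     * swapping, so the whole range can be moved with one memcpy.
     */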
214     if (esz == 1) {
215         uint32_t byte_offset = reg_start * esz;
216         uint32_t size = (evl - reg_start) * esz;
217 
218         if (is_load) {
219             memcpy(vd + byte_offset, host, size);
220         } else {
221             memcpy(host, vd + byte_offset, size);
222         }
223     } else {
224         for (; reg_start < evl; reg_start++, host += esz) {
225             ldst_host(vd, reg_start, host);
226         }
227     }
228 #endif
229 }
230 
231 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
232                                    uint32_t desc, uint32_t nf,
233                                    uint32_t esz, uint32_t max_elems)
234 {
235     uint32_t vta = vext_vta(desc);
236     int k;
237 
238     if (vta == 0) {
239         return;
240     }
241 
242     for (k = 0; k < nf; ++k) {
243         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
244                           (k * max_elems + max_elems) * esz);
245     }
246 }
247 
248 /*
249  * stride: access vector elements from strided memory
250  */
251 static void
252 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
253                  CPURISCVState *env, uint32_t desc, uint32_t vm,
254                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
255                  uintptr_t ra)
256 {
257     uint32_t i, k;
258     uint32_t nf = vext_nf(desc);
259     uint32_t max_elems = vext_max_elems(desc, log2_esz);
260     uint32_t esz = 1 << log2_esz;
261     uint32_t vma = vext_vma(desc);
262 
263     VSTART_CHECK_EARLY_EXIT(env);
264 
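    /*
     * For segment accesses (nf > 1), field k of element i is kept at
     * index i + k * max_elems of vd, i.e. each field occupies its own
     * register group.
     */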
265     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
266         k = 0;
267         while (k < nf) {
268             if (!vm && !vext_elem_mask(v0, i)) {
269                 /* set masked-off elements to 1s */
270                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
271                                   (i + k * max_elems + 1) * esz);
272                 k++;
273                 continue;
274             }
275             target_ulong addr = base + stride * i + (k << log2_esz);
276             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
277             k++;
278         }
279     }
280     env->vstart = 0;
281 
282     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
283 }
284 
285 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
286 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
287                   target_ulong stride, CPURISCVState *env,              \
288                   uint32_t desc)                                        \
289 {                                                                       \
290     uint32_t vm = vext_vm(desc);                                        \
291     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
292                      ctzl(sizeof(ETYPE)), GETPC());                     \
293 }
294 
295 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
296 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
297 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
298 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
299 
300 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
301 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
302                   target_ulong stride, CPURISCVState *env,              \
303                   uint32_t desc)                                        \
304 {                                                                       \
305     uint32_t vm = vext_vm(desc);                                        \
306     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
307                      ctzl(sizeof(ETYPE)), GETPC());                     \
308 }
309 
310 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
311 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
312 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
313 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
314 
315 /*
316  * unit-stride: access elements stored contiguously in memory
317  */
318 
319 /* unmasked unit-stride load and store operation */
320 static inline QEMU_ALWAYS_INLINE void
321 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
322                   uint32_t elems, uint32_t nf, uint32_t max_elems,
323                   uint32_t log2_esz, bool is_load, int mmu_index,
324                   vext_ldst_elem_fn_tlb *ldst_tlb,
325                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
326 {
327     void *host;
328     int i, k, flags;
329     uint32_t esz = 1 << log2_esz;
330     uint32_t size = (elems * nf) << log2_esz;
331     uint32_t evl = env->vstart + elems;
332     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
333 
334     /* Check page permission/pmp/watchpoint/etc. */
335     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
336                                mmu_index, true, &host, ra);
337 
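    /*
     * flags == 0 means the whole range is plain RAM with no watchpoints or
     * MMIO, so elements can be accessed directly through the host pointer;
     * otherwise fall back to per-element TLB accesses.
     */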
338     if (flags == 0) {
339         if (nf == 1) {
340             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
341                                       host, esz, is_load);
342         } else {
343             for (i = env->vstart; i < evl; ++i) {
344                 k = 0;
345                 while (k < nf) {
346                     ldst_host(vd, i + k * max_elems, host);
347                     host += esz;
348                     k++;
349                 }
350             }
351         }
352         env->vstart += elems;
353     } else {
354         if (nf == 1) {
355             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
356                                    ra, esz, is_load);
357         } else {
358             /* load bytes from guest memory */
359             for (i = env->vstart; i < evl; env->vstart = ++i) {
360                 k = 0;
361                 while (k < nf) {
362                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
363                              vd, ra);
364                     addr += esz;
365                     k++;
366                 }
367             }
368         }
369     }
370 }
371 
372 static inline QEMU_ALWAYS_INLINE void
373 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
374              vext_ldst_elem_fn_tlb *ldst_tlb,
375              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
376              uint32_t evl, uintptr_t ra, bool is_load)
377 {
378     uint32_t k;
379     target_ulong page_split, elems, addr;
380     uint32_t nf = vext_nf(desc);
381     uint32_t max_elems = vext_max_elems(desc, log2_esz);
382     uint32_t esz = 1 << log2_esz;
383     uint32_t msize = nf * esz;
384     int mmu_index = riscv_env_mmu_index(env, false);
385 
386     if (env->vstart >= evl) {
387         env->vstart = 0;
388         return;
389     }
390 
391 #if defined(CONFIG_USER_ONLY)
392     /*
393      * For data sizes <= 6 bytes we get better performance by simply calling
394      * vext_continuous_ldst_tlb
395      */
396     if (nf == 1 && (evl << log2_esz) <= 6) {
397         addr = base + (env->vstart << log2_esz);
398         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
399                                  esz, is_load);
400 
401         env->vstart = 0;
402         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
403         return;
404     }
405 #endif
406 
407     /* Calculate the page range of first page */
408     addr = base + ((env->vstart * nf) << log2_esz);
409     page_split = -(addr | TARGET_PAGE_MASK);
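    /* page_split = number of bytes left on the current page starting at addr */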
410     /* Get number of elements */
411     elems = page_split / msize;
412     if (unlikely(env->vstart + elems >= evl)) {
413         elems = evl - env->vstart;
414     }
415 
416     /* Load/store elements in the first page */
417     if (likely(elems)) {
418         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
419                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
420     }
421 
422     /* Load/store elements in the second page */
423     if (unlikely(env->vstart < evl)) {
424         /* Cross page element */
425         if (unlikely(page_split % msize)) {
426             for (k = 0; k < nf; k++) {
427                 addr = base + ((env->vstart * nf + k) << log2_esz);
428                 ldst_tlb(env, adjust_addr(env, addr),
429                         env->vstart + k * max_elems, vd, ra);
430             }
431             env->vstart++;
432         }
433 
434         addr = base + ((env->vstart * nf) << log2_esz);
435         /* Get number of elements of second page */
436         elems = evl - env->vstart;
437 
438         /* Load/store elements in the second page */
439         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
440                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
441     }
442 
443     env->vstart = 0;
444     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
445 }
446 
447 /*
448  * A masked unit-stride load or store operation is handled as a special case
449  * of a strided operation, with stride = NF * sizeof(ETYPE).
450  */
451 
452 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
453 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
454                          CPURISCVState *env, uint32_t desc)         \
455 {                                                                   \
456     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
457     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
458                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
459 }                                                                   \
460                                                                     \
461 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
462                   CPURISCVState *env, uint32_t desc)                \
463 {                                                                   \
464     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
465                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
466 }
467 
468 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
469 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
470 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
471 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
472 
473 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
474 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
475                          CPURISCVState *env, uint32_t desc)              \
476 {                                                                        \
477     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
478     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
479                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
480 }                                                                        \
481                                                                          \
482 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
483                   CPURISCVState *env, uint32_t desc)                     \
484 {                                                                        \
485     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
486                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
487 }
488 
489 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
490 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
491 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
492 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
493 
494 /*
495  * unit stride mask load and store, EEW = 1
496  */
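/*
 * The mask holds one bit per element, so these transfer ceil(vl / 8) bytes,
 * e.g. vl = 17 gives evl = 3 bytes.
 */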
497 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
498                     CPURISCVState *env, uint32_t desc)
499 {
500     /* evl = ceil(vl/8) */
501     uint8_t evl = (env->vl + 7) >> 3;
502     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
503                  0, evl, GETPC(), true);
504 }
505 
506 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
507                     CPURISCVState *env, uint32_t desc)
508 {
509     /* evl = ceil(vl/8) */
510     uint8_t evl = (env->vl + 7) >> 3;
511     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
512                  0, evl, GETPC(), false);
513 }
514 
515 /*
516  * index: access vector elements from indexed memory
517  */
518 typedef target_ulong vext_get_index_addr(target_ulong base,
519         uint32_t idx, void *vs2);
520 
521 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
522 static target_ulong NAME(target_ulong base,            \
523                          uint32_t idx, void *vs2)      \
524 {                                                      \
525     return (base + *((ETYPE *)vs2 + H(idx)));          \
526 }
527 
528 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
529 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
530 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
531 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
532 
533 static inline void
534 vext_ldst_index(void *vd, void *v0, target_ulong base,
535                 void *vs2, CPURISCVState *env, uint32_t desc,
536                 vext_get_index_addr get_index_addr,
537                 vext_ldst_elem_fn_tlb *ldst_elem,
538                 uint32_t log2_esz, uintptr_t ra)
539 {
540     uint32_t i, k;
541     uint32_t nf = vext_nf(desc);
542     uint32_t vm = vext_vm(desc);
543     uint32_t max_elems = vext_max_elems(desc, log2_esz);
544     uint32_t esz = 1 << log2_esz;
545     uint32_t vma = vext_vma(desc);
546 
547     VSTART_CHECK_EARLY_EXIT(env);
548 
549     /* load bytes from guest memory */
550     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
551         k = 0;
552         while (k < nf) {
553             if (!vm && !vext_elem_mask(v0, i)) {
554                 /* set masked-off elements to 1s */
555                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
556                                   (i + k * max_elems + 1) * esz);
557                 k++;
558                 continue;
559             }
560             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
561             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
562             k++;
563         }
564     }
565     env->vstart = 0;
566 
567     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
568 }
569 
570 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
571 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
572                   void *vs2, CPURISCVState *env, uint32_t desc)            \
573 {                                                                          \
574     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
575                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
576 }
577 
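/*
 * The helper names encode the index EEW followed by the data EEW:
 * e.g. vlxei16_32_v loads 32-bit data elements using 16-bit indices.
 */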
578 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
579 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
580 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
581 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
582 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
583 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
584 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
585 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
586 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
587 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
588 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
589 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
590 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
591 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
592 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
593 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
594 
595 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
596 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
597                   void *vs2, CPURISCVState *env, uint32_t desc)  \
598 {                                                                \
599     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
600                     STORE_FN, ctzl(sizeof(ETYPE)),               \
601                     GETPC());                                    \
602 }
603 
604 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
605 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
606 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
607 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
608 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
609 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
610 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
611 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
612 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
613 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
614 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
615 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
616 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
617 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
618 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
619 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
620 
621 /*
622  * unit-stride fault-only-first load instructions
623  */
624 static inline void
625 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
626           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
627           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
628 {
629     uint32_t i, k, vl = 0;
630     uint32_t nf = vext_nf(desc);
631     uint32_t vm = vext_vm(desc);
632     uint32_t max_elems = vext_max_elems(desc, log2_esz);
633     uint32_t esz = 1 << log2_esz;
634     uint32_t msize = nf * esz;
635     uint32_t vma = vext_vma(desc);
636     target_ulong addr, offset, remain, page_split, elems;
637     int mmu_index = riscv_env_mmu_index(env, false);
638 
639     VSTART_CHECK_EARLY_EXIT(env);
640 
641     /* probe every access */
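    /*
     * Element 0 is allowed to fault normally.  Later active elements are
     * probed with nonfault probes; if one of them would fault, vl is
     * truncated to that element's index and no exception is raised.
     */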
642     for (i = env->vstart; i < env->vl; i++) {
643         if (!vm && !vext_elem_mask(v0, i)) {
644             continue;
645         }
646         addr = adjust_addr(env, base + i * (nf << log2_esz));
647         if (i == 0) {
648             /* Allow fault on first element. */
649             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
650         } else {
651             remain = nf << log2_esz;
652             while (remain > 0) {
653                 void *host;
654                 int flags;
655 
656                 offset = -(addr | TARGET_PAGE_MASK);
657 
658                 /* Probe nonfault on subsequent elements. */
659                 flags = probe_access_flags(env, addr, offset, MMU_DATA_LOAD,
660                                            mmu_index, true, &host, 0);
661 
662                 /*
663                  * Stop if invalid (unmapped) or mmio (transaction may fail).
664                  * Do not stop if watchpoint, as the spec says that
665                  * first-fault should continue to access the same
666                  * elements regardless of any watchpoint.
667                  */
668                 if (flags & ~TLB_WATCHPOINT) {
669                     vl = i;
670                     goto ProbeSuccess;
671                 }
672                 if (remain <= offset) {
673                     break;
674                 }
675                 remain -= offset;
676                 addr = adjust_addr(env, addr + offset);
677             }
678         }
679     }
680 ProbeSuccess:
681     /* load bytes from guest memory */
682     if (vl != 0) {
683         env->vl = vl;
684     }
685 
686     if (env->vstart < env->vl) {
687         if (vm) {
688             /* Calculate the page range of first page */
689             addr = base + ((env->vstart * nf) << log2_esz);
690             page_split = -(addr | TARGET_PAGE_MASK);
691             /* Get number of elements */
692             elems = page_split / msize;
693             if (unlikely(env->vstart + elems >= env->vl)) {
694                 elems = env->vl - env->vstart;
695             }
696 
697             /* Load/store elements in the first page */
698             if (likely(elems)) {
699                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
700                                   log2_esz, true, mmu_index, ldst_tlb,
701                                   ldst_host, ra);
702             }
703 
704             /* Load/store elements in the second page */
705             if (unlikely(env->vstart < env->vl)) {
706                 /* Cross page element */
707                 if (unlikely(page_split % msize)) {
708                     for (k = 0; k < nf; k++) {
709                         addr = base + ((env->vstart * nf + k) << log2_esz);
710                         ldst_tlb(env, adjust_addr(env, addr),
711                                  env->vstart + k * max_elems, vd, ra);
712                     }
713                     env->vstart++;
714                 }
715 
716                 addr = base + ((env->vstart * nf) << log2_esz);
717                 /* Get number of elements of second page */
718                 elems = env->vl - env->vstart;
719 
720                 /* Load/store elements in the second page */
721                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
722                                   log2_esz, true, mmu_index, ldst_tlb,
723                                   ldst_host, ra);
724             }
725         } else {
726             for (i = env->vstart; i < env->vl; i++) {
727                 k = 0;
728                 while (k < nf) {
729                     if (!vext_elem_mask(v0, i)) {
730                         /* set masked-off elements to 1s */
731                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
732                                           (i + k * max_elems + 1) * esz);
733                         k++;
734                         continue;
735                     }
736                     addr = base + ((i * nf + k) << log2_esz);
737                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
738                              vd, ra);
739                     k++;
740                 }
741             }
742         }
743     }
744     env->vstart = 0;
745 
746     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
747 }
748 
749 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
750 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
751                   CPURISCVState *env, uint32_t desc)            \
752 {                                                               \
753     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
754               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
755 }
756 
757 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
758 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
759 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
760 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
761 
762 #define DO_SWAP(N, M) (M)
763 #define DO_AND(N, M)  (N & M)
764 #define DO_XOR(N, M)  (N ^ M)
765 #define DO_OR(N, M)   (N | M)
766 #define DO_ADD(N, M)  (N + M)
767 
768 /* Signed min/max */
769 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
770 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
771 
772 /*
773  * load and store whole register instructions
774  */
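/*
 * Whole register accesses ignore vtype and vl: they always transfer
 * NF * VLEN bits (evl = nf * (vlenb >> log2_esz) elements), resuming from
 * env->vstart if the access was interrupted.
 */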
775 static inline QEMU_ALWAYS_INLINE void
776 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
777                 vext_ldst_elem_fn_tlb *ldst_tlb,
778                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
779                 uintptr_t ra, bool is_load)
780 {
781     target_ulong page_split, elems, addr;
782     uint32_t nf = vext_nf(desc);
783     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
784     uint32_t max_elems = vlenb >> log2_esz;
785     uint32_t evl = nf * max_elems;
786     uint32_t esz = 1 << log2_esz;
787     int mmu_index = riscv_env_mmu_index(env, false);
788 
789     /* Calculate the page range of first page */
790     addr = base + (env->vstart << log2_esz);
791     page_split = -(addr | TARGET_PAGE_MASK);
792     /* Get number of elements */
793     elems = page_split / esz;
794     if (unlikely(env->vstart + elems >= evl)) {
795         elems = evl - env->vstart;
796     }
797 
798     /* Load/store elements in the first page */
799     if (likely(elems)) {
800         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
801                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
802     }
803 
804     /* Load/store elements in the second page */
805     if (unlikely(env->vstart < evl)) {
806         /* Cross page element */
807         if (unlikely(page_split % esz)) {
808             addr = base + (env->vstart << log2_esz);
809             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
810             env->vstart++;
811         }
812 
813         addr = base + (env->vstart << log2_esz);
814         /* Get number of elements of second page */
815         elems = evl - env->vstart;
816 
817         /* Load/store elements in the second page */
818         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
819                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
820     }
821 
822     env->vstart = 0;
823 }
824 
825 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
826 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
827                   uint32_t desc)                                    \
828 {                                                                   \
829     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
830                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
831 }
832 
833 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
834 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
835 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
836 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
837 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
838 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
839 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
840 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
841 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
842 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
843 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
844 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
845 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
846 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
847 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
848 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
849 
850 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
851 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
852                   uint32_t desc)                                        \
853 {                                                                       \
854     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
855                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
856 }
857 
858 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
859 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
860 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
861 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
862 
863 /*
864  * Vector Integer Arithmetic Instructions
865  */
866 
867 /* (TD, T1, T2, TX1, TX2) */
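/*
 * TD is the destination element type, T1/T2 the source element types as
 * stored in the vector registers, and TX1/TX2 the types the sources are
 * converted to before the operation is applied.
 */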
868 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
869 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
870 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
871 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
872 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
873 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
874 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
875 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
876 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
877 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
878 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
879 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
880 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
881 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
882 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
883 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
884 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
885 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
886 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
887 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
888 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
889 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
890 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
891 
892 #define DO_SUB(N, M) (N - M)
893 #define DO_RSUB(N, M) (M - N)
894 
895 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
896 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
897 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
898 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
899 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
900 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
901 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
902 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
903 
904 GEN_VEXT_VV(vadd_vv_b, 1)
905 GEN_VEXT_VV(vadd_vv_h, 2)
906 GEN_VEXT_VV(vadd_vv_w, 4)
907 GEN_VEXT_VV(vadd_vv_d, 8)
908 GEN_VEXT_VV(vsub_vv_b, 1)
909 GEN_VEXT_VV(vsub_vv_h, 2)
910 GEN_VEXT_VV(vsub_vv_w, 4)
911 GEN_VEXT_VV(vsub_vv_d, 8)
912 
913 
914 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
915 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
916 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
917 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
918 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
919 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
920 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
921 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
922 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
923 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
924 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
925 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
926 
927 GEN_VEXT_VX(vadd_vx_b, 1)
928 GEN_VEXT_VX(vadd_vx_h, 2)
929 GEN_VEXT_VX(vadd_vx_w, 4)
930 GEN_VEXT_VX(vadd_vx_d, 8)
931 GEN_VEXT_VX(vsub_vx_b, 1)
932 GEN_VEXT_VX(vsub_vx_h, 2)
933 GEN_VEXT_VX(vsub_vx_w, 4)
934 GEN_VEXT_VX(vsub_vx_d, 8)
935 GEN_VEXT_VX(vrsub_vx_b, 1)
936 GEN_VEXT_VX(vrsub_vx_h, 2)
937 GEN_VEXT_VX(vrsub_vx_w, 4)
938 GEN_VEXT_VX(vrsub_vx_d, 8)
939 
940 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
941 {
942     intptr_t oprsz = simd_oprsz(desc);
943     intptr_t i;
944 
945     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
946         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
947     }
948 }
949 
950 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
951 {
952     intptr_t oprsz = simd_oprsz(desc);
953     intptr_t i;
954 
955     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
956         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
957     }
958 }
959 
960 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
961 {
962     intptr_t oprsz = simd_oprsz(desc);
963     intptr_t i;
964 
965     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
966         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
967     }
968 }
969 
970 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
971 {
972     intptr_t oprsz = simd_oprsz(desc);
973     intptr_t i;
974 
975     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
976         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
977     }
978 }
979 
980 /* Vector Widening Integer Add/Subtract */
981 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
982 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
983 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
984 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
985 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
986 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
987 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
988 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
989 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
990 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
991 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
992 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
993 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
994 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
996 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
997 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
999 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1000 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1002 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1003 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1005 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1006 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1008 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1009 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1011 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1012 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1014 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1015 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1017 GEN_VEXT_VV(vwaddu_vv_b, 2)
1018 GEN_VEXT_VV(vwaddu_vv_h, 4)
1019 GEN_VEXT_VV(vwaddu_vv_w, 8)
1020 GEN_VEXT_VV(vwsubu_vv_b, 2)
1021 GEN_VEXT_VV(vwsubu_vv_h, 4)
1022 GEN_VEXT_VV(vwsubu_vv_w, 8)
1023 GEN_VEXT_VV(vwadd_vv_b, 2)
1024 GEN_VEXT_VV(vwadd_vv_h, 4)
1025 GEN_VEXT_VV(vwadd_vv_w, 8)
1026 GEN_VEXT_VV(vwsub_vv_b, 2)
1027 GEN_VEXT_VV(vwsub_vv_h, 4)
1028 GEN_VEXT_VV(vwsub_vv_w, 8)
1029 GEN_VEXT_VV(vwaddu_wv_b, 2)
1030 GEN_VEXT_VV(vwaddu_wv_h, 4)
1031 GEN_VEXT_VV(vwaddu_wv_w, 8)
1032 GEN_VEXT_VV(vwsubu_wv_b, 2)
1033 GEN_VEXT_VV(vwsubu_wv_h, 4)
1034 GEN_VEXT_VV(vwsubu_wv_w, 8)
1035 GEN_VEXT_VV(vwadd_wv_b, 2)
1036 GEN_VEXT_VV(vwadd_wv_h, 4)
1037 GEN_VEXT_VV(vwadd_wv_w, 8)
1038 GEN_VEXT_VV(vwsub_wv_b, 2)
1039 GEN_VEXT_VV(vwsub_wv_h, 4)
1040 GEN_VEXT_VV(vwsub_wv_w, 8)
1041 
1042 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1043 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1045 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1046 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1048 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1049 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1051 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1052 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1054 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1055 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1057 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1058 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1060 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1061 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1063 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1064 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1066 GEN_VEXT_VX(vwaddu_vx_b, 2)
1067 GEN_VEXT_VX(vwaddu_vx_h, 4)
1068 GEN_VEXT_VX(vwaddu_vx_w, 8)
1069 GEN_VEXT_VX(vwsubu_vx_b, 2)
1070 GEN_VEXT_VX(vwsubu_vx_h, 4)
1071 GEN_VEXT_VX(vwsubu_vx_w, 8)
1072 GEN_VEXT_VX(vwadd_vx_b, 2)
1073 GEN_VEXT_VX(vwadd_vx_h, 4)
1074 GEN_VEXT_VX(vwadd_vx_w, 8)
1075 GEN_VEXT_VX(vwsub_vx_b, 2)
1076 GEN_VEXT_VX(vwsub_vx_h, 4)
1077 GEN_VEXT_VX(vwsub_vx_w, 8)
1078 GEN_VEXT_VX(vwaddu_wx_b, 2)
1079 GEN_VEXT_VX(vwaddu_wx_h, 4)
1080 GEN_VEXT_VX(vwaddu_wx_w, 8)
1081 GEN_VEXT_VX(vwsubu_wx_b, 2)
1082 GEN_VEXT_VX(vwsubu_wx_h, 4)
1083 GEN_VEXT_VX(vwsubu_wx_w, 8)
1084 GEN_VEXT_VX(vwadd_wx_b, 2)
1085 GEN_VEXT_VX(vwadd_wx_h, 4)
1086 GEN_VEXT_VX(vwadd_wx_w, 8)
1087 GEN_VEXT_VX(vwsub_wx_b, 2)
1088 GEN_VEXT_VX(vwsub_wx_h, 4)
1089 GEN_VEXT_VX(vwsub_wx_w, 8)
1090 
1091 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1092 #define DO_VADC(N, M, C) (N + M + C)
1093 #define DO_VSBC(N, M, C) (N - M - C)
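/*
 * For vadc/vsbc the mask register v0 does not mask execution; instead
 * v0.mask[i] supplies the carry (or borrow) input for element i.
 */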
1094 
1095 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1096 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1097                   CPURISCVState *env, uint32_t desc)          \
1098 {                                                             \
1099     uint32_t vl = env->vl;                                    \
1100     uint32_t esz = sizeof(ETYPE);                             \
1101     uint32_t total_elems =                                    \
1102         vext_get_total_elems(env, desc, esz);                 \
1103     uint32_t vta = vext_vta(desc);                            \
1104     uint32_t i;                                               \
1105                                                               \
1106     VSTART_CHECK_EARLY_EXIT(env);                             \
1107                                                               \
1108     for (i = env->vstart; i < vl; i++) {                      \
1109         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1110         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1111         ETYPE carry = vext_elem_mask(v0, i);                  \
1112                                                               \
1113         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1114     }                                                         \
1115     env->vstart = 0;                                          \
1116     /* set tail elements to 1s */                             \
1117     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1118 }
1119 
1120 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1121 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1122 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1123 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1124 
1125 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1126 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1127 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1128 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1129 
1130 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1131 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1132                   CPURISCVState *env, uint32_t desc)                     \
1133 {                                                                        \
1134     uint32_t vl = env->vl;                                               \
1135     uint32_t esz = sizeof(ETYPE);                                        \
1136     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1137     uint32_t vta = vext_vta(desc);                                       \
1138     uint32_t i;                                                          \
1139                                                                          \
1140     VSTART_CHECK_EARLY_EXIT(env);                                        \
1141                                                                          \
1142     for (i = env->vstart; i < vl; i++) {                                 \
1143         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1144         ETYPE carry = vext_elem_mask(v0, i);                             \
1145                                                                          \
1146         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1147     }                                                                    \
1148     env->vstart = 0;                                                     \
1149     /* set tail elements to 1s */                                        \
1150     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1151 }
1152 
1153 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1154 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1155 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1156 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1157 
1158 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1159 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1160 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1161 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1162 
1163 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1164                           (__typeof(N))(N + M) < N)
1165 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
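/*
 * DO_MADC computes the carry-out of N + M + C by testing for unsigned
 * wrap-around; DO_MSBC computes the borrow-out of N - M - C.
 */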
1166 
1167 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1168 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1169                   CPURISCVState *env, uint32_t desc)          \
1170 {                                                             \
1171     uint32_t vl = env->vl;                                    \
1172     uint32_t vm = vext_vm(desc);                              \
1173     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1174     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1175     uint32_t i;                                               \
1176                                                               \
1177     VSTART_CHECK_EARLY_EXIT(env);                             \
1178                                                               \
1179     for (i = env->vstart; i < vl; i++) {                      \
1180         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1181         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1182         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1183         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1184     }                                                         \
1185     env->vstart = 0;                                          \
1186     /*
1187      * the mask destination register is always tail-agnostic,
1188      * so set the tail elements to 1s
1189      */                                                       \
1190     if (vta_all_1s) {                                         \
1191         for (; i < total_elems; i++) {                        \
1192             vext_set_elem_mask(vd, i, 1);                     \
1193         }                                                     \
1194     }                                                         \
1195 }
1196 
1197 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1198 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1199 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1200 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1201 
1202 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1203 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1204 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1205 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1206 
1207 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1208 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1209                   void *vs2, CPURISCVState *env, uint32_t desc) \
1210 {                                                               \
1211     uint32_t vl = env->vl;                                      \
1212     uint32_t vm = vext_vm(desc);                                \
1213     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1214     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1215     uint32_t i;                                                 \
1216                                                                 \
1217     VSTART_CHECK_EARLY_EXIT(env);                               \
1218                                                                 \
1219     for (i = env->vstart; i < vl; i++) {                        \
1220         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1221         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1222         vext_set_elem_mask(vd, i,                               \
1223                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1224     }                                                           \
1225     env->vstart = 0;                                            \
1226     /*
1227      * the mask destination register is always tail-agnostic,
1228      * so set the tail elements to 1s
1229      */                                                         \
1230     if (vta_all_1s) {                                           \
1231         for (; i < total_elems; i++) {                          \
1232             vext_set_elem_mask(vd, i, 1);                       \
1233         }                                                       \
1234     }                                                           \
1235 }
1236 
1237 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1238 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1239 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1240 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1241 
1242 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1243 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1244 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1245 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1246 
1247 /* Vector Bitwise Logical Instructions */
1248 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1249 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1250 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1251 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1252 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1253 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1254 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1255 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1256 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1257 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1258 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1259 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1260 GEN_VEXT_VV(vand_vv_b, 1)
1261 GEN_VEXT_VV(vand_vv_h, 2)
1262 GEN_VEXT_VV(vand_vv_w, 4)
1263 GEN_VEXT_VV(vand_vv_d, 8)
1264 GEN_VEXT_VV(vor_vv_b, 1)
1265 GEN_VEXT_VV(vor_vv_h, 2)
1266 GEN_VEXT_VV(vor_vv_w, 4)
1267 GEN_VEXT_VV(vor_vv_d, 8)
1268 GEN_VEXT_VV(vxor_vv_b, 1)
1269 GEN_VEXT_VV(vxor_vv_h, 2)
1270 GEN_VEXT_VV(vxor_vv_w, 4)
1271 GEN_VEXT_VV(vxor_vv_d, 8)
1272 
1273 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1274 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1275 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1276 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1277 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1278 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1279 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1280 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1281 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1282 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1283 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1284 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1285 GEN_VEXT_VX(vand_vx_b, 1)
1286 GEN_VEXT_VX(vand_vx_h, 2)
1287 GEN_VEXT_VX(vand_vx_w, 4)
1288 GEN_VEXT_VX(vand_vx_d, 8)
1289 GEN_VEXT_VX(vor_vx_b, 1)
1290 GEN_VEXT_VX(vor_vx_h, 2)
1291 GEN_VEXT_VX(vor_vx_w, 4)
1292 GEN_VEXT_VX(vor_vx_d, 8)
1293 GEN_VEXT_VX(vxor_vx_b, 1)
1294 GEN_VEXT_VX(vxor_vx_h, 2)
1295 GEN_VEXT_VX(vxor_vx_w, 4)
1296 GEN_VEXT_VX(vxor_vx_d, 8)
1297 
1298 /* Vector Single-Width Bit Shift Instructions */
1299 #define DO_SLL(N, M)  (N << (M))
1300 #define DO_SRL(N, M)  (N >> (M))
1301 
1302 /* generate the helpers for shift instructions with two vector operands */
1303 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1304 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1305                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1306 {                                                                         \
1307     uint32_t vm = vext_vm(desc);                                          \
1308     uint32_t vl = env->vl;                                                \
1309     uint32_t esz = sizeof(TS1);                                           \
1310     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1311     uint32_t vta = vext_vta(desc);                                        \
1312     uint32_t vma = vext_vma(desc);                                        \
1313     uint32_t i;                                                           \
1314                                                                           \
1315     VSTART_CHECK_EARLY_EXIT(env);                                         \
1316                                                                           \
1317     for (i = env->vstart; i < vl; i++) {                                  \
1318         if (!vm && !vext_elem_mask(v0, i)) {                              \
1319             /* set masked-off elements to 1s */                           \
1320             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1321             continue;                                                     \
1322         }                                                                 \
1323         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1324         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1325         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1326     }                                                                     \
1327     env->vstart = 0;                                                      \
1328     /* set tail elements to 1s */                                         \
1329     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1330 }
1331 
1332 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1333 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1334 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1335 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1336 
1337 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1338 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1339 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1340 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1341 
1342 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
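
/*
 * Illustrative, not used below: vsra reuses DO_SRL, and the shift becomes
 * arithmetic because the source element type (TS2 for the .vv forms above)
 * is signed, so the sign bit is replicated.  Right-shifting a negative value
 * is implementation-defined in C, but all compilers QEMU supports shift
 * arithmetically.  Byte case as a plain C sketch:
 */
static inline int8_t vsra_ref_b(int8_t s2, uint8_t shamt)
{
    return s2 >> (shamt & 0x7);
}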
1346 
1347 /*
1348  * generate the helpers for shift instructions with one vector and one scalar
1349  */
1350 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1351 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1352                   void *vs2, CPURISCVState *env,            \
1353                   uint32_t desc)                            \
1354 {                                                           \
1355     uint32_t vm = vext_vm(desc);                            \
1356     uint32_t vl = env->vl;                                  \
1357     uint32_t esz = sizeof(TD);                              \
1358     uint32_t total_elems =                                  \
1359         vext_get_total_elems(env, desc, esz);               \
1360     uint32_t vta = vext_vta(desc);                          \
1361     uint32_t vma = vext_vma(desc);                          \
1362     uint32_t i;                                             \
1363                                                             \
1364     VSTART_CHECK_EARLY_EXIT(env);                           \
1365                                                             \
1366     for (i = env->vstart; i < vl; i++) {                    \
1367         if (!vm && !vext_elem_mask(v0, i)) {                \
1368             /* set masked-off elements to 1s */             \
1369             vext_set_elems_1s(vd, vma, i * esz,             \
1370                               (i + 1) * esz);               \
1371             continue;                                       \
1372         }                                                   \
1373         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1374         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1375     }                                                       \
1376     env->vstart = 0;                                        \
1377     /* set tail elements to 1s */                           \
1378     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1379 }
1380 
1381 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1382 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1383 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1384 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1385 
1386 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1387 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1388 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1389 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1390 
1391 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1392 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1393 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1394 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1395 
1396 /* Vector Narrowing Integer Right Shift Instructions */
1397 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1398 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1399 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1400 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1401 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1402 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1403 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1404 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1405 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1406 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1407 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1408 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
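
/*
 * Illustrative, not used below: the narrowing shifts above read a 2*SEW-wide
 * source element, so the shift amount is masked to log2(2*SEW) bits
 * (0xf/0x1f/0x3f) instead of the log2(SEW) masks used by the single-width
 * shifts.  Byte-destination case as a plain C sketch:
 */
static inline uint8_t vnsrl_ref_b(uint16_t s2, uint8_t shamt)
{
    /* shift the 16-bit source, then narrow the result to 8 bits */
    return s2 >> (shamt & 0xf);
}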
1409 
1410 /* Vector Integer Comparison Instructions */
1411 #define DO_MSEQ(N, M) (N == M)
1412 #define DO_MSNE(N, M) (N != M)
1413 #define DO_MSLT(N, M) (N < M)
1414 #define DO_MSLE(N, M) (N <= M)
1415 #define DO_MSGT(N, M) (N > M)
1416 
1417 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1418 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1419                   CPURISCVState *env, uint32_t desc)          \
1420 {                                                             \
1421     uint32_t vm = vext_vm(desc);                              \
1422     uint32_t vl = env->vl;                                    \
1423     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1424     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1425     uint32_t vma = vext_vma(desc);                            \
1426     uint32_t i;                                               \
1427                                                               \
1428     VSTART_CHECK_EARLY_EXIT(env);                             \
1429                                                               \
1430     for (i = env->vstart; i < vl; i++) {                      \
1431         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1432         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1433         if (!vm && !vext_elem_mask(v0, i)) {                  \
1434             /* set masked-off elements to 1s */               \
1435             if (vma) {                                        \
1436                 vext_set_elem_mask(vd, i, 1);                 \
1437             }                                                 \
1438             continue;                                         \
1439         }                                                     \
1440         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1441     }                                                         \
1442     env->vstart = 0;                                          \
1443     /*
1444      * mask destination registers are always tail-agnostic
1445      * set tail elements to 1s
1446      */                                                       \
1447     if (vta_all_1s) {                                         \
1448         for (; i < total_elems; i++) {                        \
1449             vext_set_elem_mask(vd, i, 1);                     \
1450         }                                                     \
1451     }                                                         \
1452 }
1453 
1454 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1455 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1456 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1457 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1458 
1459 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1460 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1461 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1462 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1463 
1464 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1465 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1466 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1467 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1468 
1469 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1470 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1471 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1472 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1473 
1474 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1475 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1476 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1477 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1478 
1479 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1480 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1481 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1482 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1483 
1484 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1485 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1486                   CPURISCVState *env, uint32_t desc)                \
1487 {                                                                   \
1488     uint32_t vm = vext_vm(desc);                                    \
1489     uint32_t vl = env->vl;                                          \
1490     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1491     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1492     uint32_t vma = vext_vma(desc);                                  \
1493     uint32_t i;                                                     \
1494                                                                     \
1495     VSTART_CHECK_EARLY_EXIT(env);                                   \
1496                                                                     \
1497     for (i = env->vstart; i < vl; i++) {                            \
1498         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1499         if (!vm && !vext_elem_mask(v0, i)) {                        \
1500             /* set masked-off elements to 1s */                     \
1501             if (vma) {                                              \
1502                 vext_set_elem_mask(vd, i, 1);                       \
1503             }                                                       \
1504             continue;                                               \
1505         }                                                           \
1506         vext_set_elem_mask(vd, i,                                   \
1507                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1508     }                                                               \
1509     env->vstart = 0;                                                \
1510     /*
1511      * mask destination registers are always tail-agnostic
1512      * set tail elements to 1s
1513      */                                                             \
1514     if (vta_all_1s) {                                               \
1515         for (; i < total_elems; i++) {                              \
1516             vext_set_elem_mask(vd, i, 1);                           \
1517         }                                                           \
1518     }                                                               \
1519 }
1520 
1521 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1522 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1523 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1524 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1525 
1526 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1527 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1528 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1529 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1530 
1531 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1532 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1533 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1534 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1535 
1536 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1537 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1538 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1539 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1540 
1541 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1542 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1543 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1544 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1545 
1546 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1547 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1548 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1549 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1550 
1551 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1552 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1553 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1554 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1555 
1556 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1557 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1558 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1559 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1560 
1561 /* Vector Integer Min/Max Instructions */
1562 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1563 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1564 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1565 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1566 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1567 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1568 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1569 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1570 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1571 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1572 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1573 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1574 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1575 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1576 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1577 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1578 GEN_VEXT_VV(vminu_vv_b, 1)
1579 GEN_VEXT_VV(vminu_vv_h, 2)
1580 GEN_VEXT_VV(vminu_vv_w, 4)
1581 GEN_VEXT_VV(vminu_vv_d, 8)
1582 GEN_VEXT_VV(vmin_vv_b, 1)
1583 GEN_VEXT_VV(vmin_vv_h, 2)
1584 GEN_VEXT_VV(vmin_vv_w, 4)
1585 GEN_VEXT_VV(vmin_vv_d, 8)
1586 GEN_VEXT_VV(vmaxu_vv_b, 1)
1587 GEN_VEXT_VV(vmaxu_vv_h, 2)
1588 GEN_VEXT_VV(vmaxu_vv_w, 4)
1589 GEN_VEXT_VV(vmaxu_vv_d, 8)
1590 GEN_VEXT_VV(vmax_vv_b, 1)
1591 GEN_VEXT_VV(vmax_vv_h, 2)
1592 GEN_VEXT_VV(vmax_vv_w, 4)
1593 GEN_VEXT_VV(vmax_vv_d, 8)
1594 
1595 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1596 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1597 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1598 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1599 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1600 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1601 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1602 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1603 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1604 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1605 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1606 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1607 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1608 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1609 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1610 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1611 GEN_VEXT_VX(vminu_vx_b, 1)
1612 GEN_VEXT_VX(vminu_vx_h, 2)
1613 GEN_VEXT_VX(vminu_vx_w, 4)
1614 GEN_VEXT_VX(vminu_vx_d, 8)
1615 GEN_VEXT_VX(vmin_vx_b, 1)
1616 GEN_VEXT_VX(vmin_vx_h, 2)
1617 GEN_VEXT_VX(vmin_vx_w, 4)
1618 GEN_VEXT_VX(vmin_vx_d, 8)
1619 GEN_VEXT_VX(vmaxu_vx_b, 1)
1620 GEN_VEXT_VX(vmaxu_vx_h, 2)
1621 GEN_VEXT_VX(vmaxu_vx_w, 4)
1622 GEN_VEXT_VX(vmaxu_vx_d, 8)
1623 GEN_VEXT_VX(vmax_vx_b, 1)
1624 GEN_VEXT_VX(vmax_vx_h, 2)
1625 GEN_VEXT_VX(vmax_vx_w, 4)
1626 GEN_VEXT_VX(vmax_vx_d, 8)
1627 
1628 /* Vector Single-Width Integer Multiply Instructions */
1629 #define DO_MUL(N, M) (N * M)
1630 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1631 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1632 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1633 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1634 GEN_VEXT_VV(vmul_vv_b, 1)
1635 GEN_VEXT_VV(vmul_vv_h, 2)
1636 GEN_VEXT_VV(vmul_vv_w, 4)
1637 GEN_VEXT_VV(vmul_vv_d, 8)
1638 
1639 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1640 {
1641     return (int16_t)s2 * (int16_t)s1 >> 8;
1642 }
1643 
1644 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1645 {
1646     return (int32_t)s2 * (int32_t)s1 >> 16;
1647 }
1648 
1649 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1650 {
1651     return (int64_t)s2 * (int64_t)s1 >> 32;
1652 }
1653 
1654 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1655 {
1656     uint64_t hi_64, lo_64;
1657 
1658     muls64(&lo_64, &hi_64, s1, s2);
1659     return hi_64;
1660 }
1661 
1662 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1663 {
1664     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1665 }
1666 
1667 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1668 {
1669     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1670 }
1671 
1672 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1673 {
1674     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1675 }
1676 
1677 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1678 {
1679     uint64_t hi_64, lo_64;
1680 
1681     mulu64(&lo_64, &hi_64, s2, s1);
1682     return hi_64;
1683 }
1684 
1685 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1686 {
1687     return (int16_t)s2 * (uint16_t)s1 >> 8;
1688 }
1689 
1690 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1691 {
1692     return (int32_t)s2 * (uint32_t)s1 >> 16;
1693 }
1694 
1695 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1696 {
1697     return (int64_t)s2 * (uint64_t)s1 >> 32;
1698 }
1699 
1700 /*
1701  * Let  A = signed operand (s2),
1702  *      B = unsigned operand (s1),
1703  *      P = mulu64(A, B), the product with A's bit pattern taken as
1704  *          unsigned
1705  *
1706  * If A >= 0, the unsigned and signed products agree, so the signed
1707  * product SP = P.
1708  *
1709  * If A < 0, the unsigned value of A's bit pattern is A + 2 ** 64, so
1710  *      P  = (A + 2 ** 64) * B = A * B + 2 ** 64 * B
1711  *      SP = A * B = P - 2 ** 64 * B
1712  * i.e. the low 64 bits of P are already correct and the high 64 bits
1713  * of SP are hi(P) - B.
1714  *
1715  * Hence, after mulu64():
1716  *      HI_P -= (A < 0 ? B : 0)
1717  */
1718 
1719 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1720 {
1721     uint64_t hi_64, lo_64;
1722 
1723     mulu64(&lo_64, &hi_64, s2, s1);
1724 
1725     hi_64 -= s2 < 0 ? s1 : 0;
1726     return hi_64;
1727 }
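
/*
 * Illustrative sketch, not used by the helpers: the identity above can be
 * checked at a width where a direct wide multiply is available.  The two
 * functions below compute the same value for every input: the first is a
 * plain 64-bit reference, the second applies the HI_P adjustment used by
 * do_mulhsu_d() (the usual two's-complement narrowing conversion is assumed
 * on the final return).
 */
static inline int32_t mulhsu32_ref(int32_t a, uint32_t b)
{
    /* exact signed x unsigned product, high half */
    return ((int64_t)a * b) >> 32;
}

static inline int32_t mulhsu32_via_unsigned(int32_t a, uint32_t b)
{
    /* product of A's bit pattern taken as unsigned... */
    uint32_t hi = ((uint64_t)(uint32_t)a * b) >> 32;

    /* ...then HI_P -= (A < 0 ? B : 0) */
    hi -= a < 0 ? b : 0;
    return hi;
}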
1728 
1729 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1730 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1731 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1732 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1733 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1734 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1735 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1736 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1737 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1738 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1739 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1740 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1741 GEN_VEXT_VV(vmulh_vv_b, 1)
1742 GEN_VEXT_VV(vmulh_vv_h, 2)
1743 GEN_VEXT_VV(vmulh_vv_w, 4)
1744 GEN_VEXT_VV(vmulh_vv_d, 8)
1745 GEN_VEXT_VV(vmulhu_vv_b, 1)
1746 GEN_VEXT_VV(vmulhu_vv_h, 2)
1747 GEN_VEXT_VV(vmulhu_vv_w, 4)
1748 GEN_VEXT_VV(vmulhu_vv_d, 8)
1749 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1750 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1751 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1752 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1753 
1754 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1755 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1756 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1757 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1758 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1759 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1760 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1761 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1762 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1763 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1764 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1765 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1766 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1767 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1768 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1769 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1770 GEN_VEXT_VX(vmul_vx_b, 1)
1771 GEN_VEXT_VX(vmul_vx_h, 2)
1772 GEN_VEXT_VX(vmul_vx_w, 4)
1773 GEN_VEXT_VX(vmul_vx_d, 8)
1774 GEN_VEXT_VX(vmulh_vx_b, 1)
1775 GEN_VEXT_VX(vmulh_vx_h, 2)
1776 GEN_VEXT_VX(vmulh_vx_w, 4)
1777 GEN_VEXT_VX(vmulh_vx_d, 8)
1778 GEN_VEXT_VX(vmulhu_vx_b, 1)
1779 GEN_VEXT_VX(vmulhu_vx_h, 2)
1780 GEN_VEXT_VX(vmulhu_vx_w, 4)
1781 GEN_VEXT_VX(vmulhu_vx_d, 8)
1782 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1783 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1784 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1785 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1786 
1787 /* Vector Integer Divide Instructions */
1788 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1789 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1790 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1791         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1792 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1793         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
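
/*
 * Illustrative sanity checks, not used by the helpers: division by zero does
 * not trap, it yields the fixed results encoded in the macros above.
 */
static inline void vdiv_by_zero_examples(void)
{
    uint8_t u = 42, uz = 0;
    int8_t  s = -7, sz = 0;

    assert(DO_DIVU(u, uz) == UINT8_MAX);  /* unsigned x / 0 -> all ones */
    assert(DO_REMU(u, uz) == u);          /* unsigned x % 0 -> x        */
    assert(DO_DIV(s, sz) == -1);          /* signed   x / 0 -> -1       */
    assert(DO_REM(s, sz) == s);           /* signed   x % 0 -> x        */
}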
1794 
1795 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1796 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1797 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1798 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1799 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1800 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1801 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1802 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1803 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1804 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1805 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1806 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1807 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1808 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1809 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1810 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1811 GEN_VEXT_VV(vdivu_vv_b, 1)
1812 GEN_VEXT_VV(vdivu_vv_h, 2)
1813 GEN_VEXT_VV(vdivu_vv_w, 4)
1814 GEN_VEXT_VV(vdivu_vv_d, 8)
1815 GEN_VEXT_VV(vdiv_vv_b, 1)
1816 GEN_VEXT_VV(vdiv_vv_h, 2)
1817 GEN_VEXT_VV(vdiv_vv_w, 4)
1818 GEN_VEXT_VV(vdiv_vv_d, 8)
1819 GEN_VEXT_VV(vremu_vv_b, 1)
1820 GEN_VEXT_VV(vremu_vv_h, 2)
1821 GEN_VEXT_VV(vremu_vv_w, 4)
1822 GEN_VEXT_VV(vremu_vv_d, 8)
1823 GEN_VEXT_VV(vrem_vv_b, 1)
1824 GEN_VEXT_VV(vrem_vv_h, 2)
1825 GEN_VEXT_VV(vrem_vv_w, 4)
1826 GEN_VEXT_VV(vrem_vv_d, 8)
1827 
1828 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1829 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1830 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1831 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1832 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1833 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1834 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1835 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1836 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1837 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1838 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1839 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1840 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1841 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1842 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1843 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1844 GEN_VEXT_VX(vdivu_vx_b, 1)
1845 GEN_VEXT_VX(vdivu_vx_h, 2)
1846 GEN_VEXT_VX(vdivu_vx_w, 4)
1847 GEN_VEXT_VX(vdivu_vx_d, 8)
1848 GEN_VEXT_VX(vdiv_vx_b, 1)
1849 GEN_VEXT_VX(vdiv_vx_h, 2)
1850 GEN_VEXT_VX(vdiv_vx_w, 4)
1851 GEN_VEXT_VX(vdiv_vx_d, 8)
1852 GEN_VEXT_VX(vremu_vx_b, 1)
1853 GEN_VEXT_VX(vremu_vx_h, 2)
1854 GEN_VEXT_VX(vremu_vx_w, 4)
1855 GEN_VEXT_VX(vremu_vx_d, 8)
1856 GEN_VEXT_VX(vrem_vx_b, 1)
1857 GEN_VEXT_VX(vrem_vx_h, 2)
1858 GEN_VEXT_VX(vrem_vx_w, 4)
1859 GEN_VEXT_VX(vrem_vx_d, 8)
1860 
1861 /* Vector Widening Integer Multiply Instructions */
1862 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1863 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1864 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1865 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1866 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1867 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1868 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1869 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1870 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1871 GEN_VEXT_VV(vwmul_vv_b, 2)
1872 GEN_VEXT_VV(vwmul_vv_h, 4)
1873 GEN_VEXT_VV(vwmul_vv_w, 8)
1874 GEN_VEXT_VV(vwmulu_vv_b, 2)
1875 GEN_VEXT_VV(vwmulu_vv_h, 4)
1876 GEN_VEXT_VV(vwmulu_vv_w, 8)
1877 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1878 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1879 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1880 
1881 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1882 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1883 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1884 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1885 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1886 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1887 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1888 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1889 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1890 GEN_VEXT_VX(vwmul_vx_b, 2)
1891 GEN_VEXT_VX(vwmul_vx_h, 4)
1892 GEN_VEXT_VX(vwmul_vx_w, 8)
1893 GEN_VEXT_VX(vwmulu_vx_b, 2)
1894 GEN_VEXT_VX(vwmulu_vx_h, 4)
1895 GEN_VEXT_VX(vwmulu_vx_w, 8)
1896 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1897 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1898 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1899 
1900 /* Vector Single-Width Integer Multiply-Add Instructions */
1901 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1902 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1903 {                                                                  \
1904     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1905     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1906     TD d = *((TD *)vd + HD(i));                                    \
1907     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1908 }
1909 
1910 #define DO_MACC(N, M, D) (M * N + D)
1911 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1912 #define DO_MADD(N, M, D) (M * D + N)
1913 #define DO_NMSUB(N, M, D) (-(M * D) + N)
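/*
 * With OPIVV3's argument order OP(s2, s1, d), i.e. N = vs2 element,
 * M = vs1 element, D = current vd element, these expand to:
 *   DO_MACC:  vd[i] =  (vs1[i] * vs2[i]) + vd[i]
 *   DO_NMSAC: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
 *   DO_MADD:  vd[i] =  (vs1[i] * vd[i])  + vs2[i]
 *   DO_NMSUB: vd[i] = -(vs1[i] * vd[i])  + vs2[i]
 */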
1914 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1915 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1916 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1917 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1918 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1919 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1920 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1921 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1922 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1923 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1924 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1925 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1926 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1927 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1928 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1929 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1930 GEN_VEXT_VV(vmacc_vv_b, 1)
1931 GEN_VEXT_VV(vmacc_vv_h, 2)
1932 GEN_VEXT_VV(vmacc_vv_w, 4)
1933 GEN_VEXT_VV(vmacc_vv_d, 8)
1934 GEN_VEXT_VV(vnmsac_vv_b, 1)
1935 GEN_VEXT_VV(vnmsac_vv_h, 2)
1936 GEN_VEXT_VV(vnmsac_vv_w, 4)
1937 GEN_VEXT_VV(vnmsac_vv_d, 8)
1938 GEN_VEXT_VV(vmadd_vv_b, 1)
1939 GEN_VEXT_VV(vmadd_vv_h, 2)
1940 GEN_VEXT_VV(vmadd_vv_w, 4)
1941 GEN_VEXT_VV(vmadd_vv_d, 8)
1942 GEN_VEXT_VV(vnmsub_vv_b, 1)
1943 GEN_VEXT_VV(vnmsub_vv_h, 2)
1944 GEN_VEXT_VV(vnmsub_vv_w, 4)
1945 GEN_VEXT_VV(vnmsub_vv_d, 8)
1946 
1947 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1948 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1949 {                                                                   \
1950     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1951     TD d = *((TD *)vd + HD(i));                                     \
1952     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1953 }
1954 
1955 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1956 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1957 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1958 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1959 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1960 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1961 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1962 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1963 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1964 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1965 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1966 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1967 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1968 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1969 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1970 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1971 GEN_VEXT_VX(vmacc_vx_b, 1)
1972 GEN_VEXT_VX(vmacc_vx_h, 2)
1973 GEN_VEXT_VX(vmacc_vx_w, 4)
1974 GEN_VEXT_VX(vmacc_vx_d, 8)
1975 GEN_VEXT_VX(vnmsac_vx_b, 1)
1976 GEN_VEXT_VX(vnmsac_vx_h, 2)
1977 GEN_VEXT_VX(vnmsac_vx_w, 4)
1978 GEN_VEXT_VX(vnmsac_vx_d, 8)
1979 GEN_VEXT_VX(vmadd_vx_b, 1)
1980 GEN_VEXT_VX(vmadd_vx_h, 2)
1981 GEN_VEXT_VX(vmadd_vx_w, 4)
1982 GEN_VEXT_VX(vmadd_vx_d, 8)
1983 GEN_VEXT_VX(vnmsub_vx_b, 1)
1984 GEN_VEXT_VX(vnmsub_vx_h, 2)
1985 GEN_VEXT_VX(vnmsub_vx_w, 4)
1986 GEN_VEXT_VX(vnmsub_vx_d, 8)
1987 
1988 /* Vector Widening Integer Multiply-Add Instructions */
1989 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1990 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1991 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1992 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1993 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1994 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1995 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1996 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1997 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1998 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1999 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2000 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2001 GEN_VEXT_VV(vwmacc_vv_b, 2)
2002 GEN_VEXT_VV(vwmacc_vv_h, 4)
2003 GEN_VEXT_VV(vwmacc_vv_w, 8)
2004 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2005 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2006 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2007 
2008 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2009 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2010 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2011 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2012 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2013 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2014 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2015 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2016 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2017 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2018 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2019 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2020 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2021 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2022 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2023 GEN_VEXT_VX(vwmacc_vx_b, 2)
2024 GEN_VEXT_VX(vwmacc_vx_h, 4)
2025 GEN_VEXT_VX(vwmacc_vx_w, 8)
2026 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2027 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2028 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2029 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2030 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2031 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2032 
2033 /* Vector Integer Merge and Move Instructions */
2034 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2035 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2036                   uint32_t desc)                                     \
2037 {                                                                    \
2038     uint32_t vl = env->vl;                                           \
2039     uint32_t esz = sizeof(ETYPE);                                    \
2040     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2041     uint32_t vta = vext_vta(desc);                                   \
2042     uint32_t i;                                                      \
2043                                                                      \
2044     VSTART_CHECK_EARLY_EXIT(env);                                    \
2045                                                                      \
2046     for (i = env->vstart; i < vl; i++) {                             \
2047         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2048         *((ETYPE *)vd + H(i)) = s1;                                  \
2049     }                                                                \
2050     env->vstart = 0;                                                 \
2051     /* set tail elements to 1s */                                    \
2052     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2053 }
2054 
2055 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2056 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2057 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2058 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2059 
2060 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2061 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2062                   uint32_t desc)                                     \
2063 {                                                                    \
2064     uint32_t vl = env->vl;                                           \
2065     uint32_t esz = sizeof(ETYPE);                                    \
2066     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2067     uint32_t vta = vext_vta(desc);                                   \
2068     uint32_t i;                                                      \
2069                                                                      \
2070     VSTART_CHECK_EARLY_EXIT(env);                                    \
2071                                                                      \
2072     for (i = env->vstart; i < vl; i++) {                             \
2073         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2074     }                                                                \
2075     env->vstart = 0;                                                 \
2076     /* set tail elements to 1s */                                    \
2077     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2078 }
2079 
2080 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2081 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2082 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2083 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2084 
2085 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2086 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2087                   CPURISCVState *env, uint32_t desc)                 \
2088 {                                                                    \
2089     uint32_t vl = env->vl;                                           \
2090     uint32_t esz = sizeof(ETYPE);                                    \
2091     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2092     uint32_t vta = vext_vta(desc);                                   \
2093     uint32_t i;                                                      \
2094                                                                      \
2095     VSTART_CHECK_EARLY_EXIT(env);                                    \
2096                                                                      \
2097     for (i = env->vstart; i < vl; i++) {                             \
2098         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2099         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2100     }                                                                \
2101     env->vstart = 0;                                                 \
2102     /* set tail elements to 1s */                                    \
2103     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2104 }
2105 
2106 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2107 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2108 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2109 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2110 
2111 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2112 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2113                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2114 {                                                                    \
2115     uint32_t vl = env->vl;                                           \
2116     uint32_t esz = sizeof(ETYPE);                                    \
2117     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2118     uint32_t vta = vext_vta(desc);                                   \
2119     uint32_t i;                                                      \
2120                                                                      \
2121     VSTART_CHECK_EARLY_EXIT(env);                                    \
2122                                                                      \
2123     for (i = env->vstart; i < vl; i++) {                             \
2124         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2125         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2126                    (ETYPE)(target_long)s1);                          \
2127         *((ETYPE *)vd + H(i)) = d;                                   \
2128     }                                                                \
2129     env->vstart = 0;                                                 \
2130     /* set tail elements to 1s */                                    \
2131     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2132 }
2133 
2134 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2135 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2136 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2137 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2138 
2139 /*
2140  * Vector Fixed-Point Arithmetic Instructions
2141  */
2142 
2143 /* Vector Single-Width Saturating Add and Subtract */
2144 
2145 /*
2146  * Fixed-point instructions come with a rounding mode and can saturate,
2147  * so define the common fixed-point macros here.
2148  */
2149 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2150                           CPURISCVState *env, int vxrm);
2151 
2152 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2153 static inline void                                                  \
2154 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2155           CPURISCVState *env, int vxrm)                             \
2156 {                                                                   \
2157     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2158     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2159     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2160 }
2161 
2162 static inline void
2163 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2164              CPURISCVState *env,
2165              uint32_t vl, uint32_t vm, int vxrm,
2166              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2167 {
2168     VSTART_CHECK_EARLY_EXIT(env);
2169 
2170     for (uint32_t i = env->vstart; i < vl; i++) {
2171         if (!vm && !vext_elem_mask(v0, i)) {
2172             /* set masked-off elements to 1s */
2173             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2174             continue;
2175         }
2176         fn(vd, vs1, vs2, i, env, vxrm);
2177     }
2178     env->vstart = 0;
2179 }
2180 
2181 static inline void
2182 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2183              CPURISCVState *env,
2184              uint32_t desc,
2185              opivv2_rm_fn *fn, uint32_t esz)
2186 {
2187     uint32_t vm = vext_vm(desc);
2188     uint32_t vl = env->vl;
2189     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2190     uint32_t vta = vext_vta(desc);
2191     uint32_t vma = vext_vma(desc);
2192 
2193     switch (env->vxrm) {
2194     case 0: /* rnu */
2195         vext_vv_rm_1(vd, v0, vs1, vs2,
2196                      env, vl, vm, 0, fn, vma, esz);
2197         break;
2198     case 1: /* rne */
2199         vext_vv_rm_1(vd, v0, vs1, vs2,
2200                      env, vl, vm, 1, fn, vma, esz);
2201         break;
2202     case 2: /* rdn */
2203         vext_vv_rm_1(vd, v0, vs1, vs2,
2204                      env, vl, vm, 2, fn, vma, esz);
2205         break;
2206     default: /* rod */
2207         vext_vv_rm_1(vd, v0, vs1, vs2,
2208                      env, vl, vm, 3, fn, vma, esz);
2209         break;
2210     }
2211     /* set tail elements to 1s */
2212     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2213 }
2214 
2215 /* generate helpers for fixed point instructions with OPIVV format */
2216 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2217 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2218                   CPURISCVState *env, uint32_t desc)            \
2219 {                                                               \
2220     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2221                  do_##NAME, ESZ);                               \
2222 }
2223 
2224 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2225                              uint8_t b)
2226 {
2227     uint8_t res = a + b;
2228     if (res < a) {
2229         res = UINT8_MAX;
2230         env->vxsat = 0x1;
2231     }
2232     return res;
2233 }
2234 
2235 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2236                                uint16_t b)
2237 {
2238     uint16_t res = a + b;
2239     if (res < a) {
2240         res = UINT16_MAX;
2241         env->vxsat = 0x1;
2242     }
2243     return res;
2244 }
2245 
2246 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2247                                uint32_t b)
2248 {
2249     uint32_t res = a + b;
2250     if (res < a) {
2251         res = UINT32_MAX;
2252         env->vxsat = 0x1;
2253     }
2254     return res;
2255 }
2256 
2257 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2258                                uint64_t b)
2259 {
2260     uint64_t res = a + b;
2261     if (res < a) {
2262         res = UINT64_MAX;
2263         env->vxsat = 0x1;
2264     }
2265     return res;
2266 }
2267 
2268 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2269 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2270 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2271 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2272 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2273 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2274 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2275 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
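
/*
 * For orientation (illustrative expansion, lightly simplified): the pair
 * RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8) and
 * GEN_VEXT_VV_RM(vsaddu_vv_b, 1) above generates roughly
 *
 *   static inline void do_vsaddu_vv_b(void *vd, void *vs1, void *vs2, int i,
 *                                     CPURISCVState *env, int vxrm)
 *   {
 *       uint8_t s1 = *((uint8_t *)vs1 + H1(i));
 *       uint8_t s2 = *((uint8_t *)vs2 + H1(i));
 *       *((uint8_t *)vd + H1(i)) = saddu8(env, vxrm, s2, s1);
 *   }
 *
 *   void helper_vsaddu_vv_b(void *vd, void *v0, void *vs1, void *vs2,
 *                           CPURISCVState *env, uint32_t desc)
 *   {
 *       vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b, 1);
 *   }
 */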
2276 
2277 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2278                           CPURISCVState *env, int vxrm);
2279 
2280 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2281 static inline void                                                  \
2282 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2283           CPURISCVState *env, int vxrm)                             \
2284 {                                                                   \
2285     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2286     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2287 }
2288 
2289 static inline void
2290 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2291              CPURISCVState *env,
2292              uint32_t vl, uint32_t vm, int vxrm,
2293              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2294 {
2295     VSTART_CHECK_EARLY_EXIT(env);
2296 
2297     for (uint32_t i = env->vstart; i < vl; i++) {
2298         if (!vm && !vext_elem_mask(v0, i)) {
2299             /* set masked-off elements to 1s */
2300             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2301             continue;
2302         }
2303         fn(vd, s1, vs2, i, env, vxrm);
2304     }
2305     env->vstart = 0;
2306 }
2307 
2308 static inline void
2309 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2310              CPURISCVState *env,
2311              uint32_t desc,
2312              opivx2_rm_fn *fn, uint32_t esz)
2313 {
2314     uint32_t vm = vext_vm(desc);
2315     uint32_t vl = env->vl;
2316     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2317     uint32_t vta = vext_vta(desc);
2318     uint32_t vma = vext_vma(desc);
2319 
2320     switch (env->vxrm) {
2321     case 0: /* rnu */
2322         vext_vx_rm_1(vd, v0, s1, vs2,
2323                      env, vl, vm, 0, fn, vma, esz);
2324         break;
2325     case 1: /* rne */
2326         vext_vx_rm_1(vd, v0, s1, vs2,
2327                      env, vl, vm, 1, fn, vma, esz);
2328         break;
2329     case 2: /* rdn */
2330         vext_vx_rm_1(vd, v0, s1, vs2,
2331                      env, vl, vm, 2, fn, vma, esz);
2332         break;
2333     default: /* rod */
2334         vext_vx_rm_1(vd, v0, s1, vs2,
2335                      env, vl, vm, 3, fn, vma, esz);
2336         break;
2337     }
2338     /* set tail elements to 1s */
2339     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2340 }
2341 
2342 /* generate helpers for fixed point instructions with OPIVX format */
2343 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2344 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2345                   void *vs2, CPURISCVState *env,          \
2346                   uint32_t desc)                          \
2347 {                                                         \
2348     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2349                  do_##NAME, ESZ);                         \
2350 }
2351 
2352 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2353 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2354 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2355 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2356 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2357 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2358 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2359 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2360 
2361 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2362 {
2363     int8_t res = a + b;
2364     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2365         res = a > 0 ? INT8_MAX : INT8_MIN;
2366         env->vxsat = 0x1;
2367     }
2368     return res;
2369 }
2370 
2371 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2372                              int16_t b)
2373 {
2374     int16_t res = a + b;
2375     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2376         res = a > 0 ? INT16_MAX : INT16_MIN;
2377         env->vxsat = 0x1;
2378     }
2379     return res;
2380 }
2381 
2382 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2383                              int32_t b)
2384 {
2385     int32_t res = a + b;
2386     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2387         res = a > 0 ? INT32_MAX : INT32_MIN;
2388         env->vxsat = 0x1;
2389     }
2390     return res;
2391 }
2392 
2393 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2394                              int64_t b)
2395 {
2396     int64_t res = a + b;
2397     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2398         res = a > 0 ? INT64_MAX : INT64_MIN;
2399         env->vxsat = 0x1;
2400     }
2401     return res;
2402 }
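
/*
 * Illustrative, not used by the helpers: the predicate used above flags
 * signed overflow exactly when both addends share a sign and the wrapped
 * sum has the opposite sign (byte case shown; the narrowing conversion in
 * `res = a + b` is assumed to wrap, as it does on all platforms QEMU
 * supports).
 */
static inline bool sadd8_overflows(int8_t a, int8_t b)
{
    int8_t res = a + b;

    return ((res ^ a) & (res ^ b) & INT8_MIN) != 0;
}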
2403 
2404 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2405 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2406 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2407 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2408 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2409 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2410 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2411 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2412 
2413 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2414 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2415 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2416 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2417 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2418 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2419 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2420 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2421 
2422 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2423                              uint8_t b)
2424 {
2425     uint8_t res = a - b;
2426     if (res > a) {
2427         res = 0;
2428         env->vxsat = 0x1;
2429     }
2430     return res;
2431 }
2432 
2433 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2434                                uint16_t b)
2435 {
2436     uint16_t res = a - b;
2437     if (res > a) {
2438         res = 0;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2445                                uint32_t b)
2446 {
2447     uint32_t res = a - b;
2448     if (res > a) {
2449         res = 0;
2450         env->vxsat = 0x1;
2451     }
2452     return res;
2453 }
2454 
2455 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2456                                uint64_t b)
2457 {
2458     uint64_t res = a - b;
2459     if (res > a) {
2460         res = 0;
2461         env->vxsat = 0x1;
2462     }
2463     return res;
2464 }
2465 
2466 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2467 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2468 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2469 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2470 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2471 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2472 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2473 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2474 
2475 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2476 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2477 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2478 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2479 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2480 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2481 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2482 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2483 
2484 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2485 {
2486     int8_t res = a - b;
2487     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2488         res = a >= 0 ? INT8_MAX : INT8_MIN;
2489         env->vxsat = 0x1;
2490     }
2491     return res;
2492 }
2493 
2494 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2495                              int16_t b)
2496 {
2497     int16_t res = a - b;
2498     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2499         res = a >= 0 ? INT16_MAX : INT16_MIN;
2500         env->vxsat = 0x1;
2501     }
2502     return res;
2503 }
2504 
2505 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2506                              int32_t b)
2507 {
2508     int32_t res = a - b;
2509     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2510         res = a >= 0 ? INT32_MAX : INT32_MIN;
2511         env->vxsat = 0x1;
2512     }
2513     return res;
2514 }
2515 
2516 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2517                              int64_t b)
2518 {
2519     int64_t res = a - b;
2520     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2521         res = a >= 0 ? INT64_MAX : INT64_MIN;
2522         env->vxsat = 0x1;
2523     }
2524     return res;
2525 }
2526 
2527 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2528 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2529 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2530 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2531 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2532 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2533 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2534 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2535 
2536 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2537 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2538 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2539 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2540 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2541 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2542 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2543 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2544 
2545 /* Vector Single-Width Averaging Add and Subtract */
2546 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2547 {
2548     uint8_t d = extract64(v, shift, 1);
2549     uint8_t d1;
2550     uint64_t D1, D2;
2551 
2552     if (shift == 0 || shift > 64) {
2553         return 0;
2554     }
2555 
2556     d1 = extract64(v, shift - 1, 1);
2557     D1 = extract64(v, 0, shift);
2558     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2559         return d1;
2560     } else if (vxrm == 1) { /* round-to-nearest-even */
2561         if (shift > 1) {
2562             D2 = extract64(v, 0, shift - 1);
2563             return d1 & ((D2 != 0) | d);
2564         } else {
2565             return d1 & d;
2566         }
2567     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2568         return !d & (D1 != 0);
2569     }
2570     return 0; /* round-down (truncate) */
2571 }
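
/*
 * Illustrative note: get_round() returns the 0/1 increment to apply
 * after v has been shifted right by 'shift', for the four vxrm
 * fixed-point rounding modes (0 = rnu, 1 = rne, 2 = rdn, 3 = rod).
 * For example, with v = 10 (0b1010) and shift = 2 the exact quotient
 * is 2.5, and the rounded results are:
 *
 *     rnu: round = d1 = 1      ->  (10 >> 2) + 1 = 3
 *     rne: tie, keep even      ->  (10 >> 2) + 0 = 2
 *     rdn: truncate            ->  2
 *     rod: jam into the LSB    ->  3   (result forced odd)
 */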
2572 
2573 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2574                              int32_t b)
2575 {
2576     int64_t res = (int64_t)a + b;
2577     uint8_t round = get_round(vxrm, res, 1);
2578 
2579     return (res >> 1) + round;
2580 }
2581 
2582 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2583                              int64_t b)
2584 {
2585     int64_t res = a + b;
2586     uint8_t round = get_round(vxrm, res, 1);
2587     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2588 
2589     /* With signed overflow, bit 64 is inverse of bit 63. */
2590     return ((res >> 1) ^ over) + round;
2591 }
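
/*
 * Sketch of the overflow trick above: the averaging add is conceptually
 * ((a + b) >> 1) computed with one extra bit of headroom.  When the
 * 64-bit sum wraps, its sign bit is the complement of the true bit 63
 * of the 65-bit sum, so XOR-ing the sign back in after the shift
 * restores it.  In 8-bit terms:
 *
 *     a = b = 100:  true sum = 200, wrapped int8 sum = -56 (0xc8)
 *     (-56 >> 1) = 0xe4,  0xe4 ^ 0x80 = 0x64 = 100 = (100 + 100) / 2
 *
 * Bit 0 of the wrapped sum is unaffected, so get_round() still sees the
 * correct rounding bit.
 */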
2592 
2593 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2594 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2595 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2596 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2597 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2598 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2599 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2600 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2601 
2602 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2603 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2604 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2605 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2606 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2607 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2608 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2609 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2610 
2611 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2612                                uint32_t a, uint32_t b)
2613 {
2614     uint64_t res = (uint64_t)a + b;
2615     uint8_t round = get_round(vxrm, res, 1);
2616 
2617     return (res >> 1) + round;
2618 }
2619 
2620 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2621                                uint64_t a, uint64_t b)
2622 {
2623     uint64_t res = a + b;
2624     uint8_t round = get_round(vxrm, res, 1);
2625     uint64_t over = (uint64_t)(res < a) << 63;
2626 
2627     return ((res >> 1) | over) + round;
2628 }
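
/*
 * Unsigned counterpart of the same trick: the carry out of bit 63
 * (visible as res < a after wrap-around) is OR-ed back in as the new
 * bit 63.  E.g. in 8 bits: 200 + 100 wraps to 44 with carry = 1, and
 * (44 >> 1) | 0x80 = 150 = (200 + 100) / 2.
 */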
2629 
2630 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2631 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2632 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2633 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2634 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2635 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2636 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2637 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2638 
2639 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2640 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2641 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2642 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2643 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2644 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2645 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2646 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2647 
2648 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2649                              int32_t b)
2650 {
2651     int64_t res = (int64_t)a - b;
2652     uint8_t round = get_round(vxrm, res, 1);
2653 
2654     return (res >> 1) + round;
2655 }
2656 
2657 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2658                              int64_t b)
2659 {
2660     int64_t res = (int64_t)a - b;
2661     uint8_t round = get_round(vxrm, res, 1);
2662     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2663 
2664     /* With signed overflow, bit 64 is inverse of bit 63. */
2665     return ((res >> 1) ^ over) + round;
2666 }
2667 
2668 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2669 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2670 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2671 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2672 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2673 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2674 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2675 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2676 
2677 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2678 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2679 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2680 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2681 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2682 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2683 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2684 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2685 
2686 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2687                                uint32_t a, uint32_t b)
2688 {
2689     int64_t res = (int64_t)a - b;
2690     uint8_t round = get_round(vxrm, res, 1);
2691 
2692     return (res >> 1) + round;
2693 }
2694 
2695 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2696                                uint64_t a, uint64_t b)
2697 {
2698     uint64_t res = (uint64_t)a - b;
2699     uint8_t round = get_round(vxrm, res, 1);
2700     uint64_t over = (uint64_t)(res > a) << 63;
2701 
2702     return ((res >> 1) | over) + round;
2703 }
2704 
2705 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2706 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2707 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2708 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2709 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2710 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2711 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2712 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2713 
2714 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2715 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2716 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2717 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2718 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2719 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2720 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2721 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2722 
2723 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2724 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2725 {
2726     uint8_t round;
2727     int16_t res;
2728 
2729     res = (int16_t)a * (int16_t)b;
2730     round = get_round(vxrm, res, 7);
2731     res = (res >> 7) + round;
2732 
2733     if (res > INT8_MAX) {
2734         env->vxsat = 0x1;
2735         return INT8_MAX;
2736     } else if (res < INT8_MIN) {
2737         env->vxsat = 0x1;
2738         return INT8_MIN;
2739     } else {
2740         return res;
2741     }
2742 }
2743 
2744 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2745 {
2746     uint8_t round;
2747     int32_t res;
2748 
2749     res = (int32_t)a * (int32_t)b;
2750     round = get_round(vxrm, res, 15);
2751     res = (res >> 15) + round;
2752 
2753     if (res > INT16_MAX) {
2754         env->vxsat = 0x1;
2755         return INT16_MAX;
2756     } else if (res < INT16_MIN) {
2757         env->vxsat = 0x1;
2758         return INT16_MIN;
2759     } else {
2760         return res;
2761     }
2762 }
2763 
2764 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2765 {
2766     uint8_t round;
2767     int64_t res;
2768 
2769     res = (int64_t)a * (int64_t)b;
2770     round = get_round(vxrm, res, 31);
2771     res = (res >> 31) + round;
2772 
2773     if (res > INT32_MAX) {
2774         env->vxsat = 0x1;
2775         return INT32_MAX;
2776     } else if (res < INT32_MIN) {
2777         env->vxsat = 0x1;
2778         return INT32_MIN;
2779     } else {
2780         return res;
2781     }
2782 }
2783 
2784 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2785 {
2786     uint8_t round;
2787     uint64_t hi_64, lo_64;
2788     int64_t res;
2789 
2790     if (a == INT64_MIN && b == INT64_MIN) {
2791         env->vxsat = 1;
2792         return INT64_MAX;
2793     }
2794 
2795     muls64(&lo_64, &hi_64, a, b);
2796     round = get_round(vxrm, lo_64, 63);
2797     /*
2798      * Cannot overflow, as there are always
2799      * 2 sign bits after multiply.
2800      */
2801     res = (hi_64 << 1) | (lo_64 >> 63);
2802     if (round) {
2803         if (res == INT64_MAX) {
2804             env->vxsat = 1;
2805         } else {
2806             res += 1;
2807         }
2808     }
2809     return res;
2810 }
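
/*
 * Illustrative note: vsmul treats its SEW-wide operands as signed
 * fixed-point fractions with SEW - 1 fraction bits, so the scaled
 * product is (a * b) >> (SEW - 1) with vxrm rounding.  The only value
 * that cannot be represented is (-1.0) * (-1.0) = +1.0, hence the
 * explicit INT64_MIN check here (the narrower helpers catch the same
 * case through their widened compares).  A small 8-bit Q7 sketch,
 * assuming round-to-nearest-up:
 *
 *     vsmul8(-128, -128): 16384 >> 7 = 128 > INT8_MAX -> 127, vxsat = 1
 *     vsmul8(  64,   64):  4096 >> 7 =  32            (0.5 * 0.5 = 0.25)
 */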
2811 
2812 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2813 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2814 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2815 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2816 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2817 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2818 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2819 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2820 
2821 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2822 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2823 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2824 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2825 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2826 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2827 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2828 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2829 
2830 /* Vector Single-Width Scaling Shift Instructions */
2831 static inline uint8_t
2832 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2833 {
2834     uint8_t round, shift = b & 0x7;
2835     uint8_t res;
2836 
2837     round = get_round(vxrm, a, shift);
2838     res = (a >> shift) + round;
2839     return res;
2840 }

2841 static inline uint16_t
2842 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2843 {
2844     uint8_t round, shift = b & 0xf;
2845 
2846     round = get_round(vxrm, a, shift);
2847     return (a >> shift) + round;
2848 }

2849 static inline uint32_t
2850 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2851 {
2852     uint8_t round, shift = b & 0x1f;
2853 
2854     round = get_round(vxrm, a, shift);
2855     return (a >> shift) + round;
2856 }

2857 static inline uint64_t
2858 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2859 {
2860     uint8_t round, shift = b & 0x3f;
2861 
2862     round = get_round(vxrm, a, shift);
2863     return (a >> shift) + round;
2864 }
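
/*
 * Illustrative note: the scaling shifts mask the shift amount to
 * log2(SEW) bits and round the shifted-out bits according to vxrm via
 * get_round().  E.g. for vssrl8 with round-to-nearest-up:
 *
 *     vssrl8(a = 151, b = 3):  (151 >> 3) + 1 = 19   (151 / 8 = 18.875)
 */
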
2865 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2866 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2867 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2868 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2869 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2870 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2871 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2872 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2873 
2874 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2875 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2876 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2877 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2878 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2879 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2880 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2881 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2882 
2883 static inline int8_t
2884 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2885 {
2886     uint8_t round, shift = b & 0x7;
2887 
2888     round = get_round(vxrm, a, shift);
2889     return (a >> shift) + round;
2890 }

2891 static inline int16_t
2892 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2893 {
2894     uint8_t round, shift = b & 0xf;
2895 
2896     round = get_round(vxrm, a, shift);
2897     return (a >> shift) + round;
2898 }

2899 static inline int32_t
2900 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2901 {
2902     uint8_t round, shift = b & 0x1f;
2903 
2904     round = get_round(vxrm, a, shift);
2905     return (a >> shift) + round;
2906 }

2907 static inline int64_t
2908 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2909 {
2910     uint8_t round, shift = b & 0x3f;
2911 
2912     round = get_round(vxrm, a, shift);
2913     return (a >> shift) + round;
2914 }
2915 
2916 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2917 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2918 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2919 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2920 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2921 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2922 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2923 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2924 
2925 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2926 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2927 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2928 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2929 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2930 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2931 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2932 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2933 
2934 /* Vector Narrowing Fixed-Point Clip Instructions */
2935 static inline int8_t
2936 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2937 {
2938     uint8_t round, shift = b & 0xf;
2939     int16_t res;
2940 
2941     round = get_round(vxrm, a, shift);
2942     res = (a >> shift) + round;
2943     if (res > INT8_MAX) {
2944         env->vxsat = 0x1;
2945         return INT8_MAX;
2946     } else if (res < INT8_MIN) {
2947         env->vxsat = 0x1;
2948         return INT8_MIN;
2949     } else {
2950         return res;
2951     }
2952 }
2953 
2954 static inline int16_t
2955 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2956 {
2957     uint8_t round, shift = b & 0x1f;
2958     int32_t res;
2959 
2960     round = get_round(vxrm, a, shift);
2961     res = (a >> shift) + round;
2962     if (res > INT16_MAX) {
2963         env->vxsat = 0x1;
2964         return INT16_MAX;
2965     } else if (res < INT16_MIN) {
2966         env->vxsat = 0x1;
2967         return INT16_MIN;
2968     } else {
2969         return res;
2970     }
2971 }
2972 
2973 static inline int32_t
2974 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2975 {
2976     uint8_t round, shift = b & 0x3f;
2977     int64_t res;
2978 
2979     round = get_round(vxrm, a, shift);
2980     res = (a >> shift) + round;
2981     if (res > INT32_MAX) {
2982         env->vxsat = 0x1;
2983         return INT32_MAX;
2984     } else if (res < INT32_MIN) {
2985         env->vxsat = 0x1;
2986         return INT32_MIN;
2987     } else {
2988         return res;
2989     }
2990 }
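
/*
 * Illustrative note: the narrowing clips take a 2*SEW-wide source
 * element, shift it right by the low log2(2*SEW) bits of the shift
 * operand with vxrm rounding, and then saturate into the SEW-wide
 * signed range, setting vxsat when a clip occurs.  E.g. for vnclip8
 * with round-to-nearest-up:
 *
 *     vnclip8(a = 1000, shift = 2): 1000 >> 2 = 250 > INT8_MAX
 *                                   -> 127, vxsat = 1
 *     vnclip8(a =  300, shift = 4): (300 >> 4) + 1 = 19
 */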
2991 
2992 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2993 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2994 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2995 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2996 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2997 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2998 
2999 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3000 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3001 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3002 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3003 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3004 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3005 
3006 static inline uint8_t
3007 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3008 {
3009     uint8_t round, shift = b & 0xf;
3010     uint16_t res;
3011 
3012     round = get_round(vxrm, a, shift);
3013     res = (a >> shift) + round;
3014     if (res > UINT8_MAX) {
3015         env->vxsat = 0x1;
3016         return UINT8_MAX;
3017     } else {
3018         return res;
3019     }
3020 }
3021 
3022 static inline uint16_t
3023 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3024 {
3025     uint8_t round, shift = b & 0x1f;
3026     uint32_t res;
3027 
3028     round = get_round(vxrm, a, shift);
3029     res = (a >> shift) + round;
3030     if (res > UINT16_MAX) {
3031         env->vxsat = 0x1;
3032         return UINT16_MAX;
3033     } else {
3034         return res;
3035     }
3036 }
3037 
3038 static inline uint32_t
3039 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3040 {
3041     uint8_t round, shift = b & 0x3f;
3042     uint64_t res;
3043 
3044     round = get_round(vxrm, a, shift);
3045     res = (a >> shift) + round;
3046     if (res > UINT32_MAX) {
3047         env->vxsat = 0x1;
3048         return UINT32_MAX;
3049     } else {
3050         return res;
3051     }
3052 }
3053 
3054 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3055 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3056 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3057 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3058 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3059 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3060 
3061 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3062 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3063 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3064 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3065 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3066 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3067 
3068 /*
3069  * Vector Floating-Point Arithmetic Instructions
3070  */
3071 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3072 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3073 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3074                       CPURISCVState *env)                      \
3075 {                                                              \
3076     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3077     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3078     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3079 }
3080 
3081 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3082 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3083                   void *vs2, CPURISCVState *env,          \
3084                   uint32_t desc)                          \
3085 {                                                         \
3086     uint32_t vm = vext_vm(desc);                          \
3087     uint32_t vl = env->vl;                                \
3088     uint32_t total_elems =                                \
3089         vext_get_total_elems(env, desc, ESZ);             \
3090     uint32_t vta = vext_vta(desc);                        \
3091     uint32_t vma = vext_vma(desc);                        \
3092     uint32_t i;                                           \
3093                                                           \
3094     VSTART_CHECK_EARLY_EXIT(env);                         \
3095                                                           \
3096     for (i = env->vstart; i < vl; i++) {                  \
3097         if (!vm && !vext_elem_mask(v0, i)) {              \
3098             /* set masked-off elements to 1s */           \
3099             vext_set_elems_1s(vd, vma, i * ESZ,           \
3100                               (i + 1) * ESZ);             \
3101             continue;                                     \
3102         }                                                 \
3103         do_##NAME(vd, vs1, vs2, i, env);                  \
3104     }                                                     \
3105     env->vstart = 0;                                      \
3106     /* set tail elements to 1s */                         \
3107     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3108                       total_elems * ESZ);                 \
3109 }
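
/*
 * Informational sketch: for each instruction these macros stamp out a
 * per-element helper plus the top-level HELPER() loop.  Assuming
 * OP_UUU_H names uint16_t for every element type and H2 is the usual
 * host-endian index helper,
 *
 *     RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
 *
 * expands to roughly:
 *
 *     static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                               CPURISCVState *env)
 *     {
 *         uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *         *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *     }
 *
 * while GEN_VEXT_VV_ENV(vfadd_vv_h, 2) emits HELPER(vfadd_vv_h), which
 * walks the active elements from vstart to vl, applies the mask/tail
 * agnostic policies, and calls do_vfadd_vv_h() for each element.
 */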
3110 
3111 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3112 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3113 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3114 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3115 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3116 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3117 
3118 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3119 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3120                       CPURISCVState *env)                      \
3121 {                                                              \
3122     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3123     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3124 }
3125 
3126 #define GEN_VEXT_VF(NAME, ESZ)                            \
3127 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3128                   void *vs2, CPURISCVState *env,          \
3129                   uint32_t desc)                          \
3130 {                                                         \
3131     uint32_t vm = vext_vm(desc);                          \
3132     uint32_t vl = env->vl;                                \
3133     uint32_t total_elems =                                \
3134         vext_get_total_elems(env, desc, ESZ);             \
3135     uint32_t vta = vext_vta(desc);                        \
3136     uint32_t vma = vext_vma(desc);                        \
3137     uint32_t i;                                           \
3138                                                           \
3139     VSTART_CHECK_EARLY_EXIT(env);                         \
3140                                                           \
3141     for (i = env->vstart; i < vl; i++) {                  \
3142         if (!vm && !vext_elem_mask(v0, i)) {              \
3143             /* set masked-off elements to 1s */           \
3144             vext_set_elems_1s(vd, vma, i * ESZ,           \
3145                               (i + 1) * ESZ);             \
3146             continue;                                     \
3147         }                                                 \
3148         do_##NAME(vd, s1, vs2, i, env);                   \
3149     }                                                     \
3150     env->vstart = 0;                                      \
3151     /* set tail elements to 1s */                         \
3152     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3153                       total_elems * ESZ);                 \
3154 }
3155 
3156 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3157 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3158 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3159 GEN_VEXT_VF(vfadd_vf_h, 2)
3160 GEN_VEXT_VF(vfadd_vf_w, 4)
3161 GEN_VEXT_VF(vfadd_vf_d, 8)
3162 
3163 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3164 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3165 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3166 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3167 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3168 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3169 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3170 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3171 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3172 GEN_VEXT_VF(vfsub_vf_h, 2)
3173 GEN_VEXT_VF(vfsub_vf_w, 4)
3174 GEN_VEXT_VF(vfsub_vf_d, 8)
3175 
3176 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3177 {
3178     return float16_sub(b, a, s);
3179 }
3180 
3181 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3182 {
3183     return float32_sub(b, a, s);
3184 }
3185 
3186 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3187 {
3188     return float64_sub(b, a, s);
3189 }
3190 
3191 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3192 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3193 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3194 GEN_VEXT_VF(vfrsub_vf_h, 2)
3195 GEN_VEXT_VF(vfrsub_vf_w, 4)
3196 GEN_VEXT_VF(vfrsub_vf_d, 8)
3197 
3198 /* Vector Widening Floating-Point Add/Subtract Instructions */
3199 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3200 {
3201     return float32_add(float16_to_float32(a, true, s),
3202                        float16_to_float32(b, true, s), s);
3203 }
3204 
3205 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3206 {
3207     return float64_add(float32_to_float64(a, s),
3208                        float32_to_float64(b, s), s);
3210 }
3211 
3212 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3213 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3214 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3215 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3216 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3217 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3218 GEN_VEXT_VF(vfwadd_vf_h, 4)
3219 GEN_VEXT_VF(vfwadd_vf_w, 8)
3220 
3221 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3222 {
3223     return float32_sub(float16_to_float32(a, true, s),
3224                        float16_to_float32(b, true, s), s);
3225 }
3226 
3227 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3228 {
3229     return float64_sub(float32_to_float64(a, s),
3230                        float32_to_float64(b, s), s);
3232 }
3233 
3234 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3235 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3236 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3237 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3238 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3239 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3240 GEN_VEXT_VF(vfwsub_vf_h, 4)
3241 GEN_VEXT_VF(vfwsub_vf_w, 8)
3242 
3243 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3244 {
3245     return float32_add(a, float16_to_float32(b, true, s), s);
3246 }
3247 
3248 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3249 {
3250     return float64_add(a, float32_to_float64(b, s), s);
3251 }
3252 
3253 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3254 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3255 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3256 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3257 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3258 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3259 GEN_VEXT_VF(vfwadd_wf_h, 4)
3260 GEN_VEXT_VF(vfwadd_wf_w, 8)
3261 
3262 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3263 {
3264     return float32_sub(a, float16_to_float32(b, true, s), s);
3265 }
3266 
3267 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3268 {
3269     return float64_sub(a, float32_to_float64(b, s), s);
3270 }
3271 
3272 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3273 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3274 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3275 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3276 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3277 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3278 GEN_VEXT_VF(vfwsub_wf_h, 4)
3279 GEN_VEXT_VF(vfwsub_wf_w, 8)
3280 
3281 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3282 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3283 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3284 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3285 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3286 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3287 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3288 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3289 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3290 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3291 GEN_VEXT_VF(vfmul_vf_h, 2)
3292 GEN_VEXT_VF(vfmul_vf_w, 4)
3293 GEN_VEXT_VF(vfmul_vf_d, 8)
3294 
3295 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3296 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3297 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3298 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3299 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3300 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3301 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3302 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3303 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3304 GEN_VEXT_VF(vfdiv_vf_h, 2)
3305 GEN_VEXT_VF(vfdiv_vf_w, 4)
3306 GEN_VEXT_VF(vfdiv_vf_d, 8)
3307 
3308 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3309 {
3310     return float16_div(b, a, s);
3311 }
3312 
3313 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3314 {
3315     return float32_div(b, a, s);
3316 }
3317 
3318 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3319 {
3320     return float64_div(b, a, s);
3321 }
3322 
3323 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3324 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3325 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3326 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3327 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3328 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3329 
3330 /* Vector Widening Floating-Point Multiply */
3331 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3332 {
3333     return float32_mul(float16_to_float32(a, true, s),
3334                        float16_to_float32(b, true, s), s);
3335 }
3336 
3337 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3338 {
3339     return float64_mul(float32_to_float64(a, s),
3340                        float32_to_float64(b, s), s);
3342 }
3343 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3344 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3345 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3346 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3347 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3348 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3349 GEN_VEXT_VF(vfwmul_vf_h, 4)
3350 GEN_VEXT_VF(vfwmul_vf_w, 8)
3351 
3352 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3353 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3354 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3355                       CPURISCVState *env)                          \
3356 {                                                                  \
3357     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3358     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3359     TD d = *((TD *)vd + HD(i));                                    \
3360     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3361 }
3362 
3363 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3364 {
3365     return float16_muladd(a, b, d, 0, s);
3366 }
3367 
3368 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3369 {
3370     return float32_muladd(a, b, d, 0, s);
3371 }
3372 
3373 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3374 {
3375     return float64_muladd(a, b, d, 0, s);
3376 }
3377 
3378 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3379 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3380 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3381 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3382 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3383 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3384 
3385 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3386 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3387                       CPURISCVState *env)                         \
3388 {                                                                 \
3389     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3390     TD d = *((TD *)vd + HD(i));                                   \
3391     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3392 }
3393 
3394 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3395 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3396 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3397 GEN_VEXT_VF(vfmacc_vf_h, 2)
3398 GEN_VEXT_VF(vfmacc_vf_w, 4)
3399 GEN_VEXT_VF(vfmacc_vf_d, 8)
3400 
3401 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3402 {
3403     return float16_muladd(a, b, d, float_muladd_negate_c |
3404                                    float_muladd_negate_product, s);
3405 }
3406 
3407 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3408 {
3409     return float32_muladd(a, b, d, float_muladd_negate_c |
3410                                    float_muladd_negate_product, s);
3411 }
3412 
3413 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3414 {
3415     return float64_muladd(a, b, d, float_muladd_negate_c |
3416                                    float_muladd_negate_product, s);
3417 }
3418 
3419 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3420 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3421 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3422 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3423 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3424 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3425 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3426 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3427 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3428 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3429 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3430 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3431 
3432 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3433 {
3434     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3435 }
3436 
3437 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3438 {
3439     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3440 }
3441 
3442 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3443 {
3444     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3445 }
3446 
3447 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3448 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3449 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3450 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3451 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3452 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3453 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3454 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3455 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3456 GEN_VEXT_VF(vfmsac_vf_h, 2)
3457 GEN_VEXT_VF(vfmsac_vf_w, 4)
3458 GEN_VEXT_VF(vfmsac_vf_d, 8)
3459 
3460 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3461 {
3462     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3463 }
3464 
3465 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3466 {
3467     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3468 }
3469 
3470 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3471 {
3472     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3473 }
3474 
3475 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3476 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3477 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3478 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3479 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3480 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3481 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3482 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3483 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3484 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3485 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3486 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3487 
3488 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3489 {
3490     return float16_muladd(d, b, a, 0, s);
3491 }
3492 
3493 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3494 {
3495     return float32_muladd(d, b, a, 0, s);
3496 }
3497 
3498 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3499 {
3500     return float64_muladd(d, b, a, 0, s);
3501 }
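
/*
 * Note on operand order: OPFVV3/OPFVF3 invoke OP(s2, s1, d), so the
 * fmacc/fmsac helpers use the destination element as the addend,
 *
 *     vfmacc.vv vd, vs1, vs2   =>   vd[i] = (vs1[i] * vs2[i]) + vd[i]
 *
 * whereas the fmadd/fmsub helpers route the destination into the
 * product instead:
 *
 *     vfmadd.vv vd, vs1, vs2   =>   vd[i] = (vs1[i] * vd[i]) + vs2[i]
 */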
3502 
3503 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3504 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3505 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3506 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3507 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3508 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3509 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3510 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3511 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3512 GEN_VEXT_VF(vfmadd_vf_h, 2)
3513 GEN_VEXT_VF(vfmadd_vf_w, 4)
3514 GEN_VEXT_VF(vfmadd_vf_d, 8)
3515 
3516 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3517 {
3518     return float16_muladd(d, b, a, float_muladd_negate_c |
3519                                    float_muladd_negate_product, s);
3520 }
3521 
3522 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3523 {
3524     return float32_muladd(d, b, a, float_muladd_negate_c |
3525                                    float_muladd_negate_product, s);
3526 }
3527 
3528 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3529 {
3530     return float64_muladd(d, b, a, float_muladd_negate_c |
3531                                    float_muladd_negate_product, s);
3532 }
3533 
3534 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3535 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3536 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3537 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3538 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3539 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3540 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3541 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3542 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3543 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3544 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3545 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3546 
3547 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3548 {
3549     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3550 }
3551 
3552 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3553 {
3554     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3555 }
3556 
3557 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3558 {
3559     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3560 }
3561 
3562 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3563 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3564 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3565 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3566 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3567 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3568 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3569 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3570 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3571 GEN_VEXT_VF(vfmsub_vf_h, 2)
3572 GEN_VEXT_VF(vfmsub_vf_w, 4)
3573 GEN_VEXT_VF(vfmsub_vf_d, 8)
3574 
3575 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3576 {
3577     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3578 }
3579 
3580 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3581 {
3582     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3583 }
3584 
3585 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3586 {
3587     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3588 }
3589 
3590 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3591 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3592 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3593 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3594 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3595 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3596 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3597 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3598 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3599 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3600 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3601 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3602 
3603 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3604 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3605 {
3606     return float32_muladd(float16_to_float32(a, true, s),
3607                           float16_to_float32(b, true, s), d, 0, s);
3608 }
3609 
3610 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3611 {
3612     return float64_muladd(float32_to_float64(a, s),
3613                           float32_to_float64(b, s), d, 0, s);
3614 }
3615 
3616 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3617 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3618 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3619 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3620 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3621 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3622 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3623 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3624 
3625 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3626 {
3627     return float32_muladd(bfloat16_to_float32(a, s),
3628                           bfloat16_to_float32(b, s), d, 0, s);
3629 }
3630 
3631 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3632 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3633 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3634 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3635 
3636 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3637 {
3638     return float32_muladd(float16_to_float32(a, true, s),
3639                           float16_to_float32(b, true, s), d,
3640                           float_muladd_negate_c | float_muladd_negate_product,
3641                           s);
3642 }
3643 
3644 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3645 {
3646     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3647                           d, float_muladd_negate_c |
3648                              float_muladd_negate_product, s);
3649 }
3650 
3651 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3652 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3653 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3654 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3655 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3656 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3657 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3658 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3659 
3660 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3661 {
3662     return float32_muladd(float16_to_float32(a, true, s),
3663                           float16_to_float32(b, true, s), d,
3664                           float_muladd_negate_c, s);
3665 }
3666 
3667 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3668 {
3669     return float64_muladd(float32_to_float64(a, s),
3670                           float32_to_float64(b, s), d,
3671                           float_muladd_negate_c, s);
3672 }
3673 
3674 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3675 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3676 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3677 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3678 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3679 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3680 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3681 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3682 
3683 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3684 {
3685     return float32_muladd(float16_to_float32(a, true, s),
3686                           float16_to_float32(b, true, s), d,
3687                           float_muladd_negate_product, s);
3688 }
3689 
3690 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3691 {
3692     return float64_muladd(float32_to_float64(a, s),
3693                           float32_to_float64(b, s), d,
3694                           float_muladd_negate_product, s);
3695 }
3696 
3697 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3698 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3699 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3700 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3701 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3702 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3703 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3704 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3705 
3706 /* Vector Floating-Point Square-Root Instruction */
3707 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3708 static void do_##NAME(void *vd, void *vs2, int i,      \
3709                       CPURISCVState *env)              \
3710 {                                                      \
3711     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3712     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3713 }
3714 
3715 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3716 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3717                   CPURISCVState *env, uint32_t desc)   \
3718 {                                                      \
3719     uint32_t vm = vext_vm(desc);                       \
3720     uint32_t vl = env->vl;                             \
3721     uint32_t total_elems =                             \
3722         vext_get_total_elems(env, desc, ESZ);          \
3723     uint32_t vta = vext_vta(desc);                     \
3724     uint32_t vma = vext_vma(desc);                     \
3725     uint32_t i;                                        \
3726                                                        \
3727     VSTART_CHECK_EARLY_EXIT(env);                      \
3728                                                        \
3729     if (vl == 0) {                                     \
3730         return;                                        \
3731     }                                                  \
3732     for (i = env->vstart; i < vl; i++) {               \
3733         if (!vm && !vext_elem_mask(v0, i)) {           \
3734             /* set masked-off elements to 1s */        \
3735             vext_set_elems_1s(vd, vma, i * ESZ,        \
3736                               (i + 1) * ESZ);          \
3737             continue;                                  \
3738         }                                              \
3739         do_##NAME(vd, vs2, i, env);                    \
3740     }                                                  \
3741     env->vstart = 0;                                   \
3742     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3743                       total_elems * ESZ);              \
3744 }
3745 
3746 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3747 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3748 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3749 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3750 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3751 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3752 
3753 /*
3754  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3755  *
3756  * Adapted from riscv-v-spec recip.c:
3757  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3758  */
3759 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3760 {
3761     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3762     uint64_t exp = extract64(f, frac_size, exp_size);
3763     uint64_t frac = extract64(f, 0, frac_size);
3764 
3765     const uint8_t lookup_table[] = {
3766         52, 51, 50, 48, 47, 46, 44, 43,
3767         42, 41, 40, 39, 38, 36, 35, 34,
3768         33, 32, 31, 30, 30, 29, 28, 27,
3769         26, 25, 24, 23, 23, 22, 21, 20,
3770         19, 19, 18, 17, 16, 16, 15, 14,
3771         14, 13, 12, 12, 11, 10, 10, 9,
3772         9, 8, 7, 7, 6, 6, 5, 4,
3773         4, 3, 3, 2, 2, 1, 1, 0,
3774         127, 125, 123, 121, 119, 118, 116, 114,
3775         113, 111, 109, 108, 106, 105, 103, 102,
3776         100, 99, 97, 96, 95, 93, 92, 91,
3777         90, 88, 87, 86, 85, 84, 83, 82,
3778         80, 79, 78, 77, 76, 75, 74, 73,
3779         72, 71, 70, 70, 69, 68, 67, 66,
3780         65, 64, 63, 63, 62, 61, 60, 59,
3781         59, 58, 57, 56, 56, 55, 54, 53
3782     };
3783     const int precision = 7;
3784 
3785     if (exp == 0 && frac != 0) { /* subnormal */
3786         /* Normalize the subnormal. */
3787         while (extract64(frac, frac_size - 1, 1) == 0) {
3788             exp--;
3789             frac <<= 1;
3790         }
3791 
3792         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3793     }
3794 
3795     int idx = ((exp & 1) << (precision - 1)) |
3796               (frac >> (frac_size - precision + 1));
3797     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3798                         (frac_size - precision);
3799     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3800 
3801     uint64_t val = 0;
3802     val = deposit64(val, 0, frac_size, out_frac);
3803     val = deposit64(val, frac_size, exp_size, out_exp);
3804     val = deposit64(val, frac_size + exp_size, 1, sign);
3805     return val;
3806 }
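
/*
 * Illustrative note: after normalizing subnormals, the estimate is
 * driven by the 128-entry table above, indexed with the exponent's low
 * bit and the top six fraction bits, and the output exponent is
 * (3 * bias - 1 - exp) / 2.  E.g. for float32 f = 4.0
 * (exp = 129, frac = 0):
 *
 *     idx     = ((129 & 1) << 6) | 0 = 64,  lookup_table[64] = 127
 *     out_exp = (3 * 127 - 129 - 1) / 2 = 125
 *     result  = 2^(125 - 127) * (1 + 127/128) ~= 0.498  (1/sqrt(4) = 0.5)
 *
 * i.e. the estimate carries about 7 significand bits, as the
 * instruction name suggests.
 */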
3807 
3808 static float16 frsqrt7_h(float16 f, float_status *s)
3809 {
3810     int exp_size = 5, frac_size = 10;
3811     bool sign = float16_is_neg(f);
3812 
3813     /*
3814      * frsqrt7(sNaN) = canonical NaN
3815      * frsqrt7(-inf) = canonical NaN
3816      * frsqrt7(-normal) = canonical NaN
3817      * frsqrt7(-subnormal) = canonical NaN
3818      */
3819     if (float16_is_signaling_nan(f, s) ||
3820         (float16_is_infinity(f) && sign) ||
3821         (float16_is_normal(f) && sign) ||
3822         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3823         s->float_exception_flags |= float_flag_invalid;
3824         return float16_default_nan(s);
3825     }
3826 
3827     /* frsqrt7(qNaN) = canonical NaN */
3828     if (float16_is_quiet_nan(f, s)) {
3829         return float16_default_nan(s);
3830     }
3831 
3832     /* frsqrt7(+-0) = +-inf */
3833     if (float16_is_zero(f)) {
3834         s->float_exception_flags |= float_flag_divbyzero;
3835         return float16_set_sign(float16_infinity, sign);
3836     }
3837 
3838     /* frsqrt7(+inf) = +0 */
3839     if (float16_is_infinity(f) && !sign) {
3840         return float16_set_sign(float16_zero, sign);
3841     }
3842 
3843     /* +normal, +subnormal */
3844     uint64_t val = frsqrt7(f, exp_size, frac_size);
3845     return make_float16(val);
3846 }
3847 
3848 static float32 frsqrt7_s(float32 f, float_status *s)
3849 {
3850     int exp_size = 8, frac_size = 23;
3851     bool sign = float32_is_neg(f);
3852 
3853     /*
3854      * frsqrt7(sNaN) = canonical NaN
3855      * frsqrt7(-inf) = canonical NaN
3856      * frsqrt7(-normal) = canonical NaN
3857      * frsqrt7(-subnormal) = canonical NaN
3858      */
3859     if (float32_is_signaling_nan(f, s) ||
3860         (float32_is_infinity(f) && sign) ||
3861         (float32_is_normal(f) && sign) ||
3862         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3863         s->float_exception_flags |= float_flag_invalid;
3864         return float32_default_nan(s);
3865     }
3866 
3867     /* frsqrt7(qNaN) = canonical NaN */
3868     if (float32_is_quiet_nan(f, s)) {
3869         return float32_default_nan(s);
3870     }
3871 
3872     /* frsqrt7(+-0) = +-inf */
3873     if (float32_is_zero(f)) {
3874         s->float_exception_flags |= float_flag_divbyzero;
3875         return float32_set_sign(float32_infinity, sign);
3876     }
3877 
3878     /* frsqrt7(+inf) = +0 */
3879     if (float32_is_infinity(f) && !sign) {
3880         return float32_set_sign(float32_zero, sign);
3881     }
3882 
3883     /* +normal, +subnormal */
3884     uint64_t val = frsqrt7(f, exp_size, frac_size);
3885     return make_float32(val);
3886 }
3887 
3888 static float64 frsqrt7_d(float64 f, float_status *s)
3889 {
3890     int exp_size = 11, frac_size = 52;
3891     bool sign = float64_is_neg(f);
3892 
3893     /*
3894      * frsqrt7(sNaN) = canonical NaN
3895      * frsqrt7(-inf) = canonical NaN
3896      * frsqrt7(-normal) = canonical NaN
3897      * frsqrt7(-subnormal) = canonical NaN
3898      */
3899     if (float64_is_signaling_nan(f, s) ||
3900         (float64_is_infinity(f) && sign) ||
3901         (float64_is_normal(f) && sign) ||
3902         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3903         s->float_exception_flags |= float_flag_invalid;
3904         return float64_default_nan(s);
3905     }
3906 
3907     /* frsqrt7(qNaN) = canonical NaN */
3908     if (float64_is_quiet_nan(f, s)) {
3909         return float64_default_nan(s);
3910     }
3911 
3912     /* frsqrt7(+-0) = +-inf */
3913     if (float64_is_zero(f)) {
3914         s->float_exception_flags |= float_flag_divbyzero;
3915         return float64_set_sign(float64_infinity, sign);
3916     }
3917 
3918     /* frsqrt7(+inf) = +0 */
3919     if (float64_is_infinity(f) && !sign) {
3920         return float64_set_sign(float64_zero, sign);
3921     }
3922 
3923     /* +normal, +subnormal */
3924     uint64_t val = frsqrt7(f, exp_size, frac_size);
3925     return make_float64(val);
3926 }
3927 
3928 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3929 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3930 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3931 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3932 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3933 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3934 
3935 /*
3936  * Vector Floating-Point Reciprocal Estimate Instruction
3937  *
3938  * Adapted from riscv-v-spec recip.c:
3939  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3940  */
3941 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3942                       float_status *s)
3943 {
3944     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3945     uint64_t exp = extract64(f, frac_size, exp_size);
3946     uint64_t frac = extract64(f, 0, frac_size);
3947 
3948     const uint8_t lookup_table[] = {
3949         127, 125, 123, 121, 119, 117, 116, 114,
3950         112, 110, 109, 107, 105, 104, 102, 100,
3951         99, 97, 96, 94, 93, 91, 90, 88,
3952         87, 85, 84, 83, 81, 80, 79, 77,
3953         76, 75, 74, 72, 71, 70, 69, 68,
3954         66, 65, 64, 63, 62, 61, 60, 59,
3955         58, 57, 56, 55, 54, 53, 52, 51,
3956         50, 49, 48, 47, 46, 45, 44, 43,
3957         42, 41, 40, 40, 39, 38, 37, 36,
3958         35, 35, 34, 33, 32, 31, 31, 30,
3959         29, 28, 28, 27, 26, 25, 25, 24,
3960         23, 23, 22, 21, 21, 20, 19, 19,
3961         18, 17, 17, 16, 15, 15, 14, 14,
3962         13, 12, 12, 11, 11, 10, 9, 9,
3963         8, 8, 7, 7, 6, 5, 5, 4,
3964         4, 3, 3, 2, 2, 1, 1, 0
3965     };
3966     const int precision = 7;
3967 
3968     if (exp == 0 && frac != 0) { /* subnormal */
3969         /* Normalize the subnormal. */
3970         while (extract64(frac, frac_size - 1, 1) == 0) {
3971             exp--;
3972             frac <<= 1;
3973         }
3974 
3975         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3976 
3977         if (exp != 0 && exp != UINT64_MAX) {
3978             /*
3979              * Overflow to inf or max value of same sign,
3980              * depending on sign and rounding mode.
3981              */
3982             s->float_exception_flags |= (float_flag_inexact |
3983                                          float_flag_overflow);
3984 
3985             if ((s->float_rounding_mode == float_round_to_zero) ||
3986                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3987                 ((s->float_rounding_mode == float_round_up) && sign)) {
3988                 /* Return the greatest-magnitude finite value of the same sign. */
3989                 return (sign << (exp_size + frac_size)) |
3990                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3991             } else {
3992                 /* Return +-inf. */
3993                 return (sign << (exp_size + frac_size)) |
3994                        MAKE_64BIT_MASK(frac_size, exp_size);
3995             }
3996         }
3997     }
3998 
3999     int idx = frac >> (frac_size - precision);
4000     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4001                         (frac_size - precision);
4002     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4003 
4004     if (out_exp == 0 || out_exp == UINT64_MAX) {
4005         /*
4006          * The result is subnormal, but don't raise the underflow exception,
4007          * because there's no additional loss of precision.
4008          */
4009         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4010         if (out_exp == UINT64_MAX) {
4011             out_frac >>= 1;
4012             out_exp = 0;
4013         }
4014     }
4015 
4016     uint64_t val = 0;
4017     val = deposit64(val, 0, frac_size, out_frac);
4018     val = deposit64(val, frac_size, exp_size, out_exp);
4019     val = deposit64(val, frac_size + exp_size, 1, sign);
4020     return val;
4021 }
4022 
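/*
 * Worked example (binary32): for f = 2.0f (0x40000000), sign = 0,
 * exp = 128 and frac = 0, so idx = 0, out_frac = 127 << 16 and
 * out_exp = 2 * 127 + ~128 (two's complement) = 253 - 128 = 125.
 * The packed result is 0x3eff0000 ~= 0.498, a 7-bit estimate of 1/2.0.
 */
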
4023 static float16 frec7_h(float16 f, float_status *s)
4024 {
4025     int exp_size = 5, frac_size = 10;
4026     bool sign = float16_is_neg(f);
4027 
4028     /* frec7(+-inf) = +-0 */
4029     if (float16_is_infinity(f)) {
4030         return float16_set_sign(float16_zero, sign);
4031     }
4032 
4033     /* frec7(+-0) = +-inf */
4034     if (float16_is_zero(f)) {
4035         s->float_exception_flags |= float_flag_divbyzero;
4036         return float16_set_sign(float16_infinity, sign);
4037     }
4038 
4039     /* frec7(sNaN) = canonical NaN */
4040     if (float16_is_signaling_nan(f, s)) {
4041         s->float_exception_flags |= float_flag_invalid;
4042         return float16_default_nan(s);
4043     }
4044 
4045     /* frec7(qNaN) = canonical NaN */
4046     if (float16_is_quiet_nan(f, s)) {
4047         return float16_default_nan(s);
4048     }
4049 
4050     /* +-normal, +-subnormal */
4051     uint64_t val = frec7(f, exp_size, frac_size, s);
4052     return make_float16(val);
4053 }
4054 
4055 static float32 frec7_s(float32 f, float_status *s)
4056 {
4057     int exp_size = 8, frac_size = 23;
4058     bool sign = float32_is_neg(f);
4059 
4060     /* frec7(+-inf) = +-0 */
4061     if (float32_is_infinity(f)) {
4062         return float32_set_sign(float32_zero, sign);
4063     }
4064 
4065     /* frec7(+-0) = +-inf */
4066     if (float32_is_zero(f)) {
4067         s->float_exception_flags |= float_flag_divbyzero;
4068         return float32_set_sign(float32_infinity, sign);
4069     }
4070 
4071     /* frec7(sNaN) = canonical NaN */
4072     if (float32_is_signaling_nan(f, s)) {
4073         s->float_exception_flags |= float_flag_invalid;
4074         return float32_default_nan(s);
4075     }
4076 
4077     /* frec7(qNaN) = canonical NaN */
4078     if (float32_is_quiet_nan(f, s)) {
4079         return float32_default_nan(s);
4080     }
4081 
4082     /* +-normal, +-subnormal */
4083     uint64_t val = frec7(f, exp_size, frac_size, s);
4084     return make_float32(val);
4085 }
4086 
4087 static float64 frec7_d(float64 f, float_status *s)
4088 {
4089     int exp_size = 11, frac_size = 52;
4090     bool sign = float64_is_neg(f);
4091 
4092     /* frec7(+-inf) = +-0 */
4093     if (float64_is_infinity(f)) {
4094         return float64_set_sign(float64_zero, sign);
4095     }
4096 
4097     /* frec7(+-0) = +-inf */
4098     if (float64_is_zero(f)) {
4099         s->float_exception_flags |= float_flag_divbyzero;
4100         return float64_set_sign(float64_infinity, sign);
4101     }
4102 
4103     /* frec7(sNaN) = canonical NaN */
4104     if (float64_is_signaling_nan(f, s)) {
4105         s->float_exception_flags |= float_flag_invalid;
4106         return float64_default_nan(s);
4107     }
4108 
4109     /* frec7(qNaN) = canonical NaN */
4110     if (float64_is_quiet_nan(f, s)) {
4111         return float64_default_nan(s);
4112     }
4113 
4114     /* +-normal, +-subnormal */
4115     uint64_t val = frec7(f, exp_size, frac_size, s);
4116     return make_float64(val);
4117 }
4118 
4119 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4120 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4121 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4122 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4123 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4124 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4125 
4126 /* Vector Floating-Point MIN/MAX Instructions */
4127 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4128 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4129 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4130 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4131 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4132 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4133 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4134 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4135 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4136 GEN_VEXT_VF(vfmin_vf_h, 2)
4137 GEN_VEXT_VF(vfmin_vf_w, 4)
4138 GEN_VEXT_VF(vfmin_vf_d, 8)
4139 
4140 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4141 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4142 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4143 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4144 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4145 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4146 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4147 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4148 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4149 GEN_VEXT_VF(vfmax_vf_h, 2)
4150 GEN_VEXT_VF(vfmax_vf_w, 4)
4151 GEN_VEXT_VF(vfmax_vf_d, 8)
4152 
4153 /* Vector Floating-Point Sign-Injection Instructions */
4154 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4155 {
4156     return deposit64(b, 0, 15, a);
4157 }
4158 
4159 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4160 {
4161     return deposit64(b, 0, 31, a);
4162 }
4163 
4164 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4165 {
4166     return deposit64(b, 0, 63, a);
4167 }
4168 
4169 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4170 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4171 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4172 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4173 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4174 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4175 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4176 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4177 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4178 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4179 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4180 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4181 
4182 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4183 {
4184     return deposit64(~b, 0, 15, a);
4185 }
4186 
4187 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4188 {
4189     return deposit64(~b, 0, 31, a);
4190 }
4191 
4192 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4193 {
4194     return deposit64(~b, 0, 63, a);
4195 }
4196 
4197 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4198 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4199 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4200 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4201 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4202 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4203 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4204 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4205 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4206 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4207 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4208 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4209 
4210 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4211 {
4212     return deposit64(b ^ a, 0, 15, a);
4213 }
4214 
4215 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4216 {
4217     return deposit64(b ^ a, 0, 31, a);
4218 }
4219 
4220 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4221 {
4222     return deposit64(b ^ a, 0, 63, a);
4223 }
4224 
4225 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4226 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4227 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4228 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4229 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4230 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4231 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4232 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4233 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4234 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4235 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4236 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
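
/*
 * With vs1 == vs2 these give the spec's vfneg.v (vfsgnjn) and vfabs.v
 * (vfsgnjx) pseudo-instructions.  A minimal scalar sketch, purely
 * illustrative and not used by the helpers above: fsgnjx32(x, x) XORs
 * the two identical sign bits to 0, e.g. 0xc0400000 (-3.0f) becomes
 * 0x40400000 (3.0f).
 */
static inline uint32_t fsgnjx32_abs_example(uint32_t a)
{
    /* float_status is ignored by fsgnjx32, so NULL is fine here. */
    return fsgnjx32(a, a, NULL);
}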
4237 
4238 /* Vector Floating-Point Compare Instructions */
4239 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4240 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4241                   CPURISCVState *env, uint32_t desc)          \
4242 {                                                             \
4243     uint32_t vm = vext_vm(desc);                              \
4244     uint32_t vl = env->vl;                                    \
4245     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4246     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4247     uint32_t vma = vext_vma(desc);                            \
4248     uint32_t i;                                               \
4249                                                               \
4250     VSTART_CHECK_EARLY_EXIT(env);                             \
4251                                                               \
4252     for (i = env->vstart; i < vl; i++) {                      \
4253         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4254         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4255         if (!vm && !vext_elem_mask(v0, i)) {                  \
4256             /* set masked-off elements to 1s */               \
4257             if (vma) {                                        \
4258                 vext_set_elem_mask(vd, i, 1);                 \
4259             }                                                 \
4260             continue;                                         \
4261         }                                                     \
4262         vext_set_elem_mask(vd, i,                             \
4263                            DO_OP(s2, s1, &env->fp_status));   \
4264     }                                                         \
4265     env->vstart = 0;                                          \
4266     /*
4267      * mask destination register is always tail-agnostic
4268      * set tail elements to 1s
4269      */                                                       \
4270     if (vta_all_1s) {                                         \
4271         for (; i < total_elems; i++) {                        \
4272             vext_set_elem_mask(vd, i, 1);                     \
4273         }                                                     \
4274     }                                                         \
4275 }
4276 
4277 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4278 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4279 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4280 
4281 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4282 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4283                   CPURISCVState *env, uint32_t desc)                \
4284 {                                                                   \
4285     uint32_t vm = vext_vm(desc);                                    \
4286     uint32_t vl = env->vl;                                          \
4287     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4288     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4289     uint32_t vma = vext_vma(desc);                                  \
4290     uint32_t i;                                                     \
4291                                                                     \
4292     VSTART_CHECK_EARLY_EXIT(env);                                   \
4293                                                                     \
4294     for (i = env->vstart; i < vl; i++) {                            \
4295         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4296         if (!vm && !vext_elem_mask(v0, i)) {                        \
4297             /* set masked-off elements to 1s */                     \
4298             if (vma) {                                              \
4299                 vext_set_elem_mask(vd, i, 1);                       \
4300             }                                                       \
4301             continue;                                               \
4302         }                                                           \
4303         vext_set_elem_mask(vd, i,                                   \
4304                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4305     }                                                               \
4306     env->vstart = 0;                                                \
4307     /*
4308      * mask destination register is always tail-agnostic
4309      * set tail elements to 1s
4310      */                                                             \
4311     if (vta_all_1s) {                                               \
4312         for (; i < total_elems; i++) {                              \
4313             vext_set_elem_mask(vd, i, 1);                           \
4314         }                                                           \
4315     }                                                               \
4316 }
4317 
4318 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4319 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4320 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4321 
4322 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4323 {
4324     FloatRelation compare = float16_compare_quiet(a, b, s);
4325     return compare != float_relation_equal;
4326 }
4327 
4328 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4329 {
4330     FloatRelation compare = float32_compare_quiet(a, b, s);
4331     return compare != float_relation_equal;
4332 }
4333 
4334 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4335 {
4336     FloatRelation compare = float64_compare_quiet(a, b, s);
4337     return compare != float_relation_equal;
4338 }
4339 
4340 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4341 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4342 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4343 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4344 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4345 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4346 
4347 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4348 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4349 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4350 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4351 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4352 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4353 
4354 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4355 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4356 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4357 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4358 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4359 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4360 
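/*
 * vmfgt and vmfge are provided only in .vf form; for vector-vector
 * comparisons the spec expects the operands to be swapped and
 * vmflt/vmfle used instead.  Like vmflt/vmfle these use the signaling
 * compare, so comparing with a quiet NaN raises the invalid flag.
 */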
4361 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4362 {
4363     FloatRelation compare = float16_compare(a, b, s);
4364     return compare == float_relation_greater;
4365 }
4366 
4367 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4368 {
4369     FloatRelation compare = float32_compare(a, b, s);
4370     return compare == float_relation_greater;
4371 }
4372 
4373 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4374 {
4375     FloatRelation compare = float64_compare(a, b, s);
4376     return compare == float_relation_greater;
4377 }
4378 
4379 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4380 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4381 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4382 
4383 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4384 {
4385     FloatRelation compare = float16_compare(a, b, s);
4386     return compare == float_relation_greater ||
4387            compare == float_relation_equal;
4388 }
4389 
4390 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4391 {
4392     FloatRelation compare = float32_compare(a, b, s);
4393     return compare == float_relation_greater ||
4394            compare == float_relation_equal;
4395 }
4396 
4397 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4398 {
4399     FloatRelation compare = float64_compare(a, b, s);
4400     return compare == float_relation_greater ||
4401            compare == float_relation_equal;
4402 }
4403 
4404 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4405 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4406 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4407 
4408 /* Vector Floating-Point Classify Instruction */
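/*
 * The classification result is one-hot over the ten classes defined
 * for the scalar fclass instructions:
 *   bit 0: -inf          bit 5: +subnormal
 *   bit 1: -normal       bit 6: +normal
 *   bit 2: -subnormal    bit 7: +inf
 *   bit 3: -0            bit 8: signaling NaN
 *   bit 4: +0            bit 9: quiet NaN
 */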
4409 target_ulong fclass_h(uint64_t frs1)
4410 {
4411     float16 f = frs1;
4412     bool sign = float16_is_neg(f);
4413 
4414     if (float16_is_infinity(f)) {
4415         return sign ? 1 << 0 : 1 << 7;
4416     } else if (float16_is_zero(f)) {
4417         return sign ? 1 << 3 : 1 << 4;
4418     } else if (float16_is_zero_or_denormal(f)) {
4419         return sign ? 1 << 2 : 1 << 5;
4420     } else if (float16_is_any_nan(f)) {
4421         float_status s = { }; /* for snan_bit_is_one */
4422         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4423     } else {
4424         return sign ? 1 << 1 : 1 << 6;
4425     }
4426 }
4427 
4428 target_ulong fclass_s(uint64_t frs1)
4429 {
4430     float32 f = frs1;
4431     bool sign = float32_is_neg(f);
4432 
4433     if (float32_is_infinity(f)) {
4434         return sign ? 1 << 0 : 1 << 7;
4435     } else if (float32_is_zero(f)) {
4436         return sign ? 1 << 3 : 1 << 4;
4437     } else if (float32_is_zero_or_denormal(f)) {
4438         return sign ? 1 << 2 : 1 << 5;
4439     } else if (float32_is_any_nan(f)) {
4440         float_status s = { }; /* for snan_bit_is_one */
4441         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4442     } else {
4443         return sign ? 1 << 1 : 1 << 6;
4444     }
4445 }
4446 
4447 target_ulong fclass_d(uint64_t frs1)
4448 {
4449     float64 f = frs1;
4450     bool sign = float64_is_neg(f);
4451 
4452     if (float64_is_infinity(f)) {
4453         return sign ? 1 << 0 : 1 << 7;
4454     } else if (float64_is_zero(f)) {
4455         return sign ? 1 << 3 : 1 << 4;
4456     } else if (float64_is_zero_or_denormal(f)) {
4457         return sign ? 1 << 2 : 1 << 5;
4458     } else if (float64_is_any_nan(f)) {
4459         float_status s = { }; /* for snan_bit_is_one */
4460         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4461     } else {
4462         return sign ? 1 << 1 : 1 << 6;
4463     }
4464 }
4465 
4466 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4467 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4468 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4469 GEN_VEXT_V(vfclass_v_h, 2)
4470 GEN_VEXT_V(vfclass_v_w, 4)
4471 GEN_VEXT_V(vfclass_v_d, 8)
4472 
4473 /* Vector Floating-Point Merge Instruction */
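/*
 * vfmerge.vfm vd, vs2, rs1, v0: vd[i] = v0.mask[i] ? f[rs1] : vs2[i].
 * No FP arithmetic is performed, so fp_status is never touched and no
 * exception flags can be raised by this helper.
 */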
4474 
4475 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4476 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4477                   CPURISCVState *env, uint32_t desc)          \
4478 {                                                             \
4479     uint32_t vm = vext_vm(desc);                              \
4480     uint32_t vl = env->vl;                                    \
4481     uint32_t esz = sizeof(ETYPE);                             \
4482     uint32_t total_elems =                                    \
4483         vext_get_total_elems(env, desc, esz);                 \
4484     uint32_t vta = vext_vta(desc);                            \
4485     uint32_t i;                                               \
4486                                                               \
4487     VSTART_CHECK_EARLY_EXIT(env);                             \
4488                                                               \
4489     for (i = env->vstart; i < vl; i++) {                      \
4490         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4491         *((ETYPE *)vd + H(i)) =                               \
4492             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4493     }                                                         \
4494     env->vstart = 0;                                          \
4495     /* set tail elements to 1s */                             \
4496     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4497 }
4498 
4499 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4500 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4501 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4502 
4503 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4504 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4505 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4506 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4507 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4508 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4509 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4510 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4511 
4512 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4513 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4514 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4515 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4516 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4517 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4518 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4519 
4520 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4521 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4522 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4523 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4524 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4525 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4526 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4527 
4528 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4529 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4530 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4531 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4532 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4533 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4534 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4535 
4536 /* Widening Floating-Point/Integer Type-Convert Instructions */
4537 /* (TD, T2, TX2) */
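/*
 * TD: destination element type, T2: source element type as stored in
 * the register, TX2: type the source value is converted to before the
 * conversion function is applied.
 */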
4538 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4539 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4540 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4541 /*
4542  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4543  */
4544 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4545 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4546 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4547 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4548 
4549 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4550 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4551 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4552 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4553 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4554 
4555 /*
4556  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4557  */
4558 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4559 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4560 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4561 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4562 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4563 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4564 
4565 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4566 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4567 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4568 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4569 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4570 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4571 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4572 
4573 /*
4574  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4575  */
4576 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4577 {
4578     return float16_to_float32(a, true, s);
4579 }
4580 
4581 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4582 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4583 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4584 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4585 
4586 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4587 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4588 
4589 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4590 /* (TD, T2, TX2) */
4591 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4592 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4593 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4594 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4595 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4596 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4597 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4598 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4599 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4600 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4601 
4602 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4603 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4604 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4605 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4606 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4607 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4608 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4609 
4610 /*
4611  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4612  */
4613 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4614 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4615 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4616 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4617 
4618 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4619 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4620 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4621 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4622 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4623 
4624 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4625 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4626 {
4627     return float32_to_float16(a, true, s);
4628 }
4629 
4630 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4631 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4632 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4633 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4634 
4635 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4636 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4637 
4638 /*
4639  * Vector Reduction Operations
4640  */
4641 /* Vector Single-Width Integer Reduction Instructions */
4642 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4643 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4644                   void *vs2, CPURISCVState *env,          \
4645                   uint32_t desc)                          \
4646 {                                                         \
4647     uint32_t vm = vext_vm(desc);                          \
4648     uint32_t vl = env->vl;                                \
4649     uint32_t esz = sizeof(TD);                            \
4650     uint32_t vlenb = simd_maxsz(desc);                    \
4651     uint32_t vta = vext_vta(desc);                        \
4652     uint32_t i;                                           \
4653     TD s1 =  *((TD *)vs1 + HD(0));                        \
4654                                                           \
4655     for (i = env->vstart; i < vl; i++) {                  \
4656         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4657         if (!vm && !vext_elem_mask(v0, i)) {              \
4658             continue;                                     \
4659         }                                                 \
4660         s1 = OP(s1, (TD)s2);                              \
4661     }                                                     \
4662     *((TD *)vd + HD(0)) = s1;                             \
4663     env->vstart = 0;                                      \
4664     /* set tail elements to 1s */                         \
4665     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4666 }
4667 
4668 /* vd[0] = sum(vs1[0], vs2[*]) */
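/*
 * e.g. with vl = 4, vs1[0] = 10 and vs2 = {1, 2, 3, 4} all active,
 * vredsum.vs writes vd[0] = 10 + 1 + 2 + 3 + 4 = 20; only element 0
 * of vd is the result, the remaining elements are tail.
 */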
4669 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4670 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4671 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4672 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4673 
4674 /* vd[0] = maxu(vs1[0], vs2[*]) */
4675 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4676 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4677 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4678 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4679 
4680 /* vd[0] = max(vs1[0], vs2[*]) */
4681 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4682 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4683 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4684 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4685 
4686 /* vd[0] = minu(vs1[0], vs2[*]) */
4687 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4688 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4689 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4690 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4691 
4692 /* vd[0] = min(vs1[0], vs2[*]) */
4693 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4694 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4695 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4696 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4697 
4698 /* vd[0] = and(vs1[0], vs2[*]) */
4699 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4700 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4701 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4702 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4703 
4704 /* vd[0] = or(vs1[0], vs2[*]) */
4705 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4706 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4707 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4708 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4709 
4710 /* vd[0] = xor(vs1[0], vs2[*]) */
4711 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4712 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4713 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4714 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4715 
4716 /* Vector Widening Integer Reduction Instructions */
4717 /* Signed sum reduction into double-width accumulator */
4718 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4719 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4720 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4721 
4722 /* Unsigned sum reduction into double-width accumulator */
4723 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4724 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4725 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4726 
4727 /* Vector Single-Width Floating-Point Reduction Instructions */
4728 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4729 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4730                   void *vs2, CPURISCVState *env,           \
4731                   uint32_t desc)                           \
4732 {                                                          \
4733     uint32_t vm = vext_vm(desc);                           \
4734     uint32_t vl = env->vl;                                 \
4735     uint32_t esz = sizeof(TD);                             \
4736     uint32_t vlenb = simd_maxsz(desc);                     \
4737     uint32_t vta = vext_vta(desc);                         \
4738     uint32_t i;                                            \
4739     TD s1 =  *((TD *)vs1 + HD(0));                         \
4740                                                            \
4741     for (i = env->vstart; i < vl; i++) {                   \
4742         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4743         if (!vm && !vext_elem_mask(v0, i)) {               \
4744             continue;                                      \
4745         }                                                  \
4746         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4747     }                                                      \
4748     *((TD *)vd + HD(0)) = s1;                              \
4749     env->vstart = 0;                                       \
4750     /* set tail elements to 1s */                          \
4751     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4752 }
4753 
4754 /* Unordered sum */
4755 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4756 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4757 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4758 
4759 /* Ordered sum */
4760 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4761 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4762 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
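
/*
 * Note that the unordered and ordered sums share the same strictly
 * element-ordered loop above; any order is a legal implementation of
 * vfredusum, which leaves the reduction order unspecified.
 */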
4763 
4764 /* Maximum value */
4765 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4766               float16_maximum_number)
4767 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4768               float32_maximum_number)
4769 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4770               float64_maximum_number)
4771 
4772 /* Minimum value */
4773 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4774               float16_minimum_number)
4775 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4776               float32_minimum_number)
4777 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4778               float64_minimum_number)
4779 
4780 /* Vector Widening Floating-Point Add Instructions */
4781 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4782 {
4783     return float32_add(a, float16_to_float32(b, true, s), s);
4784 }
4785 
4786 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4787 {
4788     return float64_add(a, float32_to_float64(b, s), s);
4789 }
4790 
4791 /* Vector Widening Floating-Point Reduction Instructions */
4792 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4793 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4794 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4795 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4796 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4797 
4798 /*
4799  * Vector Mask Operations
4800  */
4801 /* Vector Mask-Register Logical Instructions */
4802 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4803 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4804                   void *vs2, CPURISCVState *env,          \
4805                   uint32_t desc)                          \
4806 {                                                         \
4807     uint32_t vl = env->vl;                                \
4808     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4809     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4810     uint32_t i;                                           \
4811     int a, b;                                             \
4812                                                           \
4813     VSTART_CHECK_EARLY_EXIT(env);                         \
4814                                                           \
4815     for (i = env->vstart; i < vl; i++) {                  \
4816         a = vext_elem_mask(vs1, i);                       \
4817         b = vext_elem_mask(vs2, i);                       \
4818         vext_set_elem_mask(vd, i, OP(b, a));              \
4819     }                                                     \
4820     env->vstart = 0;                                      \
4821     /*
4822      * mask destination register is always tail-agnostic
4823      * set tail elements to 1s
4824      */                                                   \
4825     if (vta_all_1s) {                                     \
4826         for (; i < total_elems; i++) {                    \
4827             vext_set_elem_mask(vd, i, 1);                 \
4828         }                                                 \
4829     }                                                     \
4830 }
4831 
4832 #define DO_NAND(N, M)  (!(N & M))
4833 #define DO_ANDNOT(N, M)  (N & !M)
4834 #define DO_NOR(N, M)  (!(N | M))
4835 #define DO_ORNOT(N, M)  (N | !M)
4836 #define DO_XNOR(N, M)  (!(N ^ M))
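
/*
 * The operands are the 0/1 values returned by vext_elem_mask(), so the
 * negated forms use logical '!' rather than bitwise '~' and still
 * produce a single mask bit.
 */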
4837 
4838 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4839 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4840 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4841 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4842 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4843 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4844 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4845 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4846 
4847 /* Vector count population in mask vcpop */
4848 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4849                              uint32_t desc)
4850 {
4851     target_ulong cnt = 0;
4852     uint32_t vm = vext_vm(desc);
4853     uint32_t vl = env->vl;
4854     int i;
4855 
4856     for (i = env->vstart; i < vl; i++) {
4857         if (vm || vext_elem_mask(v0, i)) {
4858             if (vext_elem_mask(vs2, i)) {
4859                 cnt++;
4860             }
4861         }
4862     }
4863     env->vstart = 0;
4864     return cnt;
4865 }
4866 
4867 /* vfirst find-first-set mask bit */
4868 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4869                               uint32_t desc)
4870 {
4871     uint32_t vm = vext_vm(desc);
4872     uint32_t vl = env->vl;
4873     int i;
4874 
4875     for (i = env->vstart; i < vl; i++) {
4876         if (vm || vext_elem_mask(v0, i)) {
4877             if (vext_elem_mask(vs2, i)) {
4878                 return i;
4879             }
4880         }
4881     }
4882     env->vstart = 0;
4883     return -1LL;
4884 }
4885 
4886 enum set_mask_type {
4887     ONLY_FIRST = 1,
4888     INCLUDE_FIRST,
4889     BEFORE_FIRST,
4890 };
4891 
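/*
 * e.g. for an active mask source vs2 = ...00010100 (first set bit at
 * element 2):
 *   vmsbf (BEFORE_FIRST)  -> ...00000011
 *   vmsif (INCLUDE_FIRST) -> ...00000111
 *   vmsof (ONLY_FIRST)    -> ...00000100
 */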
4892 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4893                    uint32_t desc, enum set_mask_type type)
4894 {
4895     uint32_t vm = vext_vm(desc);
4896     uint32_t vl = env->vl;
4897     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4898     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4899     uint32_t vma = vext_vma(desc);
4900     int i;
4901     bool first_mask_bit = false;
4902 
4903     for (i = env->vstart; i < vl; i++) {
4904         if (!vm && !vext_elem_mask(v0, i)) {
4905             /* set masked-off elements to 1s */
4906             if (vma) {
4907                 vext_set_elem_mask(vd, i, 1);
4908             }
4909             continue;
4910         }
4911         /* write a zero to all following active elements */
4912         if (first_mask_bit) {
4913             vext_set_elem_mask(vd, i, 0);
4914             continue;
4915         }
4916         if (vext_elem_mask(vs2, i)) {
4917             first_mask_bit = true;
4918             if (type == BEFORE_FIRST) {
4919                 vext_set_elem_mask(vd, i, 0);
4920             } else {
4921                 vext_set_elem_mask(vd, i, 1);
4922             }
4923         } else {
4924             if (type == ONLY_FIRST) {
4925                 vext_set_elem_mask(vd, i, 0);
4926             } else {
4927                 vext_set_elem_mask(vd, i, 1);
4928             }
4929         }
4930     }
4931     env->vstart = 0;
4932     /*
4933      * mask destination register is always tail-agnostic
4934      * set tail elements to 1s
4935      */
4936     if (vta_all_1s) {
4937         for (; i < total_elems; i++) {
4938             vext_set_elem_mask(vd, i, 1);
4939         }
4940     }
4941 }
4942 
4943 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4944                      uint32_t desc)
4945 {
4946     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4947 }
4948 
4949 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4950                      uint32_t desc)
4951 {
4952     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4953 }
4954 
4955 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4956                      uint32_t desc)
4957 {
4958     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4959 }
4960 
4961 /* Vector Iota Instruction */
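/*
 * vd[i] receives the number of mask bits set in vs2[0..i-1]; e.g. for
 * vs2 = {1, 0, 1, 1, 0} with all elements active the result is
 * vd = {0, 1, 1, 2, 3}.
 */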
4962 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4963 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4964                   uint32_t desc)                                          \
4965 {                                                                         \
4966     uint32_t vm = vext_vm(desc);                                          \
4967     uint32_t vl = env->vl;                                                \
4968     uint32_t esz = sizeof(ETYPE);                                         \
4969     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4970     uint32_t vta = vext_vta(desc);                                        \
4971     uint32_t vma = vext_vma(desc);                                        \
4972     uint32_t sum = 0;                                                     \
4973     int i;                                                                \
4974                                                                           \
4975     for (i = env->vstart; i < vl; i++) {                                  \
4976         if (!vm && !vext_elem_mask(v0, i)) {                              \
4977             /* set masked-off elements to 1s */                           \
4978             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4979             continue;                                                     \
4980         }                                                                 \
4981         *((ETYPE *)vd + H(i)) = sum;                                      \
4982         if (vext_elem_mask(vs2, i)) {                                     \
4983             sum++;                                                        \
4984         }                                                                 \
4985     }                                                                     \
4986     env->vstart = 0;                                                      \
4987     /* set tail elements to 1s */                                         \
4988     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4989 }
4990 
4991 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4992 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4993 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4994 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4995 
4996 /* Vector Element Index Instruction */
4997 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4998 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4999 {                                                                         \
5000     uint32_t vm = vext_vm(desc);                                          \
5001     uint32_t vl = env->vl;                                                \
5002     uint32_t esz = sizeof(ETYPE);                                         \
5003     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5004     uint32_t vta = vext_vta(desc);                                        \
5005     uint32_t vma = vext_vma(desc);                                        \
5006     int i;                                                                \
5007                                                                           \
5008     VSTART_CHECK_EARLY_EXIT(env);                                         \
5009                                                                           \
5010     for (i = env->vstart; i < vl; i++) {                                  \
5011         if (!vm && !vext_elem_mask(v0, i)) {                              \
5012             /* set masked-off elements to 1s */                           \
5013             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5014             continue;                                                     \
5015         }                                                                 \
5016         *((ETYPE *)vd + H(i)) = i;                                        \
5017     }                                                                     \
5018     env->vstart = 0;                                                      \
5019     /* set tail elements to 1s */                                         \
5020     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5021 }
5022 
5023 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5024 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5025 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5026 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5027 
5028 /*
5029  * Vector Permutation Instructions
5030  */
5031 
5032 /* Vector Slide Instructions */
5033 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5034 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5035                   CPURISCVState *env, uint32_t desc)                      \
5036 {                                                                         \
5037     uint32_t vm = vext_vm(desc);                                          \
5038     uint32_t vl = env->vl;                                                \
5039     uint32_t esz = sizeof(ETYPE);                                         \
5040     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5041     uint32_t vta = vext_vta(desc);                                        \
5042     uint32_t vma = vext_vma(desc);                                        \
5043     target_ulong offset = s1, i_min, i;                                   \
5044                                                                           \
5045     VSTART_CHECK_EARLY_EXIT(env);                                         \
5046                                                                           \
5047     i_min = MAX(env->vstart, offset);                                     \
5048     for (i = i_min; i < vl; i++) {                                        \
5049         if (!vm && !vext_elem_mask(v0, i)) {                              \
5050             /* set masked-off elements to 1s */                           \
5051             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5052             continue;                                                     \
5053         }                                                                 \
5054         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5055     }                                                                     \
5056     env->vstart = 0;                                                      \
5057     /* set tail elements to 1s */                                         \
5058     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5059 }
5060 
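/*
 * Destination elements with index below MAX(vstart, OFFSET) are not
 * written by the helper above, so they keep their previous values.
 */
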
5061 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5062 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5063 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5064 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5065 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5066 
5067 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5068 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5069                   CPURISCVState *env, uint32_t desc)                      \
5070 {                                                                         \
5071     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5072     uint32_t vm = vext_vm(desc);                                          \
5073     uint32_t vl = env->vl;                                                \
5074     uint32_t esz = sizeof(ETYPE);                                         \
5075     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5076     uint32_t vta = vext_vta(desc);                                        \
5077     uint32_t vma = vext_vma(desc);                                        \
5078     target_ulong i_max, i_min, i;                                         \
5079                                                                           \
5080     VSTART_CHECK_EARLY_EXIT(env);                                         \
5081                                                                           \
5082     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5083     i_max = MAX(i_min, env->vstart);                                      \
5084     for (i = env->vstart; i < i_max; ++i) {                               \
5085         if (!vm && !vext_elem_mask(v0, i)) {                              \
5086             /* set masked-off elements to 1s */                           \
5087             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5088             continue;                                                     \
5089         }                                                                 \
5090         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5091     }                                                                     \
5092                                                                           \
5093     for (i = i_max; i < vl; ++i) {                                        \
5094         if (vm || vext_elem_mask(v0, i)) {                                \
5095             *((ETYPE *)vd + H(i)) = 0;                                    \
5096         }                                                                 \
5097     }                                                                     \
5098                                                                           \
5099     env->vstart = 0;                                                      \
5100     /* set tail elements to 1s */                                         \
5101     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5102 }
5103 
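/*
 * In the macro above, active elements whose source index i + s1 would
 * reach or exceed VLMAX fall into the second loop and are written with
 * zero, matching the spec's behaviour for slides past the end of the
 * source register group.
 */
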
5104 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5105 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5106 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5107 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5108 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5109 
5110 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5111 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5112                                  void *vs2, CPURISCVState *env,             \
5113                                  uint32_t desc)                             \
5114 {                                                                           \
5115     typedef uint##BITWIDTH##_t ETYPE;                                       \
5116     uint32_t vm = vext_vm(desc);                                            \
5117     uint32_t vl = env->vl;                                                  \
5118     uint32_t esz = sizeof(ETYPE);                                           \
5119     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5120     uint32_t vta = vext_vta(desc);                                          \
5121     uint32_t vma = vext_vma(desc);                                          \
5122     uint32_t i;                                                             \
5123                                                                             \
5124     VSTART_CHECK_EARLY_EXIT(env);                                           \
5125                                                                             \
5126     for (i = env->vstart; i < vl; i++) {                                    \
5127         if (!vm && !vext_elem_mask(v0, i)) {                                \
5128             /* set masked-off elements to 1s */                             \
5129             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5130             continue;                                                       \
5131         }                                                                   \
5132         if (i == 0) {                                                       \
5133             *((ETYPE *)vd + H(i)) = s1;                                     \
5134         } else {                                                            \
5135             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5136         }                                                                   \
5137     }                                                                       \
5138     env->vstart = 0;                                                        \
5139     /* set tail elements to 1s */                                           \
5140     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5141 }
5142 
5143 GEN_VEXT_VSLIDE1UP(8,  H1)
5144 GEN_VEXT_VSLIDE1UP(16, H2)
5145 GEN_VEXT_VSLIDE1UP(32, H4)
5146 GEN_VEXT_VSLIDE1UP(64, H8)
5147 
5148 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5149 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5150                   CPURISCVState *env, uint32_t desc)              \
5151 {                                                                 \
5152     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5153 }
5154 
5155 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5156 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5157 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5158 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5159 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
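/*
 * For illustration, with vl = 4 and all elements active:
 * vd = { x[rs1], vs2[0], vs2[1], vs2[2] }.
 */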
5160 
5161 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5162 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5163                                    void *vs2, CPURISCVState *env,             \
5164                                    uint32_t desc)                             \
5165 {                                                                             \
5166     typedef uint##BITWIDTH##_t ETYPE;                                         \
5167     uint32_t vm = vext_vm(desc);                                              \
5168     uint32_t vl = env->vl;                                                    \
5169     uint32_t esz = sizeof(ETYPE);                                             \
5170     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5171     uint32_t vta = vext_vta(desc);                                            \
5172     uint32_t vma = vext_vma(desc);                                            \
5173     uint32_t i;                                                               \
5174                                                                               \
5175     VSTART_CHECK_EARLY_EXIT(env);                                             \
5176                                                                               \
5177     for (i = env->vstart; i < vl; i++) {                                      \
5178         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5179             /* set masked-off elements to 1s */                               \
5180             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5181             continue;                                                         \
5182         }                                                                     \
5183         if (i == vl - 1) {                                                    \
5184             *((ETYPE *)vd + H(i)) = s1;                                       \
5185         } else {                                                              \
5186             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5187         }                                                                     \
5188     }                                                                         \
5189     env->vstart = 0;                                                          \
5190     /* set tail elements to 1s */                                             \
5191     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5192 }
5193 
5194 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5195 GEN_VEXT_VSLIDE1DOWN(16, H2)
5196 GEN_VEXT_VSLIDE1DOWN(32, H4)
5197 GEN_VEXT_VSLIDE1DOWN(64, H8)
5198 
5199 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5200 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5201                   CPURISCVState *env, uint32_t desc)              \
5202 {                                                                 \
5203     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5204 }
5205 
5206 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5207 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5208 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5209 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5210 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
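/*
 * For illustration, with vl = 4 and all elements active:
 * vd = { vs2[1], vs2[2], vs2[3], x[rs1] }.
 */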
5211 
5212 /* Vector Floating-Point Slide Instructions */
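/*
 * These helpers forward to the integer slide1up/slide1down bodies above:
 * the scalar operand arrives as a raw 64-bit bit pattern and the slide
 * only moves element bits, so no softfloat arithmetic is needed.
 */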
5213 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5214 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5215                   CPURISCVState *env, uint32_t desc)          \
5216 {                                                             \
5217     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5218 }
5219 
5220 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5221 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5222 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5223 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5224 
5225 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5226 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5227                   CPURISCVState *env, uint32_t desc)          \
5228 {                                                             \
5229     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5230 }
5231 
5232 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5233 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5234 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5235 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5236 
5237 /* Vector Register Gather Instructions */
5238 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5239 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5240                   CPURISCVState *env, uint32_t desc)                      \
5241 {                                                                         \
5242     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5243     uint32_t vm = vext_vm(desc);                                          \
5244     uint32_t vl = env->vl;                                                \
5245     uint32_t esz = sizeof(TS2);                                           \
5246     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5247     uint32_t vta = vext_vta(desc);                                        \
5248     uint32_t vma = vext_vma(desc);                                        \
5249     uint64_t index;                                                       \
5250     uint32_t i;                                                           \
5251                                                                           \
5252     VSTART_CHECK_EARLY_EXIT(env);                                         \
5253                                                                           \
5254     for (i = env->vstart; i < vl; i++) {                                  \
5255         if (!vm && !vext_elem_mask(v0, i)) {                              \
5256             /* set masked-off elements to 1s */                           \
5257             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5258             continue;                                                     \
5259         }                                                                 \
5260         index = *((TS1 *)vs1 + HS1(i));                                   \
5261         if (index >= vlmax) {                                             \
5262             *((TS2 *)vd + HS2(i)) = 0;                                    \
5263         } else {                                                          \
5264             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5265         }                                                                 \
5266     }                                                                     \
5267     env->vstart = 0;                                                      \
5268     /* set tail elements to 1s */                                         \
5269     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5270 }
5271 
5272 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5273 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5274 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5275 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5276 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5277 
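/*
 * vrgatherei16 always reads its indices as 16-bit elements (TS1/HS1),
 * independent of the data element width (TS2/HS2), e.g. 16-bit indices
 * selecting 64-bit data elements in the _d variant below.
 */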
5278 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5279 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5280 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5281 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5282 
5283 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5284 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5285                   CPURISCVState *env, uint32_t desc)                      \
5286 {                                                                         \
5287     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5288     uint32_t vm = vext_vm(desc);                                          \
5289     uint32_t vl = env->vl;                                                \
5290     uint32_t esz = sizeof(ETYPE);                                         \
5291     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5292     uint32_t vta = vext_vta(desc);                                        \
5293     uint32_t vma = vext_vma(desc);                                        \
5294     uint64_t index = s1;                                                  \
5295     uint32_t i;                                                           \
5296                                                                           \
5297     VSTART_CHECK_EARLY_EXIT(env);                                         \
5298                                                                           \
5299     for (i = env->vstart; i < vl; i++) {                                  \
5300         if (!vm && !vext_elem_mask(v0, i)) {                              \
5301             /* set masked-off elements to 1s */                           \
5302             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5303             continue;                                                     \
5304         }                                                                 \
5305         if (index >= vlmax) {                                             \
5306             *((ETYPE *)vd + H(i)) = 0;                                    \
5307         } else {                                                          \
5308             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5309         }                                                                 \
5310     }                                                                     \
5311     env->vstart = 0;                                                      \
5312     /* set tail elements to 1s */                                         \
5313     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5314 }
5315 
5316 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
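/*
 * Note the index is a single scalar, so every active element of vd
 * receives the same value: vs2[x[rs1]], or 0 when the index is out of
 * range.
 */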
5317 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5318 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5319 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5320 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5321 
5322 /* Vector Compress Instruction */
5323 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5324 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5325                   CPURISCVState *env, uint32_t desc)                      \
5326 {                                                                         \
5327     uint32_t vl = env->vl;                                                \
5328     uint32_t esz = sizeof(ETYPE);                                         \
5329     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5330     uint32_t vta = vext_vta(desc);                                        \
5331     uint32_t num = 0, i;                                                  \
5332                                                                           \
5333     for (i = env->vstart; i < vl; i++) {                                  \
5334         if (!vext_elem_mask(vs1, i)) {                                    \
5335             continue;                                                     \
5336         }                                                                 \
5337         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5338         num++;                                                            \
5339     }                                                                     \
5340     env->vstart = 0;                                                      \
5341     /* set tail elements to 1s */                                         \
5342     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5343 }
5344 
5345 /* Compress the elements of vs2 selected by mask register vs1 into contiguous elements of vd */
5346 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5347 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5348 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5349 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
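/*
 * For illustration, with vl = 4, vs1 mask bits {1, 0, 1, 0} and
 * vs2 = { a, b, c, d }: vd = { a, c, ... }, with elements past the
 * packed count treated as tail (set to all 1s when tail-agnostic).
 */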
5350 
5351 /* Vector Whole Register Move */
5352 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5353 {
5354     /* EEW = SEW */
5355     uint32_t maxsz = simd_maxsz(desc);
5356     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5357     uint32_t startb = env->vstart * sewb;
5358     uint32_t i = startb;
5359 
5360     if (startb >= maxsz) {
5361         env->vstart = 0;
5362         return;
5363     }
5364 
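    /*
     * On big-endian hosts H1() swizzles byte indices within each 64-bit
     * lane, so an unaligned start byte sits in the middle of a lane.
     * Copy the remainder of that first lane here (its host bytes are
     * contiguous starting at H1(j - 1)), then bulk-copy the aligned rest.
     */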
5365     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5366         uint32_t j = ROUND_UP(i, 8);
5367         memcpy((uint8_t *)vd + H1(j - 1),
5368                (uint8_t *)vs2 + H1(j - 1),
5369                j - i);
5370         i = j;
5371     }
5372 
5373     memcpy((uint8_t *)vd + H1(i),
5374            (uint8_t *)vs2 + H1(i),
5375            maxsz - i);
5376 
5377     env->vstart = 0;
5378 }
5379 
5380 /* Vector Integer Extension */
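/*
 * The vf2/vf4/vf8 suffixes below give the ratio between the destination
 * SEW and the source EEW, e.g. vzext_vf4_w zero-extends 8-bit source
 * elements to 32-bit destination elements.
 */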
5381 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5382 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5383                   CPURISCVState *env, uint32_t desc)             \
5384 {                                                                \
5385     uint32_t vl = env->vl;                                       \
5386     uint32_t vm = vext_vm(desc);                                 \
5387     uint32_t esz = sizeof(ETYPE);                                \
5388     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5389     uint32_t vta = vext_vta(desc);                               \
5390     uint32_t vma = vext_vma(desc);                               \
5391     uint32_t i;                                                  \
5392                                                                  \
5393     VSTART_CHECK_EARLY_EXIT(env);                                \
5394                                                                  \
5395     for (i = env->vstart; i < vl; i++) {                         \
5396         if (!vm && !vext_elem_mask(v0, i)) {                     \
5397             /* set masked-off elements to 1s */                  \
5398             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5399             continue;                                            \
5400         }                                                        \
5401         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5402     }                                                            \
5403     env->vstart = 0;                                             \
5404     /* set tail elements to 1s */                                \
5405     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5406 }
5407 
5408 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5409 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5410 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5411 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5412 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5413 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5414 
5415 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5416 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5417 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5418 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5419 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5420 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5421