xref: /qemu/target/riscv/vector_helper.c (revision 22a7c2f239229b2ee9fcbac03cb598d9aebb9196)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "exec/tlb-flags.h"
29 #include "exec/tswap.h"
30 #include "fpu/softfloat.h"
31 #include "tcg/tcg-gvec-desc.h"
32 #include "internals.h"
33 #include "vector_internals.h"
34 #include <math.h>
35 
36 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
37                             target_ulong s2)
38 {
39     int vlmax, vl;
40     RISCVCPU *cpu = env_archcpu(env);
41     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
42     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
43     uint16_t sew = 8 << vsew;
44     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
45     int xlen = riscv_cpu_xlen(env);
46     bool vill = (s2 >> (xlen - 1)) & 0x1;
47     target_ulong reserved = s2 &
48                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
49                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
50     uint16_t vlen = cpu->cfg.vlenb << 3;
51     int8_t lmul;
52 
53     if (vlmul & 4) {
54         /*
55          * Fractional LMUL, check:
56          *
57          * VLEN * LMUL >= SEW
58          * VLEN >> (8 - lmul) >= sew
59          * (vlenb << 3) >> (8 - lmul) >= sew
60          */
61         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
62             vill = true;
63         }
64     }
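    /*
     * Worked example for the fractional LMUL check above (illustration only,
     * assuming VLEN = 128, i.e. vlenb = 16):
     *
     *   vlmul = 7 (LMUL = 1/2), vsew = 3 (SEW = 64):
     *       vlen >> (8 - 7) = 128 >> 1 = 64 >= 64  -> legal
     *   vlmul = 5 (LMUL = 1/8), vsew = 3 (SEW = 64):
     *       vlen >> (8 - 5) = 128 >> 3 = 16  < 64  -> vill is set
     */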
65 
66     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
67         /* only set vill bit. */
68         env->vill = 1;
69         env->vtype = 0;
70         env->vl = 0;
71         env->vstart = 0;
72         return 0;
73     }
74 
75     /* lmul encoded as in DisasContext::lmul */
76     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
77     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
78     if (s1 <= vlmax) {
79         vl = s1;
80     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
81         vl = (s1 + 1) >> 1;
82     } else {
83         vl = vlmax;
84     }
85     env->vl = vl;
86     env->vtype = s2;
87     env->vstart = 0;
88     env->vill = 0;
89     return vl;
90 }
91 
92 /*
93  * Get the maximum number of elements that can be operated on.
94  *
95  * log2_esz: log2 of element size in bytes.
96  */
97 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
98 {
99     /*
100      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
101      * so vlen in bytes (vlenb) is encoded as maxsz.
102      */
103     uint32_t vlenb = simd_maxsz(desc);
104 
105     /* Return VLMAX */
106     int scale = vext_lmul(desc) - log2_esz;
107     return scale < 0 ? vlenb >> -scale : vlenb << scale;
108 }
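/*
 * Worked example for vext_max_elems() (illustration only, assuming
 * vlenb = 16, i.e. VLEN = 128):
 *
 *   SEW = 16 (log2_esz = 1), LMUL = 2   (vext_lmul(desc) = 1):
 *       scale = 1 - 1 = 0,   VLMAX = 16 << 0 = 16  (= VLEN * LMUL / SEW)
 *   SEW = 32 (log2_esz = 2), LMUL = 1/2 (vext_lmul(desc) = -1):
 *       scale = -1 - 2 = -3, VLMAX = 16 >> 3 = 2   (= VLEN * LMUL / SEW)
 */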
109 
110 /*
111  * This function checks watchpoints before a real memory access.
112  *
113  * In system mode, the TLB API probe_access is enough to check watchpoints.
114  * In user mode, there is no watchpoint support for now.
115  *
116  * It will trigger an exception if there is no mapping in the TLB
117  * and the page table walk can't fill the TLB entry. Then the guest
118  * software can return here after processing the exception, or never return.
119  */
120 static void probe_pages(CPURISCVState *env, target_ulong addr,
121                         target_ulong len, uintptr_t ra,
122                         MMUAccessType access_type)
123 {
124     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
125     target_ulong curlen = MIN(pagelen, len);
126     int mmu_index = riscv_env_mmu_index(env, false);
127 
128     probe_access(env, adjust_addr(env, addr), curlen, access_type,
129                  mmu_index, ra);
130     if (len > curlen) {
131         addr += curlen;
132         curlen = len - curlen;
133         probe_access(env, adjust_addr(env, addr), curlen, access_type,
134                      mmu_index, ra);
135     }
136 }
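/*
 * Worked example for the page split in probe_pages() (illustration only,
 * assuming 4 KiB pages): for addr = 0xffa and len = 16,
 * pagelen = -(0xffa | ~0xfff) = 6, so the first probe covers the 6 bytes up
 * to the page boundary and the second probe covers the remaining 10 bytes
 * starting at 0x1000.
 */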
137 
138 static inline void vext_set_elem_mask(void *v0, int index,
139                                       uint8_t value)
140 {
141     int idx = index / 64;
142     int pos = index % 64;
143     uint64_t old = ((uint64_t *)v0)[idx];
144     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
145 }
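/*
 * Illustration only: mask bits are packed into host uint64_t words, so
 * vext_set_elem_mask(v0, 70, 1) sets bit 6 (70 % 64) of word 1 (70 / 64).
 */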
146 
147 /* element operations for load and store */
148 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
149                                    uint32_t idx, void *vd, uintptr_t retaddr);
150 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
151 
152 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
153 static inline QEMU_ALWAYS_INLINE                            \
154 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
155                 uint32_t idx, void *vd, uintptr_t retaddr)  \
156 {                                                           \
157     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
158     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
159 }                                                           \
160                                                             \
161 static inline QEMU_ALWAYS_INLINE                            \
162 void NAME##_host(void *vd, uint32_t idx, void *host)        \
163 {                                                           \
164     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
165     *cur = (ETYPE)LDSUF##_p(host);                          \
166 }
167 
168 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
169 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
170 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
171 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
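/*
 * For reference, GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw) above expands
 * roughly to (illustration only):
 *
 *   static inline void lde_h_tlb(CPURISCVState *env, abi_ptr addr,
 *                                uint32_t idx, void *vd, uintptr_t retaddr)
 *   {
 *       uint16_t *cur = ((uint16_t *)vd + H2(idx));
 *       *cur = cpu_lduw_data_ra(env, addr, retaddr);
 *   }
 *
 *   static inline void lde_h_host(void *vd, uint32_t idx, void *host)
 *   {
 *       uint16_t *cur = ((uint16_t *)vd + H2(idx));
 *       *cur = (uint16_t)lduw_p(host);
 *   }
 */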
172 
173 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
174 static inline QEMU_ALWAYS_INLINE                            \
175 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
176                 uint32_t idx, void *vd, uintptr_t retaddr)  \
177 {                                                           \
178     ETYPE data = *((ETYPE *)vd + H(idx));                   \
179     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
180 }                                                           \
181                                                             \
182 static inline QEMU_ALWAYS_INLINE                            \
183 void NAME##_host(void *vd, uint32_t idx, void *host)        \
184 {                                                           \
185     ETYPE data = *((ETYPE *)vd + H(idx));                   \
186     STSUF##_p(host, data);                                  \
187 }
188 
189 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
190 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
191 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
192 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
193 
194 static inline QEMU_ALWAYS_INLINE void
195 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
196                        void *vd, uint32_t evl, target_ulong addr,
197                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
198                        bool is_load)
199 {
200     uint32_t i;
201     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
202         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
203     }
204 }
205 
206 static inline QEMU_ALWAYS_INLINE void
207 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
208                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
209                         uint32_t esz, bool is_load)
210 {
211 #if HOST_BIG_ENDIAN
212     for (; reg_start < evl; reg_start++, host += esz) {
213         ldst_host(vd, reg_start, host);
214     }
215 #else
216     if (esz == 1) {
217         uint32_t byte_offset = reg_start * esz;
218         uint32_t size = (evl - reg_start) * esz;
219 
220         if (is_load) {
221             memcpy(vd + byte_offset, host, size);
222         } else {
223             memcpy(host, vd + byte_offset, size);
224         }
225     } else {
226         for (; reg_start < evl; reg_start++, host += esz) {
227             ldst_host(vd, reg_start, host);
228         }
229     }
230 #endif
231 }
232 
233 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
234                                    uint32_t desc, uint32_t nf,
235                                    uint32_t esz, uint32_t max_elems)
236 {
237     uint32_t vta = vext_vta(desc);
238     int k;
239 
240     if (vta == 0) {
241         return;
242     }
243 
244     for (k = 0; k < nf; ++k) {
245         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
246                           (k * max_elems + max_elems) * esz);
247     }
248 }
249 
250 /*
251  * stride: access vector elements from strided memory
252  */
253 static void
254 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
255                  CPURISCVState *env, uint32_t desc, uint32_t vm,
256                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
257                  uintptr_t ra)
258 {
259     uint32_t i, k;
260     uint32_t nf = vext_nf(desc);
261     uint32_t max_elems = vext_max_elems(desc, log2_esz);
262     uint32_t esz = 1 << log2_esz;
263     uint32_t vma = vext_vma(desc);
264 
265     VSTART_CHECK_EARLY_EXIT(env, env->vl);
266 
267     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
268         k = 0;
269         while (k < nf) {
270             if (!vm && !vext_elem_mask(v0, i)) {
271                 /* set masked-off elements to 1s */
272                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
273                                   (i + k * max_elems + 1) * esz);
274                 k++;
275                 continue;
276             }
277             target_ulong addr = base + stride * i + (k << log2_esz);
278             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
279             k++;
280         }
281     }
282     env->vstart = 0;
283 
284     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
285 }
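/*
 * Worked example for the address computation above (illustration only):
 * with nf = 2, SEW = 32 (esz = 4) and stride = 64, segment i, field k is
 * accessed at base + 64 * i + 4 * k and corresponds to vector element
 * i + k * max_elems of the register group.
 */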
286 
287 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
288 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
289                   target_ulong stride, CPURISCVState *env,              \
290                   uint32_t desc)                                        \
291 {                                                                       \
292     uint32_t vm = vext_vm(desc);                                        \
293     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
294                      ctzl(sizeof(ETYPE)), GETPC());                     \
295 }
296 
297 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
298 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
299 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
300 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
301 
302 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
303 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
304                   target_ulong stride, CPURISCVState *env,              \
305                   uint32_t desc)                                        \
306 {                                                                       \
307     uint32_t vm = vext_vm(desc);                                        \
308     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
309                      ctzl(sizeof(ETYPE)), GETPC());                     \
310 }
311 
312 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
313 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
314 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
315 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
316 
317 /*
318  * unit-stride: access elements stored contiguously in memory
319  */
320 
321 /* unmasked unit-stride load and store operation */
322 static inline QEMU_ALWAYS_INLINE void
323 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
324                   uint32_t elems, uint32_t nf, uint32_t max_elems,
325                   uint32_t log2_esz, bool is_load, int mmu_index,
326                   vext_ldst_elem_fn_tlb *ldst_tlb,
327                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
328 {
329     void *host;
330     int i, k, flags;
331     uint32_t esz = 1 << log2_esz;
332     uint32_t size = (elems * nf) << log2_esz;
333     uint32_t evl = env->vstart + elems;
334     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
335 
336     /* Check page permission/pmp/watchpoint/etc. */
337     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
338                                mmu_index, true, &host, ra);
339 
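    /*
     * Illustration only: flags == 0 means the whole range is directly
     * addressable through the returned host pointer, so elements are accessed
     * with the fast ldst_host callbacks (or a single memcpy when nf == 1 and
     * the elements are bytes on a little-endian host); any set flag
     * (I/O, watchpoint, ...) forces the per-element TLB path via ldst_tlb.
     */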
340     if (flags == 0) {
341         if (nf == 1) {
342             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
343                                       host, esz, is_load);
344         } else {
345             for (i = env->vstart; i < evl; ++i) {
346                 k = 0;
347                 while (k < nf) {
348                     ldst_host(vd, i + k * max_elems, host);
349                     host += esz;
350                     k++;
351                 }
352             }
353         }
354         env->vstart += elems;
355     } else {
356         if (nf == 1) {
357             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
358                                    ra, esz, is_load);
359         } else {
360             /* load bytes from guest memory */
361             for (i = env->vstart; i < evl; env->vstart = ++i) {
362                 k = 0;
363                 while (k < nf) {
364                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
365                              vd, ra);
366                     addr += esz;
367                     k++;
368                 }
369             }
370         }
371     }
372 }
373 
374 static inline QEMU_ALWAYS_INLINE void
375 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
376              vext_ldst_elem_fn_tlb *ldst_tlb,
377              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
378              uint32_t evl, uintptr_t ra, bool is_load)
379 {
380     uint32_t k;
381     target_ulong page_split, elems, addr;
382     uint32_t nf = vext_nf(desc);
383     uint32_t max_elems = vext_max_elems(desc, log2_esz);
384     uint32_t esz = 1 << log2_esz;
385     uint32_t msize = nf * esz;
386     int mmu_index = riscv_env_mmu_index(env, false);
387 
388     VSTART_CHECK_EARLY_EXIT(env, evl);
389 
390 #if defined(CONFIG_USER_ONLY)
391     /*
392      * For data sizes <= 6 bytes, we get better performance by simply calling
393      * vext_continuous_ldst_tlb.
394      */
395     if (nf == 1 && (evl << log2_esz) <= 6) {
396         addr = base + (env->vstart << log2_esz);
397         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
398                                  esz, is_load);
399 
400         env->vstart = 0;
401         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
402         return;
403     }
404 #endif
405 
406     /* Calculate the page range of first page */
407     addr = base + ((env->vstart * nf) << log2_esz);
408     page_split = -(addr | TARGET_PAGE_MASK);
409     /* Get number of elements */
410     elems = page_split / msize;
411     if (unlikely(env->vstart + elems >= evl)) {
412         elems = evl - env->vstart;
413     }
414 
415     /* Load/store elements in the first page */
416     if (likely(elems)) {
417         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
418                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
419     }
420 
421     /* Load/store elements in the second page */
422     if (unlikely(env->vstart < evl)) {
423         /* Cross page element */
424         if (unlikely(page_split % msize)) {
425             for (k = 0; k < nf; k++) {
426                 addr = base + ((env->vstart * nf + k) << log2_esz);
427                 ldst_tlb(env, adjust_addr(env, addr),
428                         env->vstart + k * max_elems, vd, ra);
429             }
430             env->vstart++;
431         }
432 
433         addr = base + ((env->vstart * nf) << log2_esz);
434         /* Get number of elements of second page */
435         elems = evl - env->vstart;
436 
437         /* Load/store elements in the second page */
438         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
439                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
440     }
441 
442     env->vstart = 0;
443     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
444 }
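/*
 * Worked example for the page split above (illustration only, assuming
 * 4 KiB pages): nf = 1, SEW = 32 (msize = 4), base = 0xff0, vstart = 0,
 * evl = 16. Then addr = 0xff0, page_split = 16 and elems = 4, so elements
 * 0..3 are handled in the first page; page_split % msize == 0, so no element
 * straddles the boundary, and the remaining 12 elements are handled in the
 * second page starting at 0x1000.
 */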
445 
446 /*
447  * masked unit-stride load and store operations are a special case of strided
448  * operations, with stride = NF * sizeof(ETYPE)
449  */
450 
451 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
452 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
453                          CPURISCVState *env, uint32_t desc)         \
454 {                                                                   \
455     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
456     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
457                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
458 }                                                                   \
459                                                                     \
460 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
461                   CPURISCVState *env, uint32_t desc)                \
462 {                                                                   \
463     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
464                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
465 }
466 
467 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
468 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
469 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
470 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
471 
472 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
473 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
474                          CPURISCVState *env, uint32_t desc)              \
475 {                                                                        \
476     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
477     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
478                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
479 }                                                                        \
480                                                                          \
481 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
482                   CPURISCVState *env, uint32_t desc)                     \
483 {                                                                        \
484     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
485                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
486 }
487 
488 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
489 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
490 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
491 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
492 
493 /*
494  * unit stride mask load and store, EEW = 1
495  */
496 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
497                     CPURISCVState *env, uint32_t desc)
498 {
499     /* evl = ceil(vl/8) */
500     uint8_t evl = (env->vl + 7) >> 3;
501     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
502                  0, evl, GETPC(), true);
503 }
504 
505 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
506                     CPURISCVState *env, uint32_t desc)
507 {
508     /* evl = ceil(vl/8) */
509     uint8_t evl = (env->vl + 7) >> 3;
510     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
511                  0, evl, GETPC(), false);
512 }
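/*
 * Illustration only: for vl = 17, evl = (17 + 7) >> 3 = 3, i.e. the mask
 * load/store above transfers 3 bytes, covering mask bits 0..16 plus the
 * unused bits of the last byte.
 */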
513 
514 /*
515  * index: access vector elements from indexed memory
516  */
517 typedef target_ulong vext_get_index_addr(target_ulong base,
518         uint32_t idx, void *vs2);
519 
520 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
521 static target_ulong NAME(target_ulong base,            \
522                          uint32_t idx, void *vs2)      \
523 {                                                      \
524     return (base + *((ETYPE *)vs2 + H(idx)));          \
525 }
526 
527 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
528 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
529 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
530 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
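/*
 * Illustration only: idx_h() above computes addresses from 16-bit offsets,
 * e.g. for the vlxei16_* helpers below the address of element i is
 * base + (the i-th uint16_t element of vs2); the offset EEW is independent
 * of the data EEW.
 */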
531 
532 static inline void
533 vext_ldst_index(void *vd, void *v0, target_ulong base,
534                 void *vs2, CPURISCVState *env, uint32_t desc,
535                 vext_get_index_addr get_index_addr,
536                 vext_ldst_elem_fn_tlb *ldst_elem,
537                 uint32_t log2_esz, uintptr_t ra)
538 {
539     uint32_t i, k;
540     uint32_t nf = vext_nf(desc);
541     uint32_t vm = vext_vm(desc);
542     uint32_t max_elems = vext_max_elems(desc, log2_esz);
543     uint32_t esz = 1 << log2_esz;
544     uint32_t vma = vext_vma(desc);
545 
546     VSTART_CHECK_EARLY_EXIT(env, env->vl);
547 
548     /* load bytes from guest memory */
549     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
550         k = 0;
551         while (k < nf) {
552             if (!vm && !vext_elem_mask(v0, i)) {
553                 /* set masked-off elements to 1s */
554                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
555                                   (i + k * max_elems + 1) * esz);
556                 k++;
557                 continue;
558             }
559             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
560             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
561             k++;
562         }
563     }
564     env->vstart = 0;
565 
566     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
567 }
568 
569 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
570 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
571                   void *vs2, CPURISCVState *env, uint32_t desc)            \
572 {                                                                          \
573     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
574                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
575 }
576 
577 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
578 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
579 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
580 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
581 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
582 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
583 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
584 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
585 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
586 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
587 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
588 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
589 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
590 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
591 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
592 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
593 
594 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
595 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
596                   void *vs2, CPURISCVState *env, uint32_t desc)  \
597 {                                                                \
598     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
599                     STORE_FN, ctzl(sizeof(ETYPE)),               \
600                     GETPC());                                    \
601 }
602 
603 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
604 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
605 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
606 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
607 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
608 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
609 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
610 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
611 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
612 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
613 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
614 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
615 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
616 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
617 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
618 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
619 
620 /*
621  * unit-stride fault-only-first load instructions
622  */
623 static inline void
624 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
625           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
626           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
627 {
628     uint32_t i, k, vl = 0;
629     uint32_t nf = vext_nf(desc);
630     uint32_t vm = vext_vm(desc);
631     uint32_t max_elems = vext_max_elems(desc, log2_esz);
632     uint32_t esz = 1 << log2_esz;
633     uint32_t msize = nf * esz;
634     uint32_t vma = vext_vma(desc);
635     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
636     int mmu_index = riscv_env_mmu_index(env, false);
637     int flags;
638     void *host;
639 
640     VSTART_CHECK_EARLY_EXIT(env, env->vl);
641 
642     addr = base + ((env->vstart * nf) << log2_esz);
643     page_split = -(addr | TARGET_PAGE_MASK);
644     /* Get number of elements */
645     elems = page_split / msize;
646     if (unlikely(env->vstart + elems >= env->vl)) {
647         elems = env->vl - env->vstart;
648     }
649 
650     /* Check page permission/pmp/watchpoint/etc. */
651     flags = probe_access_flags(env, adjust_addr(env, addr), elems * msize,
652                                MMU_DATA_LOAD, mmu_index, true, &host, ra);
653 
654     /* If we are crossing a page, also check the second page. */
655     if (env->vl > elems) {
656         addr_probe = addr + (elems << log2_esz);
657         flags |= probe_access_flags(env, adjust_addr(env, addr_probe),
658                                     elems * msize, MMU_DATA_LOAD, mmu_index,
659                                     true, &host, ra);
660     }
661 
662     if (flags & ~TLB_WATCHPOINT) {
663         /* probe every access */
664         for (i = env->vstart; i < env->vl; i++) {
665             if (!vm && !vext_elem_mask(v0, i)) {
666                 continue;
667             }
668             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
669             if (i == 0) {
670                 /* Allow fault on first element. */
671                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD);
672             } else {
673                 remain = nf << log2_esz;
674                 while (remain > 0) {
675                     offset = -(addr_i | TARGET_PAGE_MASK);
676 
677                     /* Probe nonfault on subsequent elements. */
678                     flags = probe_access_flags(env, addr_i, offset,
679                                                MMU_DATA_LOAD, mmu_index, true,
680                                                &host, 0);
681 
682                     /*
683                      * Stop if invalid (unmapped) or mmio (transaction may
684                      * fail). Do not stop if watchpoint, as the spec says that
685                      * first-fault should continue to access the same
686                      * elements regardless of any watchpoint.
687                      */
688                     if (flags & ~TLB_WATCHPOINT) {
689                         vl = i;
690                         goto ProbeSuccess;
691                     }
692                     if (remain <= offset) {
693                         break;
694                     }
695                     remain -= offset;
696                     addr_i = adjust_addr(env, addr_i + offset);
697                 }
698             }
699         }
700     }
701 ProbeSuccess:
702     /* load bytes from guest memory */
703     if (vl != 0) {
704         env->vl = vl;
705     }
706 
707     if (env->vstart < env->vl) {
708         if (vm) {
709             /* Load/store elements in the first page */
710             if (likely(elems)) {
711                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
712                                   log2_esz, true, mmu_index, ldst_tlb,
713                                   ldst_host, ra);
714             }
715 
716             /* Load/store elements in the second page */
717             if (unlikely(env->vstart < env->vl)) {
718                 /* Cross page element */
719                 if (unlikely(page_split % msize)) {
720                     for (k = 0; k < nf; k++) {
721                         addr = base + ((env->vstart * nf + k) << log2_esz);
722                         ldst_tlb(env, adjust_addr(env, addr),
723                                  env->vstart + k * max_elems, vd, ra);
724                     }
725                     env->vstart++;
726                 }
727 
728                 addr = base + ((env->vstart * nf) << log2_esz);
729                 /* Get number of elements of second page */
730                 elems = env->vl - env->vstart;
731 
732                 /* Load/store elements in the second page */
733                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
734                                   log2_esz, true, mmu_index, ldst_tlb,
735                                   ldst_host, ra);
736             }
737         } else {
738             for (i = env->vstart; i < env->vl; i++) {
739                 k = 0;
740                 while (k < nf) {
741                     if (!vext_elem_mask(v0, i)) {
742                         /* set masked-off elements to 1s */
743                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
744                                           (i + k * max_elems + 1) * esz);
745                         k++;
746                         continue;
747                     }
748                     addr = base + ((i * nf + k) << log2_esz);
749                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
750                              vd, ra);
751                     k++;
752                 }
753             }
754         }
755     }
756     env->vstart = 0;
757 
758     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
759 }
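/*
 * Illustration only: for vle32ff.v with vl = 8, a fault on element 0 is
 * taken as usual, but if element 5 is the first one whose probe fails, vl is
 * reduced to 5 and only elements 0..4 are loaded, as required by the
 * fault-only-first semantics.
 */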
760 
761 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
762 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
763                   CPURISCVState *env, uint32_t desc)            \
764 {                                                               \
765     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
766               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
767 }
768 
769 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
770 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
771 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
772 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
773 
774 #define DO_SWAP(N, M) (M)
775 #define DO_AND(N, M)  (N & M)
776 #define DO_XOR(N, M)  (N ^ M)
777 #define DO_OR(N, M)   (N | M)
778 #define DO_ADD(N, M)  (N + M)
779 
780 /* Signed min/max */
781 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
782 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
783 
784 /*
785  * load and store whole register instructions
786  */
787 static inline QEMU_ALWAYS_INLINE void
788 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
789                 vext_ldst_elem_fn_tlb *ldst_tlb,
790                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
791                 uintptr_t ra, bool is_load)
792 {
793     target_ulong page_split, elems, addr;
794     uint32_t nf = vext_nf(desc);
795     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
796     uint32_t max_elems = vlenb >> log2_esz;
797     uint32_t evl = nf * max_elems;
798     uint32_t esz = 1 << log2_esz;
799     int mmu_index = riscv_env_mmu_index(env, false);
800 
801     /* Calculate the page range of first page */
802     addr = base + (env->vstart << log2_esz);
803     page_split = -(addr | TARGET_PAGE_MASK);
804     /* Get number of elements */
805     elems = page_split / esz;
806     if (unlikely(env->vstart + elems >= evl)) {
807         elems = evl - env->vstart;
808     }
809 
810     /* Load/store elements in the first page */
811     if (likely(elems)) {
812         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
813                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
814     }
815 
816     /* Load/store elements in the second page */
817     if (unlikely(env->vstart < evl)) {
818         /* Cross page element */
819         if (unlikely(page_split % esz)) {
820             addr = base + (env->vstart << log2_esz);
821             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
822             env->vstart++;
823         }
824 
825         addr = base + (env->vstart << log2_esz);
826         /* Get number of elements of second page */
827         elems = evl - env->vstart;
828 
829         /* Load/store elements in the second page */
830         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
831                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
832     }
833 
834     env->vstart = 0;
835 }
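/*
 * Illustration only: for the vl2re32_v helper below with vlenb = 16,
 * max_elems = 16 >> 2 = 4 and nf = 2, so evl = 8 elements (two whole vector
 * registers) are transferred regardless of vl.
 */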
836 
837 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
838 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
839                   uint32_t desc)                                    \
840 {                                                                   \
841     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
842                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
843 }
844 
845 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
846 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
847 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
848 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
849 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
850 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
851 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
852 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
853 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
854 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
855 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
856 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
857 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
858 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
859 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
860 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
861 
862 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
863 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
864                   uint32_t desc)                                        \
865 {                                                                       \
866     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
867                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
868 }
869 
870 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
871 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
872 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
873 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
874 
875 /*
876  * Vector Integer Arithmetic Instructions
877  */
878 
879 /* (TD, T1, T2, TX1, TX2) */
880 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
881 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
882 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
883 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
884 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
885 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
886 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
887 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
888 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
889 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
890 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
891 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
892 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
893 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
894 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
895 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
896 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
897 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
898 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
899 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
900 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
901 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
902 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
903 
904 #define DO_SUB(N, M) (N - M)
905 #define DO_RSUB(N, M) (M - N)
906 
907 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
908 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
909 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
910 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
911 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
912 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
913 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
914 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
915 
916 GEN_VEXT_VV(vadd_vv_b, 1)
917 GEN_VEXT_VV(vadd_vv_h, 2)
918 GEN_VEXT_VV(vadd_vv_w, 4)
919 GEN_VEXT_VV(vadd_vv_d, 8)
920 GEN_VEXT_VV(vsub_vv_b, 1)
921 GEN_VEXT_VV(vsub_vv_h, 2)
922 GEN_VEXT_VV(vsub_vv_w, 4)
923 GEN_VEXT_VV(vsub_vv_d, 8)
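/*
 * For orientation: OPIVV2 (defined in vector_internals.h) generates a
 * per-element worker, so RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1,
 * DO_ADD) above expands to roughly the following (sketch only):
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = DO_ADD(s2, s1);
 *   }
 *
 * which GEN_VEXT_VV() then wraps in a masked, tail-handling loop.
 */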
924 
925 
926 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
927 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
928 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
929 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
930 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
931 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
932 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
933 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
934 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
935 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
936 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
937 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
938 
939 GEN_VEXT_VX(vadd_vx_b, 1)
940 GEN_VEXT_VX(vadd_vx_h, 2)
941 GEN_VEXT_VX(vadd_vx_w, 4)
942 GEN_VEXT_VX(vadd_vx_d, 8)
943 GEN_VEXT_VX(vsub_vx_b, 1)
944 GEN_VEXT_VX(vsub_vx_h, 2)
945 GEN_VEXT_VX(vsub_vx_w, 4)
946 GEN_VEXT_VX(vsub_vx_d, 8)
947 GEN_VEXT_VX(vrsub_vx_b, 1)
948 GEN_VEXT_VX(vrsub_vx_h, 2)
949 GEN_VEXT_VX(vrsub_vx_w, 4)
950 GEN_VEXT_VX(vrsub_vx_d, 8)
951 
952 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
953 {
954     intptr_t oprsz = simd_oprsz(desc);
955     intptr_t i;
956 
957     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
958         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
959     }
960 }
961 
962 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
963 {
964     intptr_t oprsz = simd_oprsz(desc);
965     intptr_t i;
966 
967     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
968         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
969     }
970 }
971 
972 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
973 {
974     intptr_t oprsz = simd_oprsz(desc);
975     intptr_t i;
976 
977     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
978         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
979     }
980 }
981 
982 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
983 {
984     intptr_t oprsz = simd_oprsz(desc);
985     intptr_t i;
986 
987     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
988         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
989     }
990 }
991 
992 /* Vector Widening Integer Add/Subtract */
993 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
994 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
995 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
996 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
997 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
998 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
999 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1000 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1001 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1002 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1003 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1004 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1005 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1006 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1008 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1009 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1011 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1012 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1014 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1015 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1017 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1018 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1019 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1020 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1021 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1022 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1023 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1024 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1025 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1026 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1027 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1028 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1029 GEN_VEXT_VV(vwaddu_vv_b, 2)
1030 GEN_VEXT_VV(vwaddu_vv_h, 4)
1031 GEN_VEXT_VV(vwaddu_vv_w, 8)
1032 GEN_VEXT_VV(vwsubu_vv_b, 2)
1033 GEN_VEXT_VV(vwsubu_vv_h, 4)
1034 GEN_VEXT_VV(vwsubu_vv_w, 8)
1035 GEN_VEXT_VV(vwadd_vv_b, 2)
1036 GEN_VEXT_VV(vwadd_vv_h, 4)
1037 GEN_VEXT_VV(vwadd_vv_w, 8)
1038 GEN_VEXT_VV(vwsub_vv_b, 2)
1039 GEN_VEXT_VV(vwsub_vv_h, 4)
1040 GEN_VEXT_VV(vwsub_vv_w, 8)
1041 GEN_VEXT_VV(vwaddu_wv_b, 2)
1042 GEN_VEXT_VV(vwaddu_wv_h, 4)
1043 GEN_VEXT_VV(vwaddu_wv_w, 8)
1044 GEN_VEXT_VV(vwsubu_wv_b, 2)
1045 GEN_VEXT_VV(vwsubu_wv_h, 4)
1046 GEN_VEXT_VV(vwsubu_wv_w, 8)
1047 GEN_VEXT_VV(vwadd_wv_b, 2)
1048 GEN_VEXT_VV(vwadd_wv_h, 4)
1049 GEN_VEXT_VV(vwadd_wv_w, 8)
1050 GEN_VEXT_VV(vwsub_wv_b, 2)
1051 GEN_VEXT_VV(vwsub_wv_h, 4)
1052 GEN_VEXT_VV(vwsub_wv_w, 8)
1053 
1054 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1055 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1057 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1058 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1060 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1061 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1063 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1064 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1066 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1067 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1068 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1069 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1070 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1071 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1072 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1073 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1074 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1075 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1076 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1077 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1078 GEN_VEXT_VX(vwaddu_vx_b, 2)
1079 GEN_VEXT_VX(vwaddu_vx_h, 4)
1080 GEN_VEXT_VX(vwaddu_vx_w, 8)
1081 GEN_VEXT_VX(vwsubu_vx_b, 2)
1082 GEN_VEXT_VX(vwsubu_vx_h, 4)
1083 GEN_VEXT_VX(vwsubu_vx_w, 8)
1084 GEN_VEXT_VX(vwadd_vx_b, 2)
1085 GEN_VEXT_VX(vwadd_vx_h, 4)
1086 GEN_VEXT_VX(vwadd_vx_w, 8)
1087 GEN_VEXT_VX(vwsub_vx_b, 2)
1088 GEN_VEXT_VX(vwsub_vx_h, 4)
1089 GEN_VEXT_VX(vwsub_vx_w, 8)
1090 GEN_VEXT_VX(vwaddu_wx_b, 2)
1091 GEN_VEXT_VX(vwaddu_wx_h, 4)
1092 GEN_VEXT_VX(vwaddu_wx_w, 8)
1093 GEN_VEXT_VX(vwsubu_wx_b, 2)
1094 GEN_VEXT_VX(vwsubu_wx_h, 4)
1095 GEN_VEXT_VX(vwsubu_wx_w, 8)
1096 GEN_VEXT_VX(vwadd_wx_b, 2)
1097 GEN_VEXT_VX(vwadd_wx_h, 4)
1098 GEN_VEXT_VX(vwadd_wx_w, 8)
1099 GEN_VEXT_VX(vwsub_wx_b, 2)
1100 GEN_VEXT_VX(vwsub_wx_h, 4)
1101 GEN_VEXT_VX(vwsub_wx_w, 8)
1102 
1103 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1104 #define DO_VADC(N, M, C) (N + M + C)
1105 #define DO_VSBC(N, M, C) (N - M - C)
1106 
1107 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1108 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1109                   CPURISCVState *env, uint32_t desc)          \
1110 {                                                             \
1111     uint32_t vl = env->vl;                                    \
1112     uint32_t esz = sizeof(ETYPE);                             \
1113     uint32_t total_elems =                                    \
1114         vext_get_total_elems(env, desc, esz);                 \
1115     uint32_t vta = vext_vta(desc);                            \
1116     uint32_t i;                                               \
1117                                                               \
1118     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1119                                                               \
1120     for (i = env->vstart; i < vl; i++) {                      \
1121         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1122         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1123         ETYPE carry = vext_elem_mask(v0, i);                  \
1124                                                               \
1125         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1126     }                                                         \
1127     env->vstart = 0;                                          \
1128     /* set tail elements to 1s */                             \
1129     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1130 }
1131 
1132 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1133 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1134 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1135 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1136 
1137 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1138 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1139 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1140 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1141 
1142 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1143 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1144                   CPURISCVState *env, uint32_t desc)                     \
1145 {                                                                        \
1146     uint32_t vl = env->vl;                                               \
1147     uint32_t esz = sizeof(ETYPE);                                        \
1148     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1149     uint32_t vta = vext_vta(desc);                                       \
1150     uint32_t i;                                                          \
1151                                                                          \
1152     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1153                                                                          \
1154     for (i = env->vstart; i < vl; i++) {                                 \
1155         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1156         ETYPE carry = vext_elem_mask(v0, i);                             \
1157                                                                          \
1158         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1159     }                                                                    \
1160     env->vstart = 0;                                                     \
1161     /* set tail elements to 1s */                                        \
1162     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1163 }
1164 
1165 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1166 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1167 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1168 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1169 
1170 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1171 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1172 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1173 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1174 
1175 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1176                           (__typeof(N))(N + M) < N)
1177 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
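/*
 * Worked example for DO_MADC above (illustration only, uint8_t operands):
 * N = 200, M = 100, C = 1: (uint8_t)(200 + 100 + 1) = 45 <= 200, so the
 * carry-out is 1; with C = 0, (uint8_t)(200 + 100) = 44 < 200, carry-out 1.
 * For N = 1, M = 2, C = 0: 3 < 1 is false, so there is no carry-out.
 */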
1178 
1179 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1180 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1181                   CPURISCVState *env, uint32_t desc)          \
1182 {                                                             \
1183     uint32_t vl = env->vl;                                    \
1184     uint32_t vm = vext_vm(desc);                              \
1185     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1186     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1187     uint32_t i;                                               \
1188                                                               \
1189     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1190                                                               \
1191     for (i = env->vstart; i < vl; i++) {                      \
1192         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1193         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1194         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1195         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1196     }                                                         \
1197     env->vstart = 0;                                          \
1198     /*
1199      * mask destination register is always tail-agnostic
1200      * set tail elements to 1s
1201      */                                                       \
1202     if (vta_all_1s) {                                         \
1203         for (; i < total_elems; i++) {                        \
1204             vext_set_elem_mask(vd, i, 1);                     \
1205         }                                                     \
1206     }                                                         \
1207 }
1208 
1209 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1210 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1211 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1212 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1213 
1214 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1215 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1216 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1217 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1218 
1219 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1220 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1221                   void *vs2, CPURISCVState *env, uint32_t desc) \
1222 {                                                               \
1223     uint32_t vl = env->vl;                                      \
1224     uint32_t vm = vext_vm(desc);                                \
1225     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1226     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1227     uint32_t i;                                                 \
1228                                                                 \
1229     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1230                                                                 \
1231     for (i = env->vstart; i < vl; i++) {                        \
1232         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1233         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1234         vext_set_elem_mask(vd, i,                               \
1235                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1236     }                                                           \
1237     env->vstart = 0;                                            \
1238     /*
1239      * mask destination register is always tail-agnostic
1240      * set tail elements to 1s
1241      */                                                         \
1242     if (vta_all_1s) {                                           \
1243         for (; i < total_elems; i++) {                          \
1244             vext_set_elem_mask(vd, i, 1);                       \
1245         }                                                       \
1246     }                                                           \
1247 }
1248 
1249 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1250 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1251 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1252 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1253 
1254 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1255 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1256 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1257 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1258 
1259 /* Vector Bitwise Logical Instructions */
1260 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1261 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1262 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1263 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1264 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1265 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1266 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1267 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1268 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1269 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1270 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1271 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1272 GEN_VEXT_VV(vand_vv_b, 1)
1273 GEN_VEXT_VV(vand_vv_h, 2)
1274 GEN_VEXT_VV(vand_vv_w, 4)
1275 GEN_VEXT_VV(vand_vv_d, 8)
1276 GEN_VEXT_VV(vor_vv_b, 1)
1277 GEN_VEXT_VV(vor_vv_h, 2)
1278 GEN_VEXT_VV(vor_vv_w, 4)
1279 GEN_VEXT_VV(vor_vv_d, 8)
1280 GEN_VEXT_VV(vxor_vv_b, 1)
1281 GEN_VEXT_VV(vxor_vv_h, 2)
1282 GEN_VEXT_VV(vxor_vv_w, 4)
1283 GEN_VEXT_VV(vxor_vv_d, 8)
1284 
1285 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1286 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1287 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1288 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1289 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1290 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1291 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1292 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1293 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1294 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1295 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1296 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1297 GEN_VEXT_VX(vand_vx_b, 1)
1298 GEN_VEXT_VX(vand_vx_h, 2)
1299 GEN_VEXT_VX(vand_vx_w, 4)
1300 GEN_VEXT_VX(vand_vx_d, 8)
1301 GEN_VEXT_VX(vor_vx_b, 1)
1302 GEN_VEXT_VX(vor_vx_h, 2)
1303 GEN_VEXT_VX(vor_vx_w, 4)
1304 GEN_VEXT_VX(vor_vx_d, 8)
1305 GEN_VEXT_VX(vxor_vx_b, 1)
1306 GEN_VEXT_VX(vxor_vx_h, 2)
1307 GEN_VEXT_VX(vxor_vx_w, 4)
1308 GEN_VEXT_VX(vxor_vx_d, 8)
1309 
1310 /* Vector Single-Width Bit Shift Instructions */
1311 #define DO_SLL(N, M)  (N << (M))
1312 #define DO_SRL(N, M)  (N >> (M))
1313 
1314 /* generate the helpers for shift instructions with two vector operands */
1315 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1316 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1317                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1318 {                                                                         \
1319     uint32_t vm = vext_vm(desc);                                          \
1320     uint32_t vl = env->vl;                                                \
1321     uint32_t esz = sizeof(TS1);                                           \
1322     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1323     uint32_t vta = vext_vta(desc);                                        \
1324     uint32_t vma = vext_vma(desc);                                        \
1325     uint32_t i;                                                           \
1326                                                                           \
1327     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1328                                                                           \
1329     for (i = env->vstart; i < vl; i++) {                                  \
1330         if (!vm && !vext_elem_mask(v0, i)) {                              \
1331             /* set masked-off elements to 1s */                           \
1332             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1333             continue;                                                     \
1334         }                                                                 \
1335         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1336         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1337         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1338     }                                                                     \
1339     env->vstart = 0;                                                      \
1340     /* set tail elements to 1s */                                         \
1341     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1342 }
1343 
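     /*
      * The MASK argument keeps only the low lg2(SEW) bits of the shift
      * amount, e.g. vd[i] = vs2[i] << (vs1[i] & 0x7) for 8-bit elements,
      * matching the RVV rule that the remaining shift-amount bits are
      * ignored.
      */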
1344 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1345 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1346 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1347 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1348 
1349 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1350 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1351 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1352 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1353 
1354 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1355 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1356 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1357 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1358 
1359 /*
1360  * generate the helpers for shift instructions with one vector and one scalar
1361  */
1362 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1363 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1364                   void *vs2, CPURISCVState *env,            \
1365                   uint32_t desc)                            \
1366 {                                                           \
1367     uint32_t vm = vext_vm(desc);                            \
1368     uint32_t vl = env->vl;                                  \
1369     uint32_t esz = sizeof(TD);                              \
1370     uint32_t total_elems =                                  \
1371         vext_get_total_elems(env, desc, esz);               \
1372     uint32_t vta = vext_vta(desc);                          \
1373     uint32_t vma = vext_vma(desc);                          \
1374     uint32_t i;                                             \
1375                                                             \
1376     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1377                                                             \
1378     for (i = env->vstart; i < vl; i++) {                    \
1379         if (!vm && !vext_elem_mask(v0, i)) {                \
1380             /* set masked-off elements to 1s */             \
1381             vext_set_elems_1s(vd, vma, i * esz,             \
1382                               (i + 1) * esz);               \
1383             continue;                                       \
1384         }                                                   \
1385         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1386         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1387     }                                                       \
1388     env->vstart = 0;                                        \
1389     /* set tail elements to 1s */                           \
1390     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1391 }
1392 
1393 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1394 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1395 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1396 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1397 
1398 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1399 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1400 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1401 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1402 
1403 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1404 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1405 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1406 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1407 
1408 /* Vector Narrowing Integer Right Shift Instructions */
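     /*
      * The narrowing shifts below reuse GEN_VEXT_SHIFT_VV/VX with an
      * SEW-bit destination and a 2*SEW-bit source (vs2), so the shift
      * amount is masked to lg2(2*SEW) bits, e.g. 0xf when narrowing
      * 16-bit source elements to 8-bit results.
      */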
1409 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1410 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1411 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1412 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1413 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1414 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1415 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1416 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1417 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1418 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1419 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1420 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1421 
1422 /* Vector Integer Comparison Instructions */
1423 #define DO_MSEQ(N, M) (N == M)
1424 #define DO_MSNE(N, M) (N != M)
1425 #define DO_MSLT(N, M) (N < M)
1426 #define DO_MSLE(N, M) (N <= M)
1427 #define DO_MSGT(N, M) (N > M)
1428 
1429 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1430 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1431                   CPURISCVState *env, uint32_t desc)          \
1432 {                                                             \
1433     uint32_t vm = vext_vm(desc);                              \
1434     uint32_t vl = env->vl;                                    \
1435     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1436     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1437     uint32_t vma = vext_vma(desc);                            \
1438     uint32_t i;                                               \
1439                                                               \
1440     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1441                                                               \
1442     for (i = env->vstart; i < vl; i++) {                      \
1443         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1444         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1445         if (!vm && !vext_elem_mask(v0, i)) {                  \
1446             /* set masked-off elements to 1s */               \
1447             if (vma) {                                        \
1448                 vext_set_elem_mask(vd, i, 1);                 \
1449             }                                                 \
1450             continue;                                         \
1451         }                                                     \
1452         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1453     }                                                         \
1454     env->vstart = 0;                                          \
1455     /*
1456      * mask destination register is always tail-agnostic
1457      * set tail elements to 1s
1458      */                                                       \
1459     if (vta_all_1s) {                                         \
1460         for (; i < total_elems; i++) {                        \
1461             vext_set_elem_mask(vd, i, 1);                     \
1462         }                                                     \
1463     }                                                         \
1464 }
1465 
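     /*
      * The compares write one mask bit per element, so the tail covers
      * all VLEN mask bits (vlenb << 3) rather than VLMAX elements, and
      * it is filled with 1s only when the all-1s tail policy
      * (vta_all_1s) is in effect.
      */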
1466 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1467 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1468 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1469 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1470 
1471 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1472 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1473 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1474 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1475 
1476 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1477 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1478 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1479 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1480 
1481 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1482 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1483 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1484 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1485 
1486 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1487 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1488 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1489 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1490 
1491 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1492 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1493 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1494 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1495 
1496 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1497 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1498                   CPURISCVState *env, uint32_t desc)                \
1499 {                                                                   \
1500     uint32_t vm = vext_vm(desc);                                    \
1501     uint32_t vl = env->vl;                                          \
1502     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1503     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1504     uint32_t vma = vext_vma(desc);                                  \
1505     uint32_t i;                                                     \
1506                                                                     \
1507     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1508                                                                     \
1509     for (i = env->vstart; i < vl; i++) {                            \
1510         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1511         if (!vm && !vext_elem_mask(v0, i)) {                        \
1512             /* set masked-off elements to 1s */                     \
1513             if (vma) {                                              \
1514                 vext_set_elem_mask(vd, i, 1);                       \
1515             }                                                       \
1516             continue;                                               \
1517         }                                                           \
1518         vext_set_elem_mask(vd, i,                                   \
1519                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1520     }                                                               \
1521     env->vstart = 0;                                                \
1522     /*
1523      * mask destination register is always tail-agnostic
1524      * set tail elements to 1s
1525      */                                                             \
1526     if (vta_all_1s) {                                               \
1527         for (; i < total_elems; i++) {                              \
1528             vext_set_elem_mask(vd, i, 1);                           \
1529         }                                                           \
1530     }                                                               \
1531 }
1532 
1533 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1534 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1535 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1536 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1537 
1538 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1539 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1540 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1541 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1542 
1543 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1544 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1545 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1546 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1547 
1548 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1549 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1550 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1551 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1552 
1553 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1554 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1555 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1556 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1557 
1558 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1559 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1560 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1561 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1562 
1563 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1564 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1565 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1566 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1567 
1568 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1569 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1570 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1571 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1572 
1573 /* Vector Integer Min/Max Instructions */
1574 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1575 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1576 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1577 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1578 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1579 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1580 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1581 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1582 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1583 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1584 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1585 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1586 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1587 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1588 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1589 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1590 GEN_VEXT_VV(vminu_vv_b, 1)
1591 GEN_VEXT_VV(vminu_vv_h, 2)
1592 GEN_VEXT_VV(vminu_vv_w, 4)
1593 GEN_VEXT_VV(vminu_vv_d, 8)
1594 GEN_VEXT_VV(vmin_vv_b, 1)
1595 GEN_VEXT_VV(vmin_vv_h, 2)
1596 GEN_VEXT_VV(vmin_vv_w, 4)
1597 GEN_VEXT_VV(vmin_vv_d, 8)
1598 GEN_VEXT_VV(vmaxu_vv_b, 1)
1599 GEN_VEXT_VV(vmaxu_vv_h, 2)
1600 GEN_VEXT_VV(vmaxu_vv_w, 4)
1601 GEN_VEXT_VV(vmaxu_vv_d, 8)
1602 GEN_VEXT_VV(vmax_vv_b, 1)
1603 GEN_VEXT_VV(vmax_vv_h, 2)
1604 GEN_VEXT_VV(vmax_vv_w, 4)
1605 GEN_VEXT_VV(vmax_vv_d, 8)
1606 
1607 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1608 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1609 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1610 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1611 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1612 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1613 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1614 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1615 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1616 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1617 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1618 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1619 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1620 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1621 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1622 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1623 GEN_VEXT_VX(vminu_vx_b, 1)
1624 GEN_VEXT_VX(vminu_vx_h, 2)
1625 GEN_VEXT_VX(vminu_vx_w, 4)
1626 GEN_VEXT_VX(vminu_vx_d, 8)
1627 GEN_VEXT_VX(vmin_vx_b, 1)
1628 GEN_VEXT_VX(vmin_vx_h, 2)
1629 GEN_VEXT_VX(vmin_vx_w, 4)
1630 GEN_VEXT_VX(vmin_vx_d, 8)
1631 GEN_VEXT_VX(vmaxu_vx_b, 1)
1632 GEN_VEXT_VX(vmaxu_vx_h, 2)
1633 GEN_VEXT_VX(vmaxu_vx_w, 4)
1634 GEN_VEXT_VX(vmaxu_vx_d, 8)
1635 GEN_VEXT_VX(vmax_vx_b, 1)
1636 GEN_VEXT_VX(vmax_vx_h, 2)
1637 GEN_VEXT_VX(vmax_vx_w, 4)
1638 GEN_VEXT_VX(vmax_vx_d, 8)
1639 
1640 /* Vector Single-Width Integer Multiply Instructions */
1641 #define DO_MUL(N, M) (N * M)
1642 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1643 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1644 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1645 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1646 GEN_VEXT_VV(vmul_vv_b, 1)
1647 GEN_VEXT_VV(vmul_vv_h, 2)
1648 GEN_VEXT_VV(vmul_vv_w, 4)
1649 GEN_VEXT_VV(vmul_vv_d, 8)
1650 
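     /*
      * High-half multiplies: form a double-width product and keep the
      * upper SEW bits.  The 64-bit variants cannot widen in C, so they
      * use the muls64()/mulu64() host utilities instead.
      */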
1651 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1652 {
1653     return (int16_t)s2 * (int16_t)s1 >> 8;
1654 }
1655 
1656 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1657 {
1658     return (int32_t)s2 * (int32_t)s1 >> 16;
1659 }
1660 
1661 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1662 {
1663     return (int64_t)s2 * (int64_t)s1 >> 32;
1664 }
1665 
1666 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1667 {
1668     uint64_t hi_64, lo_64;
1669 
1670     muls64(&lo_64, &hi_64, s1, s2);
1671     return hi_64;
1672 }
1673 
1674 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1675 {
1676     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1677 }
1678 
1679 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1680 {
1681     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1682 }
1683 
1684 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1685 {
1686     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1687 }
1688 
1689 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1690 {
1691     uint64_t hi_64, lo_64;
1692 
1693     mulu64(&lo_64, &hi_64, s2, s1);
1694     return hi_64;
1695 }
1696 
1697 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1698 {
1699     return (int16_t)s2 * (uint16_t)s1 >> 8;
1700 }
1701 
1702 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1703 {
1704     return (int32_t)s2 * (uint32_t)s1 >> 16;
1705 }
1706 
1707 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1708 {
1709     return (int64_t)s2 * (uint64_t)s1 >> 32;
1710 }
1711 
1712 /*
1713  * Let  A = signed operand (s2),
1714  *      B = unsigned operand (s1),
1715  *      P = mulu64(A, B), the product of B with A's bit pattern
1716  *          interpreted as an unsigned value,
1717  *      SP = A * B, the desired signed-by-unsigned product.
1718  *
1719  * IF A >= 0
1720  *      A's bit pattern equals A, so SP = P.
1721  * ELSE
1722  *      A's bit pattern equals A + 2 ** 64, so
1723  *          P  = (A + 2 ** 64) * B = SP + 2 ** 64 * B
1724  *          SP = P - 2 ** 64 * B
1725  *
1726  * Only the upper 64 bits are kept, therefore
1727  *
1728  *      HI_P -= (A < 0 ? B : 0)
1729  */
1730 
1731 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1732 {
1733     uint64_t hi_64, lo_64;
1734 
1735     mulu64(&lo_64, &hi_64, s2, s1);
1736 
1737     hi_64 -= s2 < 0 ? s1 : 0;
1738     return hi_64;
1739 }
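     /*
      * Illustrative example of the identity above: with s2 = -1 and
      * s1 = 3, mulu64() multiplies 2 ** 64 - 1 by 3, giving hi_64 = 2
      * and lo_64 = 2 ** 64 - 3.  Subtracting s1 yields hi_64 = -1, the
      * upper half of the true signed product -3.
      */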
1740 
1741 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1742 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1743 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1744 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1745 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1746 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1747 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1748 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1749 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1750 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1751 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1752 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1753 GEN_VEXT_VV(vmulh_vv_b, 1)
1754 GEN_VEXT_VV(vmulh_vv_h, 2)
1755 GEN_VEXT_VV(vmulh_vv_w, 4)
1756 GEN_VEXT_VV(vmulh_vv_d, 8)
1757 GEN_VEXT_VV(vmulhu_vv_b, 1)
1758 GEN_VEXT_VV(vmulhu_vv_h, 2)
1759 GEN_VEXT_VV(vmulhu_vv_w, 4)
1760 GEN_VEXT_VV(vmulhu_vv_d, 8)
1761 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1762 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1763 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1764 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1765 
1766 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1767 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1768 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1769 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1770 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1771 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1772 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1773 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1774 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1775 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1776 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1777 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1778 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1779 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1780 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1781 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1782 GEN_VEXT_VX(vmul_vx_b, 1)
1783 GEN_VEXT_VX(vmul_vx_h, 2)
1784 GEN_VEXT_VX(vmul_vx_w, 4)
1785 GEN_VEXT_VX(vmul_vx_d, 8)
1786 GEN_VEXT_VX(vmulh_vx_b, 1)
1787 GEN_VEXT_VX(vmulh_vx_h, 2)
1788 GEN_VEXT_VX(vmulh_vx_w, 4)
1789 GEN_VEXT_VX(vmulh_vx_d, 8)
1790 GEN_VEXT_VX(vmulhu_vx_b, 1)
1791 GEN_VEXT_VX(vmulhu_vx_h, 2)
1792 GEN_VEXT_VX(vmulhu_vx_w, 4)
1793 GEN_VEXT_VX(vmulhu_vx_d, 8)
1794 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1795 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1796 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1797 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1798 
1799 /* Vector Integer Divide Instructions */
1800 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1801 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1802 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1803         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1804 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1805         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
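     /*
      * As the RISC-V spec requires, division by zero returns all ones
      * (DIVU/DIV) or the dividend (REMU/REM) rather than trapping, and
      * the signed overflow case INT_MIN / -1 returns INT_MIN with a
      * remainder of 0.  (N == -N) is true only for the most negative
      * value of a signed type, avoiding a per-width INTn_MIN check.
      */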
1806 
1807 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1808 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1809 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1810 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1811 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1812 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1813 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1814 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1815 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1816 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1817 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1818 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1819 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1820 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1821 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1822 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1823 GEN_VEXT_VV(vdivu_vv_b, 1)
1824 GEN_VEXT_VV(vdivu_vv_h, 2)
1825 GEN_VEXT_VV(vdivu_vv_w, 4)
1826 GEN_VEXT_VV(vdivu_vv_d, 8)
1827 GEN_VEXT_VV(vdiv_vv_b, 1)
1828 GEN_VEXT_VV(vdiv_vv_h, 2)
1829 GEN_VEXT_VV(vdiv_vv_w, 4)
1830 GEN_VEXT_VV(vdiv_vv_d, 8)
1831 GEN_VEXT_VV(vremu_vv_b, 1)
1832 GEN_VEXT_VV(vremu_vv_h, 2)
1833 GEN_VEXT_VV(vremu_vv_w, 4)
1834 GEN_VEXT_VV(vremu_vv_d, 8)
1835 GEN_VEXT_VV(vrem_vv_b, 1)
1836 GEN_VEXT_VV(vrem_vv_h, 2)
1837 GEN_VEXT_VV(vrem_vv_w, 4)
1838 GEN_VEXT_VV(vrem_vv_d, 8)
1839 
1840 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1841 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1842 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1843 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1844 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1845 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1846 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1847 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1848 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1849 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1850 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1851 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1852 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1853 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1854 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1855 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1856 GEN_VEXT_VX(vdivu_vx_b, 1)
1857 GEN_VEXT_VX(vdivu_vx_h, 2)
1858 GEN_VEXT_VX(vdivu_vx_w, 4)
1859 GEN_VEXT_VX(vdivu_vx_d, 8)
1860 GEN_VEXT_VX(vdiv_vx_b, 1)
1861 GEN_VEXT_VX(vdiv_vx_h, 2)
1862 GEN_VEXT_VX(vdiv_vx_w, 4)
1863 GEN_VEXT_VX(vdiv_vx_d, 8)
1864 GEN_VEXT_VX(vremu_vx_b, 1)
1865 GEN_VEXT_VX(vremu_vx_h, 2)
1866 GEN_VEXT_VX(vremu_vx_w, 4)
1867 GEN_VEXT_VX(vremu_vx_d, 8)
1868 GEN_VEXT_VX(vrem_vx_b, 1)
1869 GEN_VEXT_VX(vrem_vx_h, 2)
1870 GEN_VEXT_VX(vrem_vx_w, 4)
1871 GEN_VEXT_VX(vrem_vx_d, 8)
1872 
1873 /* Vector Widening Integer Multiply Instructions */
1874 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1875 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1876 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1877 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1878 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1879 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1880 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1881 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1882 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1883 GEN_VEXT_VV(vwmul_vv_b, 2)
1884 GEN_VEXT_VV(vwmul_vv_h, 4)
1885 GEN_VEXT_VV(vwmul_vv_w, 8)
1886 GEN_VEXT_VV(vwmulu_vv_b, 2)
1887 GEN_VEXT_VV(vwmulu_vv_h, 4)
1888 GEN_VEXT_VV(vwmulu_vv_w, 8)
1889 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1890 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1891 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1892 
1893 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1894 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1895 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1896 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1897 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1898 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1899 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1900 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1901 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1902 GEN_VEXT_VX(vwmul_vx_b, 2)
1903 GEN_VEXT_VX(vwmul_vx_h, 4)
1904 GEN_VEXT_VX(vwmul_vx_w, 8)
1905 GEN_VEXT_VX(vwmulu_vx_b, 2)
1906 GEN_VEXT_VX(vwmulu_vx_h, 4)
1907 GEN_VEXT_VX(vwmulu_vx_w, 8)
1908 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1909 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1910 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1911 
1912 /* Vector Single-Width Integer Multiply-Add Instructions */
1913 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1914 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1915 {                                                                  \
1916     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1917     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1918     TD d = *((TD *)vd + HD(i));                                    \
1919     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1920 }
1921 
1922 #define DO_MACC(N, M, D) (M * N + D)
1923 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1924 #define DO_MADD(N, M, D) (M * D + N)
1925 #define DO_NMSUB(N, M, D) (-(M * D) + N)
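     /*
      * With OPIVV3's OP(s2, s1, d) argument order these expand to:
      *   vmacc:  vd[i] = vs1[i] * vs2[i] + vd[i]
      *   vnmsac: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
      *   vmadd:  vd[i] = vs1[i] * vd[i] + vs2[i]
      *   vnmsub: vd[i] = -(vs1[i] * vd[i]) + vs2[i]
      */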
1926 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1927 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1928 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1929 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1930 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1931 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1932 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1933 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1934 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1935 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1936 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1937 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1938 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1939 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1940 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1941 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1942 GEN_VEXT_VV(vmacc_vv_b, 1)
1943 GEN_VEXT_VV(vmacc_vv_h, 2)
1944 GEN_VEXT_VV(vmacc_vv_w, 4)
1945 GEN_VEXT_VV(vmacc_vv_d, 8)
1946 GEN_VEXT_VV(vnmsac_vv_b, 1)
1947 GEN_VEXT_VV(vnmsac_vv_h, 2)
1948 GEN_VEXT_VV(vnmsac_vv_w, 4)
1949 GEN_VEXT_VV(vnmsac_vv_d, 8)
1950 GEN_VEXT_VV(vmadd_vv_b, 1)
1951 GEN_VEXT_VV(vmadd_vv_h, 2)
1952 GEN_VEXT_VV(vmadd_vv_w, 4)
1953 GEN_VEXT_VV(vmadd_vv_d, 8)
1954 GEN_VEXT_VV(vnmsub_vv_b, 1)
1955 GEN_VEXT_VV(vnmsub_vv_h, 2)
1956 GEN_VEXT_VV(vnmsub_vv_w, 4)
1957 GEN_VEXT_VV(vnmsub_vv_d, 8)
1958 
1959 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1960 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1961 {                                                                   \
1962     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1963     TD d = *((TD *)vd + HD(i));                                     \
1964     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1965 }
1966 
1967 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1968 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1969 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1970 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1971 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1972 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1973 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1974 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1975 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1976 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1977 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1978 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1979 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1980 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1981 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1982 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1983 GEN_VEXT_VX(vmacc_vx_b, 1)
1984 GEN_VEXT_VX(vmacc_vx_h, 2)
1985 GEN_VEXT_VX(vmacc_vx_w, 4)
1986 GEN_VEXT_VX(vmacc_vx_d, 8)
1987 GEN_VEXT_VX(vnmsac_vx_b, 1)
1988 GEN_VEXT_VX(vnmsac_vx_h, 2)
1989 GEN_VEXT_VX(vnmsac_vx_w, 4)
1990 GEN_VEXT_VX(vnmsac_vx_d, 8)
1991 GEN_VEXT_VX(vmadd_vx_b, 1)
1992 GEN_VEXT_VX(vmadd_vx_h, 2)
1993 GEN_VEXT_VX(vmadd_vx_w, 4)
1994 GEN_VEXT_VX(vmadd_vx_d, 8)
1995 GEN_VEXT_VX(vnmsub_vx_b, 1)
1996 GEN_VEXT_VX(vnmsub_vx_h, 2)
1997 GEN_VEXT_VX(vnmsub_vx_w, 4)
1998 GEN_VEXT_VX(vnmsub_vx_d, 8)
1999 
2000 /* Vector Widening Integer Multiply-Add Instructions */
2001 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2002 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2003 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2004 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2005 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2006 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2007 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2008 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2009 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2010 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2011 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2012 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2013 GEN_VEXT_VV(vwmacc_vv_b, 2)
2014 GEN_VEXT_VV(vwmacc_vv_h, 4)
2015 GEN_VEXT_VV(vwmacc_vv_w, 8)
2016 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2017 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2018 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2019 
2020 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2021 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2022 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2023 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2024 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2025 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2026 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2027 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2028 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2029 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2030 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2031 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2032 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2033 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2034 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2035 GEN_VEXT_VX(vwmacc_vx_b, 2)
2036 GEN_VEXT_VX(vwmacc_vx_h, 4)
2037 GEN_VEXT_VX(vwmacc_vx_w, 8)
2038 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2039 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2040 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2041 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2042 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2043 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2044 
2045 /* Vector Integer Merge and Move Instructions */
2046 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2047 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2048                   uint32_t desc)                                     \
2049 {                                                                    \
2050     uint32_t vl = env->vl;                                           \
2051     uint32_t esz = sizeof(ETYPE);                                    \
2052     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2053     uint32_t vta = vext_vta(desc);                                   \
2054     uint32_t i;                                                      \
2055                                                                      \
2056     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2057                                                                      \
2058     for (i = env->vstart; i < vl; i++) {                             \
2059         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2060         *((ETYPE *)vd + H(i)) = s1;                                  \
2061     }                                                                \
2062     env->vstart = 0;                                                 \
2063     /* set tail elements to 1s */                                    \
2064     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2065 }
2066 
2067 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2068 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2069 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2070 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2071 
2072 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2073 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2074                   uint32_t desc)                                     \
2075 {                                                                    \
2076     uint32_t vl = env->vl;                                           \
2077     uint32_t esz = sizeof(ETYPE);                                    \
2078     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2079     uint32_t vta = vext_vta(desc);                                   \
2080     uint32_t i;                                                      \
2081                                                                      \
2082     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2083                                                                      \
2084     for (i = env->vstart; i < vl; i++) {                             \
2085         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2086     }                                                                \
2087     env->vstart = 0;                                                 \
2088     /* set tail elements to 1s */                                    \
2089     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2090 }
2091 
2092 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2093 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2094 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2095 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2096 
2097 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2098 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2099                   CPURISCVState *env, uint32_t desc)                 \
2100 {                                                                    \
2101     uint32_t vl = env->vl;                                           \
2102     uint32_t esz = sizeof(ETYPE);                                    \
2103     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2104     uint32_t vta = vext_vta(desc);                                   \
2105     uint32_t i;                                                      \
2106                                                                      \
2107     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2108                                                                      \
2109     for (i = env->vstart; i < vl; i++) {                             \
2110         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2111         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2112     }                                                                \
2113     env->vstart = 0;                                                 \
2114     /* set tail elements to 1s */                                    \
2115     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2116 }
2117 
2118 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2119 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2120 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2121 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2122 
2123 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2124 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2125                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2126 {                                                                    \
2127     uint32_t vl = env->vl;                                           \
2128     uint32_t esz = sizeof(ETYPE);                                    \
2129     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2130     uint32_t vta = vext_vta(desc);                                   \
2131     uint32_t i;                                                      \
2132                                                                      \
2133     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2134                                                                      \
2135     for (i = env->vstart; i < vl; i++) {                             \
2136         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2137         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2138                    (ETYPE)(target_long)s1);                          \
2139         *((ETYPE *)vd + H(i)) = d;                                   \
2140     }                                                                \
2141     env->vstart = 0;                                                 \
2142     /* set tail elements to 1s */                                    \
2143     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2144 }
2145 
2146 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2147 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2148 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2149 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2150 
2151 /*
2152  * Vector Fixed-Point Arithmetic Instructions
2153  */
2154 
2155 /* Vector Single-Width Saturating Add and Subtract */
2156 
2157 /*
2158  * As fixed-point instructions take a rounding mode and may saturate,
2159  * define common macros for fixed point here.
2160  */
2161 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2162                           CPURISCVState *env, int vxrm);
2163 
2164 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2165 static inline void                                                  \
2166 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2167           CPURISCVState *env, int vxrm)                             \
2168 {                                                                   \
2169     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2170     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2171     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2172 }
2173 
2174 static inline void
2175 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2176              CPURISCVState *env,
2177              uint32_t vl, uint32_t vm, int vxrm,
2178              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2179 {
2180     for (uint32_t i = env->vstart; i < vl; i++) {
2181         if (!vm && !vext_elem_mask(v0, i)) {
2182             /* set masked-off elements to 1s */
2183             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2184             continue;
2185         }
2186         fn(vd, vs1, vs2, i, env, vxrm);
2187     }
2188     env->vstart = 0;
2189 }
2190 
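     /*
      * vext_vv_rm_1() runs the masked element loop for one fixed
      * rounding mode; vext_vv_rm_2() reads env->vxrm once and then
      * dispatches with a constant vxrm, presumably so each loop can be
      * specialised for its rounding mode.
      */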
2191 static inline void
2192 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2193              CPURISCVState *env,
2194              uint32_t desc,
2195              opivv2_rm_fn *fn, uint32_t esz)
2196 {
2197     uint32_t vm = vext_vm(desc);
2198     uint32_t vl = env->vl;
2199     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2200     uint32_t vta = vext_vta(desc);
2201     uint32_t vma = vext_vma(desc);
2202 
2203     VSTART_CHECK_EARLY_EXIT(env, vl);
2204 
2205     switch (env->vxrm) {
2206     case 0: /* rnu */
2207         vext_vv_rm_1(vd, v0, vs1, vs2,
2208                      env, vl, vm, 0, fn, vma, esz);
2209         break;
2210     case 1: /* rne */
2211         vext_vv_rm_1(vd, v0, vs1, vs2,
2212                      env, vl, vm, 1, fn, vma, esz);
2213         break;
2214     case 2: /* rdn */
2215         vext_vv_rm_1(vd, v0, vs1, vs2,
2216                      env, vl, vm, 2, fn, vma, esz);
2217         break;
2218     default: /* rod */
2219         vext_vv_rm_1(vd, v0, vs1, vs2,
2220                      env, vl, vm, 3, fn, vma, esz);
2221         break;
2222     }
2223     /* set tail elements to 1s */
2224     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2225 }
2226 
2227 /* generate helpers for fixed point instructions with OPIVV format */
2228 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2229 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2230                   CPURISCVState *env, uint32_t desc)            \
2231 {                                                               \
2232     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2233                  do_##NAME, ESZ);                               \
2234 }
2235 
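     /*
      * Unsigned saturating add: an unsigned overflow wraps around, so
      * the sum becomes smaller than either operand (res < a); clamp to
      * the type maximum and set vxsat.
      */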
2236 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2237                              uint8_t b)
2238 {
2239     uint8_t res = a + b;
2240     if (res < a) {
2241         res = UINT8_MAX;
2242         env->vxsat = 0x1;
2243     }
2244     return res;
2245 }
2246 
2247 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2248                                uint16_t b)
2249 {
2250     uint16_t res = a + b;
2251     if (res < a) {
2252         res = UINT16_MAX;
2253         env->vxsat = 0x1;
2254     }
2255     return res;
2256 }
2257 
2258 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2259                                uint32_t b)
2260 {
2261     uint32_t res = a + b;
2262     if (res < a) {
2263         res = UINT32_MAX;
2264         env->vxsat = 0x1;
2265     }
2266     return res;
2267 }
2268 
2269 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2270                                uint64_t b)
2271 {
2272     uint64_t res = a + b;
2273     if (res < a) {
2274         res = UINT64_MAX;
2275         env->vxsat = 0x1;
2276     }
2277     return res;
2278 }
2279 
2280 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2281 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2282 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2283 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2284 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2285 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2286 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2287 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2288 
2289 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2290                           CPURISCVState *env, int vxrm);
2291 
2292 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2293 static inline void                                                  \
2294 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2295           CPURISCVState *env, int vxrm)                             \
2296 {                                                                   \
2297     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2298     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2299 }
2300 
2301 static inline void
2302 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2303              CPURISCVState *env,
2304              uint32_t vl, uint32_t vm, int vxrm,
2305              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2306 {
2307     for (uint32_t i = env->vstart; i < vl; i++) {
2308         if (!vm && !vext_elem_mask(v0, i)) {
2309             /* set masked-off elements to 1s */
2310             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2311             continue;
2312         }
2313         fn(vd, s1, vs2, i, env, vxrm);
2314     }
2315     env->vstart = 0;
2316 }
2317 
2318 static inline void
2319 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2320              CPURISCVState *env,
2321              uint32_t desc,
2322              opivx2_rm_fn *fn, uint32_t esz)
2323 {
2324     uint32_t vm = vext_vm(desc);
2325     uint32_t vl = env->vl;
2326     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2327     uint32_t vta = vext_vta(desc);
2328     uint32_t vma = vext_vma(desc);
2329 
2330     VSTART_CHECK_EARLY_EXIT(env, vl);
2331 
2332     switch (env->vxrm) {
2333     case 0: /* rnu */
2334         vext_vx_rm_1(vd, v0, s1, vs2,
2335                      env, vl, vm, 0, fn, vma, esz);
2336         break;
2337     case 1: /* rne */
2338         vext_vx_rm_1(vd, v0, s1, vs2,
2339                      env, vl, vm, 1, fn, vma, esz);
2340         break;
2341     case 2: /* rdn */
2342         vext_vx_rm_1(vd, v0, s1, vs2,
2343                      env, vl, vm, 2, fn, vma, esz);
2344         break;
2345     default: /* rod */
2346         vext_vx_rm_1(vd, v0, s1, vs2,
2347                      env, vl, vm, 3, fn, vma, esz);
2348         break;
2349     }
2350     /* set tail elements to 1s */
2351     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2352 }
2353 
2354 /* generate helpers for fixed point instructions with OPIVX format */
2355 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2356 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2357                   void *vs2, CPURISCVState *env,          \
2358                   uint32_t desc)                          \
2359 {                                                         \
2360     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2361                  do_##NAME, ESZ);                         \
2362 }
2363 
2364 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2365 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2366 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2367 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2368 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2369 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2370 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2371 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2372 
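     /*
      * Signed saturating add: overflow occurred iff both operands have
      * the same sign and the result's sign differs, which is what
      * (res ^ a) & (res ^ b) & INT_MIN tests; saturate towards the
      * operands' sign and set vxsat.
      */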
2373 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2374 {
2375     int8_t res = a + b;
2376     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2377         res = a > 0 ? INT8_MAX : INT8_MIN;
2378         env->vxsat = 0x1;
2379     }
2380     return res;
2381 }
2382 
2383 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2384                              int16_t b)
2385 {
2386     int16_t res = a + b;
2387     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2388         res = a > 0 ? INT16_MAX : INT16_MIN;
2389         env->vxsat = 0x1;
2390     }
2391     return res;
2392 }
2393 
2394 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2395                              int32_t b)
2396 {
2397     int32_t res = a + b;
2398     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2399         res = a > 0 ? INT32_MAX : INT32_MIN;
2400         env->vxsat = 0x1;
2401     }
2402     return res;
2403 }
2404 
2405 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2406                              int64_t b)
2407 {
2408     int64_t res = a + b;
2409     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2410         res = a > 0 ? INT64_MAX : INT64_MIN;
2411         env->vxsat = 0x1;
2412     }
2413     return res;
2414 }
2415 
2416 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2417 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2418 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2419 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2420 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2421 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2422 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2423 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2424 
2425 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2426 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2427 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2428 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2429 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2430 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2431 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2432 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2433 
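     /*
      * Unsigned saturating subtract: a borrow makes the difference
      * larger than the minuend (res > a); clamp to 0 and set vxsat.
      */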
2434 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2435                              uint8_t b)
2436 {
2437     uint8_t res = a - b;
2438     if (res > a) {
2439         res = 0;
2440         env->vxsat = 0x1;
2441     }
2442     return res;
2443 }
2444 
2445 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2446                                uint16_t b)
2447 {
2448     uint16_t res = a - b;
2449     if (res > a) {
2450         res = 0;
2451         env->vxsat = 0x1;
2452     }
2453     return res;
2454 }
2455 
2456 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2457                                uint32_t b)
2458 {
2459     uint32_t res = a - b;
2460     if (res > a) {
2461         res = 0;
2462         env->vxsat = 0x1;
2463     }
2464     return res;
2465 }
2466 
2467 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2468                                uint64_t b)
2469 {
2470     uint64_t res = a - b;
2471     if (res > a) {
2472         res = 0;
2473         env->vxsat = 0x1;
2474     }
2475     return res;
2476 }
2477 
2478 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2479 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2480 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2481 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2482 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2483 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2484 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2485 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2486 
2487 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2488 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2489 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2490 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2491 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2492 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2493 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2494 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2495 
2496 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2497 {
2498     int8_t res = a - b;
2499     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2500         res = a >= 0 ? INT8_MAX : INT8_MIN;
2501         env->vxsat = 0x1;
2502     }
2503     return res;
2504 }
2505 
2506 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2507                              int16_t b)
2508 {
2509     int16_t res = a - b;
2510     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2511         res = a >= 0 ? INT16_MAX : INT16_MIN;
2512         env->vxsat = 0x1;
2513     }
2514     return res;
2515 }
2516 
2517 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2518                              int32_t b)
2519 {
2520     int32_t res = a - b;
2521     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2522         res = a >= 0 ? INT32_MAX : INT32_MIN;
2523         env->vxsat = 0x1;
2524     }
2525     return res;
2526 }
2527 
2528 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2529                              int64_t b)
2530 {
2531     int64_t res = a - b;
2532     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2533         res = a >= 0 ? INT64_MAX : INT64_MIN;
2534         env->vxsat = 0x1;
2535     }
2536     return res;
2537 }
2538 
2539 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2540 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2541 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2542 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2543 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2544 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2545 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2546 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2547 
2548 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2549 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2550 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2551 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2552 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2553 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2554 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2555 GEN_VEXT_VX_RM(vssub_vx_d, 8)
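
/*
 * Illustrative sketch (not part of the original file): the saturation tests
 * above are the usual two's-complement sign tricks.  Signed addition can
 * only overflow when a and b have the same sign and res takes the other
 * sign, so (res ^ a) & (res ^ b) has the sign bit set exactly then; signed
 * subtraction overflows when a and b have opposite signs and res takes b's
 * sign, hence (res ^ a) & (a ^ b).  demo_sadd8_overflows is a hypothetical
 * name used only for this sketch.
 */
static inline bool demo_sadd8_overflows(int8_t a, int8_t b)
{
    int8_t res = a + b;     /* wraps modulo 2^8 */

    /* e.g. a = 100, b = 100: res wraps to -56 and both XORs set bit 7 */
    return ((res ^ a) & (res ^ b) & INT8_MIN) != 0;
}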
2556 
2557 /* Vector Single-Width Averaging Add and Subtract */
2558 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2559 {
2560     uint8_t d = extract64(v, shift, 1);
2561     uint8_t d1;
2562     uint64_t D1, D2;
2563 
2564     if (shift == 0 || shift > 64) {
2565         return 0;
2566     }
2567 
2568     d1 = extract64(v, shift - 1, 1);
2569     D1 = extract64(v, 0, shift);
2570     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2571         return d1;
2572     } else if (vxrm == 1) { /* round-to-nearest-even */
2573         if (shift > 1) {
2574             D2 = extract64(v, 0, shift - 1);
2575             return d1 & ((D2 != 0) | d);
2576         } else {
2577             return d1 & d;
2578         }
2579     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2580         return !d & (D1 != 0);
2581     }
2582     return 0; /* round-down (truncate) */
2583 }
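
/*
 * Illustrative sketch (not part of the original file): the fixed-point
 * helpers below all compute "(value >> shift) + get_round(...)", i.e. a
 * scale by 2^-shift with the rounding increment selected by vxrm.  For
 * v = 5 and shift = 1 (exact value 2.5) the four modes give:
 *
 *   vxrm 0 (rnu): round = d1             = 1  ->  2 + 1 = 3
 *   vxrm 1 (rne): round = d1 & d         = 0  ->  2 + 0 = 2
 *   vxrm 2 (rdn): round = 0                   ->  2 + 0 = 2
 *   vxrm 3 (rod): round = !d & (D1 != 0) = 1  ->  2 + 1 = 3
 *
 * demo_roundoff_u64 is a hypothetical name used only for this sketch.
 */
static inline uint64_t demo_roundoff_u64(int vxrm, uint64_t v, uint8_t shift)
{
    return (v >> shift) + get_round(vxrm, v, shift);
}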
2584 
2585 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2586                              int32_t b)
2587 {
2588     int64_t res = (int64_t)a + b;
2589     uint8_t round = get_round(vxrm, res, 1);
2590 
2591     return (res >> 1) + round;
2592 }
2593 
2594 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2595                              int64_t b)
2596 {
2597     int64_t res = a + b;
2598     uint8_t round = get_round(vxrm, res, 1);
2599     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2600 
2601     /* With signed overflow, bit 64 is inverse of bit 63. */
2602     return ((res >> 1) ^ over) + round;
2603 }
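
/*
 * Illustrative sketch (not part of the original file): a reference version
 * of aadd64 using a 128-bit intermediate, showing what the
 * "(res >> 1) ^ over" trick above emulates.  Assumes a compiler that
 * provides __int128 and arithmetic right shift of negative values, as
 * QEMU's supported compilers do; ref_aadd64 is a hypothetical name.
 */
static inline int64_t ref_aadd64(int vxrm, int64_t a, int64_t b)
{
    __int128 sum = (__int128)a + b;                     /* exact 65-bit sum */
    uint8_t round = get_round(vxrm, (uint64_t)sum, 1);  /* same low bits */

    return (int64_t)(sum >> 1) + round;                 /* cannot overflow */
}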
2604 
2605 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2606 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2607 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2608 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2609 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2610 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2611 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2612 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2613 
2614 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2615 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2616 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2617 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2618 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2619 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2620 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2621 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2622 
2623 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2624                                uint32_t a, uint32_t b)
2625 {
2626     uint64_t res = (uint64_t)a + b;
2627     uint8_t round = get_round(vxrm, res, 1);
2628 
2629     return (res >> 1) + round;
2630 }
2631 
2632 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2633                                uint64_t a, uint64_t b)
2634 {
2635     uint64_t res = a + b;
2636     uint8_t round = get_round(vxrm, res, 1);
2637     uint64_t over = (uint64_t)(res < a) << 63;
2638 
2639     return ((res >> 1) | over) + round;
2640 }
2641 
2642 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2643 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2644 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2645 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2646 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2647 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2648 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2649 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2650 
2651 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2652 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2653 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2654 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2655 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2656 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2657 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2658 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2659 
2660 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2661                              int32_t b)
2662 {
2663     int64_t res = (int64_t)a - b;
2664     uint8_t round = get_round(vxrm, res, 1);
2665 
2666     return (res >> 1) + round;
2667 }
2668 
2669 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2670                              int64_t b)
2671 {
2672     int64_t res = (int64_t)a - b;
2673     uint8_t round = get_round(vxrm, res, 1);
2674     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2675 
2676     /* With signed overflow, bit 64 is inverse of bit 63. */
2677     return ((res >> 1) ^ over) + round;
2678 }
2679 
2680 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2681 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2682 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2683 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2684 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2685 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2686 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2687 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2688 
2689 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2690 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2691 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2692 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2693 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2694 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2695 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2696 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2697 
2698 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2699                                uint32_t a, uint32_t b)
2700 {
2701     int64_t res = (int64_t)a - b;
2702     uint8_t round = get_round(vxrm, res, 1);
2703 
2704     return (res >> 1) + round;
2705 }
2706 
2707 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2708                                uint64_t a, uint64_t b)
2709 {
2710     uint64_t res = (uint64_t)a - b;
2711     uint8_t round = get_round(vxrm, res, 1);
2712     uint64_t over = (uint64_t)(res > a) << 63;
2713 
2714     return ((res >> 1) | over) + round;
2715 }
2716 
2717 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2718 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2719 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2720 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2721 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2722 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2723 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2724 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2725 
2726 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2727 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2728 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2729 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2730 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2731 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2732 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2733 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2734 
2735 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2736 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2737 {
2738     uint8_t round;
2739     int16_t res;
2740 
2741     res = (int16_t)a * (int16_t)b;
2742     round = get_round(vxrm, res, 7);
2743     res = (res >> 7) + round;
2744 
2745     if (res > INT8_MAX) {
2746         env->vxsat = 0x1;
2747         return INT8_MAX;
2748     } else if (res < INT8_MIN) {
2749         env->vxsat = 0x1;
2750         return INT8_MIN;
2751     } else {
2752         return res;
2753     }
2754 }
2755 
2756 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2757 {
2758     uint8_t round;
2759     int32_t res;
2760 
2761     res = (int32_t)a * (int32_t)b;
2762     round = get_round(vxrm, res, 15);
2763     res = (res >> 15) + round;
2764 
2765     if (res > INT16_MAX) {
2766         env->vxsat = 0x1;
2767         return INT16_MAX;
2768     } else if (res < INT16_MIN) {
2769         env->vxsat = 0x1;
2770         return INT16_MIN;
2771     } else {
2772         return res;
2773     }
2774 }
2775 
2776 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2777 {
2778     uint8_t round;
2779     int64_t res;
2780 
2781     res = (int64_t)a * (int64_t)b;
2782     round = get_round(vxrm, res, 31);
2783     res = (res >> 31) + round;
2784 
2785     if (res > INT32_MAX) {
2786         env->vxsat = 0x1;
2787         return INT32_MAX;
2788     } else if (res < INT32_MIN) {
2789         env->vxsat = 0x1;
2790         return INT32_MIN;
2791     } else {
2792         return res;
2793     }
2794 }
2795 
2796 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2797 {
2798     uint8_t round;
2799     uint64_t hi_64, lo_64;
2800     int64_t res;
2801 
2802     if (a == INT64_MIN && b == INT64_MIN) {
2803         env->vxsat = 1;
2804         return INT64_MAX;
2805     }
2806 
2807     muls64(&lo_64, &hi_64, a, b);
2808     round = get_round(vxrm, lo_64, 63);
2809     /*
2810      * Cannot overflow, as there are always
2811      * 2 sign bits after multiply.
2812      */
2813     res = (hi_64 << 1) | (lo_64 >> 63);
2814     if (round) {
2815         if (res == INT64_MAX) {
2816             env->vxsat = 1;
2817         } else {
2818             res += 1;
2819         }
2820     }
2821     return res;
2822 }
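
/*
 * Illustrative sketch (not part of the original file): the same Q63
 * fractional multiply written on a 128-bit product.  Apart from the
 * INT64_MIN * INT64_MIN case handled above (exact result 1.0, which Q63
 * cannot represent), the product always carries two identical sign bits,
 * so taking bits 126..63 cannot overflow a signed 64-bit value.  Assumes
 * __int128; ref_vsmul64 is a hypothetical name.
 */
static inline int64_t ref_vsmul64(CPURISCVState *env, int vxrm,
                                  int64_t a, int64_t b)
{
    __int128 prod;
    uint8_t round;
    int64_t res;

    if (a == INT64_MIN && b == INT64_MIN) {
        env->vxsat = 1;
        return INT64_MAX;
    }

    prod = (__int128)a * b;                      /* exact Q126 product */
    round = get_round(vxrm, (uint64_t)prod, 63);
    res = (int64_t)(prod >> 63);                 /* bits 126..63 */
    if (round) {
        if (res == INT64_MAX) {
            env->vxsat = 1;
        } else {
            res += 1;
        }
    }
    return res;
}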
2823 
2824 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2825 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2826 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2827 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2828 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2829 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2830 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2831 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2832 
2833 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2834 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2835 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2836 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2837 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2838 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2839 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2840 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2841 
2842 /* Vector Single-Width Scaling Shift Instructions */
2843 static inline uint8_t
2844 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2845 {
2846     uint8_t round, shift = b & 0x7;
2847     uint8_t res;
2848 
2849     round = get_round(vxrm, a, shift);
2850     res = (a >> shift) + round;
2851     return res;
2852 }
2853 static inline uint16_t
2854 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2855 {
2856     uint8_t round, shift = b & 0xf;
2857 
2858     round = get_round(vxrm, a, shift);
2859     return (a >> shift) + round;
2860 }
2861 static inline uint32_t
2862 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2863 {
2864     uint8_t round, shift = b & 0x1f;
2865 
2866     round = get_round(vxrm, a, shift);
2867     return (a >> shift) + round;
2868 }
2869 static inline uint64_t
2870 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2871 {
2872     uint8_t round, shift = b & 0x3f;
2873 
2874     round = get_round(vxrm, a, shift);
2875     return (a >> shift) + round;
2876 }
2877 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2878 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2879 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2880 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2881 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2882 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2883 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2884 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2885 
2886 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2887 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2888 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2889 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2890 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2891 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2892 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2893 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2894 
2895 static inline int8_t
2896 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2897 {
2898     uint8_t round, shift = b & 0x7;
2899 
2900     round = get_round(vxrm, a, shift);
2901     return (a >> shift) + round;
2902 }
2903 static inline int16_t
2904 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2905 {
2906     uint8_t round, shift = b & 0xf;
2907 
2908     round = get_round(vxrm, a, shift);
2909     return (a >> shift) + round;
2910 }
2911 static inline int32_t
2912 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2913 {
2914     uint8_t round, shift = b & 0x1f;
2915 
2916     round = get_round(vxrm, a, shift);
2917     return (a >> shift) + round;
2918 }
2919 static inline int64_t
2920 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2921 {
2922     uint8_t round, shift = b & 0x3f;
2923 
2924     round = get_round(vxrm, a, shift);
2925     return (a >> shift) + round;
2926 }
2927 
2928 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2929 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2930 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2931 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2932 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2933 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2934 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2935 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2936 
2937 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2938 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2939 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2940 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2941 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2942 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2943 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2944 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2945 
2946 /* Vector Narrowing Fixed-Point Clip Instructions */
2947 static inline int8_t
2948 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2949 {
2950     uint8_t round, shift = b & 0xf;
2951     int16_t res;
2952 
2953     round = get_round(vxrm, a, shift);
2954     res = (a >> shift) + round;
2955     if (res > INT8_MAX) {
2956         env->vxsat = 0x1;
2957         return INT8_MAX;
2958     } else if (res < INT8_MIN) {
2959         env->vxsat = 0x1;
2960         return INT8_MIN;
2961     } else {
2962         return res;
2963     }
2964 }
2965 
2966 static inline int16_t
2967 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2968 {
2969     uint8_t round, shift = b & 0x1f;
2970     int32_t res;
2971 
2972     round = get_round(vxrm, a, shift);
2973     res = (a >> shift) + round;
2974     if (res > INT16_MAX) {
2975         env->vxsat = 0x1;
2976         return INT16_MAX;
2977     } else if (res < INT16_MIN) {
2978         env->vxsat = 0x1;
2979         return INT16_MIN;
2980     } else {
2981         return res;
2982     }
2983 }
2984 
2985 static inline int32_t
2986 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2987 {
2988     uint8_t round, shift = b & 0x3f;
2989     int64_t res;
2990 
2991     round = get_round(vxrm, a, shift);
2992     res = (a >> shift) + round;
2993     if (res > INT32_MAX) {
2994         env->vxsat = 0x1;
2995         return INT32_MAX;
2996     } else if (res < INT32_MIN) {
2997         env->vxsat = 0x1;
2998         return INT32_MIN;
2999     } else {
3000         return res;
3001     }
3002 }
3003 
3004 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3005 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3006 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3007 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3008 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3009 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3010 
3011 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3012 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3013 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3014 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3015 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3016 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
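
/*
 * Illustrative note (not part of the original file): a narrowing clip is
 * the scaling shift above followed by a saturate to the narrower type.
 * For example, vnclip8 with vxrm = 0 (rnu), a = 0x1234 and shift = 4
 * computes (0x1234 >> 4) + 0 = 0x123 = 291, which exceeds INT8_MAX, so
 * the result saturates to 127 and vxsat is set.
 */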
3017 
3018 static inline uint8_t
3019 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3020 {
3021     uint8_t round, shift = b & 0xf;
3022     uint16_t res;
3023 
3024     round = get_round(vxrm, a, shift);
3025     res = (a >> shift) + round;
3026     if (res > UINT8_MAX) {
3027         env->vxsat = 0x1;
3028         return UINT8_MAX;
3029     } else {
3030         return res;
3031     }
3032 }
3033 
3034 static inline uint16_t
3035 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3036 {
3037     uint8_t round, shift = b & 0x1f;
3038     uint32_t res;
3039 
3040     round = get_round(vxrm, a, shift);
3041     res = (a >> shift) + round;
3042     if (res > UINT16_MAX) {
3043         env->vxsat = 0x1;
3044         return UINT16_MAX;
3045     } else {
3046         return res;
3047     }
3048 }
3049 
3050 static inline uint32_t
3051 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3052 {
3053     uint8_t round, shift = b & 0x3f;
3054     uint64_t res;
3055 
3056     round = get_round(vxrm, a, shift);
3057     res = (a >> shift) + round;
3058     if (res > UINT32_MAX) {
3059         env->vxsat = 0x1;
3060         return UINT32_MAX;
3061     } else {
3062         return res;
3063     }
3064 }
3065 
3066 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3067 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3068 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3069 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3070 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3071 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3072 
3073 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3074 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3075 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3076 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3077 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3078 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3079 
3080 /*
3081  * Vector Floating-Point Arithmetic Instructions
3082  */
3083 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3084 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3085 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3086                       CPURISCVState *env)                      \
3087 {                                                              \
3088     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3089     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3090     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3091 }
3092 
3093 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3094 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3095                   void *vs2, CPURISCVState *env,          \
3096                   uint32_t desc)                          \
3097 {                                                         \
3098     uint32_t vm = vext_vm(desc);                          \
3099     uint32_t vl = env->vl;                                \
3100     uint32_t total_elems =                                \
3101         vext_get_total_elems(env, desc, ESZ);             \
3102     uint32_t vta = vext_vta(desc);                        \
3103     uint32_t vma = vext_vma(desc);                        \
3104     uint32_t i;                                           \
3105                                                           \
3106     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3107                                                           \
3108     for (i = env->vstart; i < vl; i++) {                  \
3109         if (!vm && !vext_elem_mask(v0, i)) {              \
3110             /* set masked-off elements to 1s */           \
3111             vext_set_elems_1s(vd, vma, i * ESZ,           \
3112                               (i + 1) * ESZ);             \
3113             continue;                                     \
3114         }                                                 \
3115         do_##NAME(vd, vs1, vs2, i, env);                  \
3116     }                                                     \
3117     env->vstart = 0;                                      \
3118     /* set tail elements to 1s */                         \
3119     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3120                       total_elems * ESZ);                 \
3121 }
3122 
3123 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3124 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3125 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3126 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3127 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3128 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
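
/*
 * Illustrative note (not part of the original file): RVVCALL() simply
 * applies its first argument, so RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H,
 * H2, H2, H2, float16_add) above expands to roughly:
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * and GEN_VEXT_VV_ENV(vfadd_vv_h, 2) wraps it in the masked, tail-agnostic
 * element loop that becomes HELPER(vfadd_vv_h).
 */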
3129 
3130 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3131 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3132                       CPURISCVState *env)                      \
3133 {                                                              \
3134     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3135     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3136 }
3137 
3138 #define GEN_VEXT_VF(NAME, ESZ)                            \
3139 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3140                   void *vs2, CPURISCVState *env,          \
3141                   uint32_t desc)                          \
3142 {                                                         \
3143     uint32_t vm = vext_vm(desc);                          \
3144     uint32_t vl = env->vl;                                \
3145     uint32_t total_elems =                                \
3146         vext_get_total_elems(env, desc, ESZ);             \
3147     uint32_t vta = vext_vta(desc);                        \
3148     uint32_t vma = vext_vma(desc);                        \
3149     uint32_t i;                                           \
3150                                                           \
3151     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3152                                                           \
3153     for (i = env->vstart; i < vl; i++) {                  \
3154         if (!vm && !vext_elem_mask(v0, i)) {              \
3155             /* set masked-off elements to 1s */           \
3156             vext_set_elems_1s(vd, vma, i * ESZ,           \
3157                               (i + 1) * ESZ);             \
3158             continue;                                     \
3159         }                                                 \
3160         do_##NAME(vd, s1, vs2, i, env);                   \
3161     }                                                     \
3162     env->vstart = 0;                                      \
3163     /* set tail elements to 1s */                         \
3164     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3165                       total_elems * ESZ);                 \
3166 }
3167 
3168 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3169 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3170 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3171 GEN_VEXT_VF(vfadd_vf_h, 2)
3172 GEN_VEXT_VF(vfadd_vf_w, 4)
3173 GEN_VEXT_VF(vfadd_vf_d, 8)
3174 
3175 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3176 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3177 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3178 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3179 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3180 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3181 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3182 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3183 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3184 GEN_VEXT_VF(vfsub_vf_h, 2)
3185 GEN_VEXT_VF(vfsub_vf_w, 4)
3186 GEN_VEXT_VF(vfsub_vf_d, 8)
3187 
3188 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3189 {
3190     return float16_sub(b, a, s);
3191 }
3192 
3193 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3194 {
3195     return float32_sub(b, a, s);
3196 }
3197 
3198 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3199 {
3200     return float64_sub(b, a, s);
3201 }
3202 
3203 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3204 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3205 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3206 GEN_VEXT_VF(vfrsub_vf_h, 2)
3207 GEN_VEXT_VF(vfrsub_vf_w, 4)
3208 GEN_VEXT_VF(vfrsub_vf_d, 8)
3209 
3210 /* Vector Widening Floating-Point Add/Subtract Instructions */
3211 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3212 {
3213     return float32_add(float16_to_float32(a, true, s),
3214                        float16_to_float32(b, true, s), s);
3215 }
3216 
3217 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3218 {
3219     return float64_add(float32_to_float64(a, s),
3220                        float32_to_float64(b, s), s);
3221 
3222 }
3223 
3224 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3225 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3226 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3227 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3228 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3229 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3230 GEN_VEXT_VF(vfwadd_vf_h, 4)
3231 GEN_VEXT_VF(vfwadd_vf_w, 8)
3232 
3233 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3234 {
3235     return float32_sub(float16_to_float32(a, true, s),
3236                        float16_to_float32(b, true, s), s);
3237 }
3238 
3239 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3240 {
3241     return float64_sub(float32_to_float64(a, s),
3242                        float32_to_float64(b, s), s);
3243 
3244 }
3245 
3246 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3247 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3248 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3249 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3250 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3251 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3252 GEN_VEXT_VF(vfwsub_vf_h, 4)
3253 GEN_VEXT_VF(vfwsub_vf_w, 8)
3254 
3255 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3256 {
3257     return float32_add(a, float16_to_float32(b, true, s), s);
3258 }
3259 
3260 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3261 {
3262     return float64_add(a, float32_to_float64(b, s), s);
3263 }
3264 
3265 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3266 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3267 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3268 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3269 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3270 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3271 GEN_VEXT_VF(vfwadd_wf_h, 4)
3272 GEN_VEXT_VF(vfwadd_wf_w, 8)
3273 
3274 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3275 {
3276     return float32_sub(a, float16_to_float32(b, true, s), s);
3277 }
3278 
3279 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3280 {
3281     return float64_sub(a, float32_to_float64(b, s), s);
3282 }
3283 
3284 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3285 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3286 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3287 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3288 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3289 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3290 GEN_VEXT_VF(vfwsub_wf_h, 4)
3291 GEN_VEXT_VF(vfwsub_wf_w, 8)
3292 
3293 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3294 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3295 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3296 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3297 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3298 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3299 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3300 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3301 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3302 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3303 GEN_VEXT_VF(vfmul_vf_h, 2)
3304 GEN_VEXT_VF(vfmul_vf_w, 4)
3305 GEN_VEXT_VF(vfmul_vf_d, 8)
3306 
3307 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3308 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3309 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3310 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3311 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3312 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3313 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3314 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3315 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3316 GEN_VEXT_VF(vfdiv_vf_h, 2)
3317 GEN_VEXT_VF(vfdiv_vf_w, 4)
3318 GEN_VEXT_VF(vfdiv_vf_d, 8)
3319 
3320 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3321 {
3322     return float16_div(b, a, s);
3323 }
3324 
3325 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3326 {
3327     return float32_div(b, a, s);
3328 }
3329 
3330 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3331 {
3332     return float64_div(b, a, s);
3333 }
3334 
3335 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3336 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3337 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3338 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3339 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3340 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3341 
3342 /* Vector Widening Floating-Point Multiply */
3343 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3344 {
3345     return float32_mul(float16_to_float32(a, true, s),
3346                        float16_to_float32(b, true, s), s);
3347 }
3348 
3349 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3350 {
3351     return float64_mul(float32_to_float64(a, s),
3352                        float32_to_float64(b, s), s);
3353 
3354 }
3355 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3356 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3357 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3358 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3359 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3360 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3361 GEN_VEXT_VF(vfwmul_vf_h, 4)
3362 GEN_VEXT_VF(vfwmul_vf_w, 8)
3363 
3364 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3365 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3366 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3367                       CPURISCVState *env)                          \
3368 {                                                                  \
3369     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3370     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3371     TD d = *((TD *)vd + HD(i));                                    \
3372     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3373 }
3374 
3375 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3376 {
3377     return float16_muladd(a, b, d, 0, s);
3378 }
3379 
3380 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3381 {
3382     return float32_muladd(a, b, d, 0, s);
3383 }
3384 
3385 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3386 {
3387     return float64_muladd(a, b, d, 0, s);
3388 }
3389 
3390 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3391 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3392 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3393 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3394 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3395 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3396 
3397 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3398 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3399                       CPURISCVState *env)                         \
3400 {                                                                 \
3401     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3402     TD d = *((TD *)vd + HD(i));                                   \
3403     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3404 }
3405 
3406 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3407 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3408 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3409 GEN_VEXT_VF(vfmacc_vf_h, 2)
3410 GEN_VEXT_VF(vfmacc_vf_w, 4)
3411 GEN_VEXT_VF(vfmacc_vf_d, 8)
3412 
3413 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3414 {
3415     return float16_muladd(a, b, d, float_muladd_negate_c |
3416                                    float_muladd_negate_product, s);
3417 }
3418 
3419 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3420 {
3421     return float32_muladd(a, b, d, float_muladd_negate_c |
3422                                    float_muladd_negate_product, s);
3423 }
3424 
3425 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3426 {
3427     return float64_muladd(a, b, d, float_muladd_negate_c |
3428                                    float_muladd_negate_product, s);
3429 }
3430 
3431 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3432 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3433 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3434 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3435 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3436 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3437 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3438 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3439 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3440 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3441 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3442 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3443 
3444 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3445 {
3446     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3447 }
3448 
3449 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3450 {
3451     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3452 }
3453 
3454 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3455 {
3456     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3457 }
3458 
3459 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3460 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3461 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3462 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3463 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3464 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3465 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3466 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3467 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3468 GEN_VEXT_VF(vfmsac_vf_h, 2)
3469 GEN_VEXT_VF(vfmsac_vf_w, 4)
3470 GEN_VEXT_VF(vfmsac_vf_d, 8)
3471 
3472 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3473 {
3474     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3475 }
3476 
3477 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3478 {
3479     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3480 }
3481 
3482 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3483 {
3484     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3485 }
3486 
3487 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3488 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3489 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3490 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3491 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3492 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3493 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3494 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3495 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3496 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3497 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3498 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3499 
3500 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3501 {
3502     return float16_muladd(d, b, a, 0, s);
3503 }
3504 
3505 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3506 {
3507     return float32_muladd(d, b, a, 0, s);
3508 }
3509 
3510 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3511 {
3512     return float64_muladd(d, b, a, 0, s);
3513 }
3514 
3515 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3516 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3517 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3518 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3519 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3520 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3521 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3522 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3523 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3524 GEN_VEXT_VF(vfmadd_vf_h, 2)
3525 GEN_VEXT_VF(vfmadd_vf_w, 4)
3526 GEN_VEXT_VF(vfmadd_vf_d, 8)
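
/*
 * Note (added for clarity, not in the original file): the *macc helpers
 * above accumulate into the destination, vd[i] = vs1[i] * vs2[i] + vd[i],
 * while the *madd helpers multiply the destination,
 * vd[i] = vd[i] * vs1[i] + vs2[i].  The only difference is the argument
 * order passed to float16/32/64_muladd(): (a, b, d, ...) versus
 * (d, b, a, ...).
 */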
3527 
3528 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3529 {
3530     return float16_muladd(d, b, a, float_muladd_negate_c |
3531                                    float_muladd_negate_product, s);
3532 }
3533 
3534 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3535 {
3536     return float32_muladd(d, b, a, float_muladd_negate_c |
3537                                    float_muladd_negate_product, s);
3538 }
3539 
3540 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3541 {
3542     return float64_muladd(d, b, a, float_muladd_negate_c |
3543                                    float_muladd_negate_product, s);
3544 }
3545 
3546 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3547 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3548 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3549 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3550 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3551 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3552 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3553 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3554 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3555 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3556 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3557 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3558 
3559 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3560 {
3561     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3562 }
3563 
3564 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3565 {
3566     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3567 }
3568 
3569 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3570 {
3571     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3572 }
3573 
3574 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3575 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3576 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3577 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3578 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3579 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3580 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3581 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3582 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3583 GEN_VEXT_VF(vfmsub_vf_h, 2)
3584 GEN_VEXT_VF(vfmsub_vf_w, 4)
3585 GEN_VEXT_VF(vfmsub_vf_d, 8)
3586 
3587 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3588 {
3589     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3590 }
3591 
3592 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3593 {
3594     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3595 }
3596 
3597 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3598 {
3599     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3600 }
3601 
3602 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3603 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3604 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3605 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3606 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3607 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3608 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3609 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3610 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3611 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3612 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3613 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3614 
3615 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3616 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3617 {
3618     return float32_muladd(float16_to_float32(a, true, s),
3619                           float16_to_float32(b, true, s), d, 0, s);
3620 }
3621 
3622 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3623 {
3624     return float64_muladd(float32_to_float64(a, s),
3625                           float32_to_float64(b, s), d, 0, s);
3626 }
3627 
3628 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3629 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3630 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3631 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3632 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3633 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3634 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3635 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3636 
3637 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3638 {
3639     return float32_muladd(bfloat16_to_float32(a, s),
3640                           bfloat16_to_float32(b, s), d, 0, s);
3641 }
3642 
3643 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3644 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3645 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3646 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3647 
3648 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3649 {
3650     return float32_muladd(float16_to_float32(a, true, s),
3651                           float16_to_float32(b, true, s), d,
3652                           float_muladd_negate_c | float_muladd_negate_product,
3653                           s);
3654 }
3655 
3656 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3657 {
3658     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3659                           d, float_muladd_negate_c |
3660                              float_muladd_negate_product, s);
3661 }
3662 
3663 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3664 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3665 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3666 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3667 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3668 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3669 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3670 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3671 
3672 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3673 {
3674     return float32_muladd(float16_to_float32(a, true, s),
3675                           float16_to_float32(b, true, s), d,
3676                           float_muladd_negate_c, s);
3677 }
3678 
3679 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3680 {
3681     return float64_muladd(float32_to_float64(a, s),
3682                           float32_to_float64(b, s), d,
3683                           float_muladd_negate_c, s);
3684 }
3685 
3686 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3687 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3688 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3689 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3690 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3691 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3692 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3693 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3694 
3695 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3696 {
3697     return float32_muladd(float16_to_float32(a, true, s),
3698                           float16_to_float32(b, true, s), d,
3699                           float_muladd_negate_product, s);
3700 }
3701 
3702 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3703 {
3704     return float64_muladd(float32_to_float64(a, s),
3705                           float32_to_float64(b, s), d,
3706                           float_muladd_negate_product, s);
3707 }
3708 
3709 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3710 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3711 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3712 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3713 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3714 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3715 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3716 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3717 
3718 /* Vector Floating-Point Square-Root Instruction */
3719 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3720 static void do_##NAME(void *vd, void *vs2, int i,      \
3721                       CPURISCVState *env)              \
3722 {                                                      \
3723     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3724     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3725 }
3726 
3727 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3728 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3729                   CPURISCVState *env, uint32_t desc)   \
3730 {                                                      \
3731     uint32_t vm = vext_vm(desc);                       \
3732     uint32_t vl = env->vl;                             \
3733     uint32_t total_elems =                             \
3734         vext_get_total_elems(env, desc, ESZ);          \
3735     uint32_t vta = vext_vta(desc);                     \
3736     uint32_t vma = vext_vma(desc);                     \
3737     uint32_t i;                                        \
3738                                                        \
3739     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3740                                                        \
3741     if (vl == 0) {                                     \
3742         return;                                        \
3743     }                                                  \
3744     for (i = env->vstart; i < vl; i++) {               \
3745         if (!vm && !vext_elem_mask(v0, i)) {           \
3746             /* set masked-off elements to 1s */        \
3747             vext_set_elems_1s(vd, vma, i * ESZ,        \
3748                               (i + 1) * ESZ);          \
3749             continue;                                  \
3750         }                                              \
3751         do_##NAME(vd, vs2, i, env);                    \
3752     }                                                  \
3753     env->vstart = 0;                                   \
3754     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3755                       total_elems * ESZ);              \
3756 }
3757 
3758 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3759 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3760 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3761 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3762 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3763 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3764 
3765 /*
3766  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3767  *
3768  * Adapted from riscv-v-spec recip.c:
3769  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3770  */
3771 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3772 {
3773     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3774     uint64_t exp = extract64(f, frac_size, exp_size);
3775     uint64_t frac = extract64(f, 0, frac_size);
3776 
3777     const uint8_t lookup_table[] = {
3778         52, 51, 50, 48, 47, 46, 44, 43,
3779         42, 41, 40, 39, 38, 36, 35, 34,
3780         33, 32, 31, 30, 30, 29, 28, 27,
3781         26, 25, 24, 23, 23, 22, 21, 20,
3782         19, 19, 18, 17, 16, 16, 15, 14,
3783         14, 13, 12, 12, 11, 10, 10, 9,
3784         9, 8, 7, 7, 6, 6, 5, 4,
3785         4, 3, 3, 2, 2, 1, 1, 0,
3786         127, 125, 123, 121, 119, 118, 116, 114,
3787         113, 111, 109, 108, 106, 105, 103, 102,
3788         100, 99, 97, 96, 95, 93, 92, 91,
3789         90, 88, 87, 86, 85, 84, 83, 82,
3790         80, 79, 78, 77, 76, 75, 74, 73,
3791         72, 71, 70, 70, 69, 68, 67, 66,
3792         65, 64, 63, 63, 62, 61, 60, 59,
3793         59, 58, 57, 56, 56, 55, 54, 53
3794     };
3795     const int precision = 7;
3796 
3797     if (exp == 0 && frac != 0) { /* subnormal */
3798         /* Normalize the subnormal. */
3799         while (extract64(frac, frac_size - 1, 1) == 0) {
3800             exp--;
3801             frac <<= 1;
3802         }
3803 
3804         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3805     }
3806 
3807     int idx = ((exp & 1) << (precision - 1)) |
3808               (frac >> (frac_size - precision + 1));
3809     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3810                         (frac_size - precision);
3811     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3812 
3813     uint64_t val = 0;
3814     val = deposit64(val, 0, frac_size, out_frac);
3815     val = deposit64(val, frac_size, exp_size, out_exp);
3816     val = deposit64(val, frac_size + exp_size, 1, sign);
3817     return val;
3818 }
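
/*
 * Illustrative sketch (not part of the original file): frsqrt7() applied
 * to binary16 1.0 (0x3c00: exp = 15, frac = 0).  demo_frsqrt7_one_h is a
 * hypothetical name used only for this example.
 *
 *   idx      = ((exp & 1) << 6) | (frac >> 4)  = 64
 *   out_frac = lookup_table[64] << 3           = 127 << 3
 *   out_exp  = (3 * 15 - 1 - exp) / 2          = 14  (low 5 bits kept)
 *   result   = 0x3bf8 ~= 0.99609, a 7-bit estimate of 1/sqrt(1.0)
 */
static inline uint64_t demo_frsqrt7_one_h(void)
{
    return frsqrt7(0x3c00, 5, 10);
}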
3819 
3820 static float16 frsqrt7_h(float16 f, float_status *s)
3821 {
3822     int exp_size = 5, frac_size = 10;
3823     bool sign = float16_is_neg(f);
3824 
3825     /*
3826      * frsqrt7(sNaN) = canonical NaN
3827      * frsqrt7(-inf) = canonical NaN
3828      * frsqrt7(-normal) = canonical NaN
3829      * frsqrt7(-subnormal) = canonical NaN
3830      */
3831     if (float16_is_signaling_nan(f, s) ||
3832         (float16_is_infinity(f) && sign) ||
3833         (float16_is_normal(f) && sign) ||
3834         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3835         s->float_exception_flags |= float_flag_invalid;
3836         return float16_default_nan(s);
3837     }
3838 
3839     /* frsqrt7(qNaN) = canonical NaN */
3840     if (float16_is_quiet_nan(f, s)) {
3841         return float16_default_nan(s);
3842     }
3843 
3844     /* frsqrt7(+-0) = +-inf */
3845     if (float16_is_zero(f)) {
3846         s->float_exception_flags |= float_flag_divbyzero;
3847         return float16_set_sign(float16_infinity, sign);
3848     }
3849 
3850     /* frsqrt7(+inf) = +0 */
3851     if (float16_is_infinity(f) && !sign) {
3852         return float16_set_sign(float16_zero, sign);
3853     }
3854 
3855     /* +normal, +subnormal */
3856     uint64_t val = frsqrt7(f, exp_size, frac_size);
3857     return make_float16(val);
3858 }
3859 
3860 static float32 frsqrt7_s(float32 f, float_status *s)
3861 {
3862     int exp_size = 8, frac_size = 23;
3863     bool sign = float32_is_neg(f);
3864 
3865     /*
3866      * frsqrt7(sNaN) = canonical NaN
3867      * frsqrt7(-inf) = canonical NaN
3868      * frsqrt7(-normal) = canonical NaN
3869      * frsqrt7(-subnormal) = canonical NaN
3870      */
3871     if (float32_is_signaling_nan(f, s) ||
3872         (float32_is_infinity(f) && sign) ||
3873         (float32_is_normal(f) && sign) ||
3874         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3875         s->float_exception_flags |= float_flag_invalid;
3876         return float32_default_nan(s);
3877     }
3878 
3879     /* frsqrt7(qNaN) = canonical NaN */
3880     if (float32_is_quiet_nan(f, s)) {
3881         return float32_default_nan(s);
3882     }
3883 
3884     /* frsqrt7(+-0) = +-inf */
3885     if (float32_is_zero(f)) {
3886         s->float_exception_flags |= float_flag_divbyzero;
3887         return float32_set_sign(float32_infinity, sign);
3888     }
3889 
3890     /* frsqrt7(+inf) = +0 */
3891     if (float32_is_infinity(f) && !sign) {
3892         return float32_set_sign(float32_zero, sign);
3893     }
3894 
3895     /* +normal, +subnormal */
3896     uint64_t val = frsqrt7(f, exp_size, frac_size);
3897     return make_float32(val);
3898 }
3899 
3900 static float64 frsqrt7_d(float64 f, float_status *s)
3901 {
3902     int exp_size = 11, frac_size = 52;
3903     bool sign = float64_is_neg(f);
3904 
3905     /*
3906      * frsqrt7(sNaN) = canonical NaN
3907      * frsqrt7(-inf) = canonical NaN
3908      * frsqrt7(-normal) = canonical NaN
3909      * frsqrt7(-subnormal) = canonical NaN
3910      */
3911     if (float64_is_signaling_nan(f, s) ||
3912         (float64_is_infinity(f) && sign) ||
3913         (float64_is_normal(f) && sign) ||
3914         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3915         s->float_exception_flags |= float_flag_invalid;
3916         return float64_default_nan(s);
3917     }
3918 
3919     /* frsqrt7(qNaN) = canonical NaN */
3920     if (float64_is_quiet_nan(f, s)) {
3921         return float64_default_nan(s);
3922     }
3923 
3924     /* frsqrt7(+-0) = +-inf */
3925     if (float64_is_zero(f)) {
3926         s->float_exception_flags |= float_flag_divbyzero;
3927         return float64_set_sign(float64_infinity, sign);
3928     }
3929 
3930     /* frsqrt7(+inf) = +0 */
3931     if (float64_is_infinity(f) && !sign) {
3932         return float64_set_sign(float64_zero, sign);
3933     }
3934 
3935     /* +normal, +subnormal */
3936     uint64_t val = frsqrt7(f, exp_size, frac_size);
3937     return make_float64(val);
3938 }
3939 
3940 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3941 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3942 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3943 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3944 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3945 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3946 
3947 /*
3948  * Vector Floating-Point Reciprocal Estimate Instruction
3949  *
3950  * Adapted from riscv-v-spec recip.c:
3951  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3952  */
3953 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3954                       float_status *s)
3955 {
3956     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3957     uint64_t exp = extract64(f, frac_size, exp_size);
3958     uint64_t frac = extract64(f, 0, frac_size);
3959 
3960     const uint8_t lookup_table[] = {
3961         127, 125, 123, 121, 119, 117, 116, 114,
3962         112, 110, 109, 107, 105, 104, 102, 100,
3963         99, 97, 96, 94, 93, 91, 90, 88,
3964         87, 85, 84, 83, 81, 80, 79, 77,
3965         76, 75, 74, 72, 71, 70, 69, 68,
3966         66, 65, 64, 63, 62, 61, 60, 59,
3967         58, 57, 56, 55, 54, 53, 52, 51,
3968         50, 49, 48, 47, 46, 45, 44, 43,
3969         42, 41, 40, 40, 39, 38, 37, 36,
3970         35, 35, 34, 33, 32, 31, 31, 30,
3971         29, 28, 28, 27, 26, 25, 25, 24,
3972         23, 23, 22, 21, 21, 20, 19, 19,
3973         18, 17, 17, 16, 15, 15, 14, 14,
3974         13, 12, 12, 11, 11, 10, 9, 9,
3975         8, 8, 7, 7, 6, 5, 5, 4,
3976         4, 3, 3, 2, 2, 1, 1, 0
3977     };
3978     const int precision = 7;
3979 
3980     if (exp == 0 && frac != 0) { /* subnormal */
3981         /* Normalize the subnormal. */
3982         while (extract64(frac, frac_size - 1, 1) == 0) {
3983             exp--;
3984             frac <<= 1;
3985         }
3986 
3987         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3988 
3989         if (exp != 0 && exp != UINT64_MAX) {
3990             /*
3991              * Overflow to inf or max value of same sign,
3992              * depending on sign and rounding mode.
3993              */
3994             s->float_exception_flags |= (float_flag_inexact |
3995                                          float_flag_overflow);
3996 
3997             if ((s->float_rounding_mode == float_round_to_zero) ||
3998                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3999                 ((s->float_rounding_mode == float_round_up) && sign)) {
4000                 /* Return the greatest-magnitude finite value of the same sign. */
4001                 return (sign << (exp_size + frac_size)) |
4002                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4003             } else {
4004                 /* Return +-inf. */
4005                 return (sign << (exp_size + frac_size)) |
4006                        MAKE_64BIT_MASK(frac_size, exp_size);
4007             }
4008         }
4009     }
4010 
4011     int idx = frac >> (frac_size - precision);
4012     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4013                         (frac_size - precision);
4014     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4015 
4016     if (out_exp == 0 || out_exp == UINT64_MAX) {
4017         /*
4018          * The result is subnormal, but don't raise the underflow exception,
4019          * because there's no additional loss of precision.
4020          */
4021         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4022         if (out_exp == UINT64_MAX) {
4023             out_frac >>= 1;
4024             out_exp = 0;
4025         }
4026     }
4027 
4028     uint64_t val = 0;
4029     val = deposit64(val, 0, frac_size, out_frac);
4030     val = deposit64(val, frac_size, exp_size, out_exp);
4031     val = deposit64(val, frac_size + exp_size, 1, sign);
4032     return val;
4033 }
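
/*
 * Worked example (illustration only): for a float32 input of 2.0
 * (0x40000000), with exp_size = 8 and frac_size = 23:
 *
 *   sign = 0, exp = 128, frac = 0
 *   idx      = 0 >> (23 - 7)                 = 0
 *   out_frac = lookup_table[0] << (23 - 7)   = 127 << 16 = 0x7f0000
 *   out_exp  = 2 * 0x7f + ~128 (64-bit wrap) = 254 - 129 = 125
 *
 * which assembles to 0x3eff0000 = 2^-2 * (1 + 127/128) = 0.498046875,
 * a 7-bit estimate of 1/2.0.
 */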
4034 
4035 static float16 frec7_h(float16 f, float_status *s)
4036 {
4037     int exp_size = 5, frac_size = 10;
4038     bool sign = float16_is_neg(f);
4039 
4040     /* frec7(+-inf) = +-0 */
4041     if (float16_is_infinity(f)) {
4042         return float16_set_sign(float16_zero, sign);
4043     }
4044 
4045     /* frec7(+-0) = +-inf */
4046     if (float16_is_zero(f)) {
4047         s->float_exception_flags |= float_flag_divbyzero;
4048         return float16_set_sign(float16_infinity, sign);
4049     }
4050 
4051     /* frec7(sNaN) = canonical NaN */
4052     if (float16_is_signaling_nan(f, s)) {
4053         s->float_exception_flags |= float_flag_invalid;
4054         return float16_default_nan(s);
4055     }
4056 
4057     /* frec7(qNaN) = canonical NaN */
4058     if (float16_is_quiet_nan(f, s)) {
4059         return float16_default_nan(s);
4060     }
4061 
4062     /* +-normal, +-subnormal */
4063     uint64_t val = frec7(f, exp_size, frac_size, s);
4064     return make_float16(val);
4065 }
4066 
4067 static float32 frec7_s(float32 f, float_status *s)
4068 {
4069     int exp_size = 8, frac_size = 23;
4070     bool sign = float32_is_neg(f);
4071 
4072     /* frec7(+-inf) = +-0 */
4073     if (float32_is_infinity(f)) {
4074         return float32_set_sign(float32_zero, sign);
4075     }
4076 
4077     /* frec7(+-0) = +-inf */
4078     if (float32_is_zero(f)) {
4079         s->float_exception_flags |= float_flag_divbyzero;
4080         return float32_set_sign(float32_infinity, sign);
4081     }
4082 
4083     /* frec7(sNaN) = canonical NaN */
4084     if (float32_is_signaling_nan(f, s)) {
4085         s->float_exception_flags |= float_flag_invalid;
4086         return float32_default_nan(s);
4087     }
4088 
4089     /* frec7(qNaN) = canonical NaN */
4090     if (float32_is_quiet_nan(f, s)) {
4091         return float32_default_nan(s);
4092     }
4093 
4094     /* +-normal, +-subnormal */
4095     uint64_t val = frec7(f, exp_size, frac_size, s);
4096     return make_float32(val);
4097 }
4098 
4099 static float64 frec7_d(float64 f, float_status *s)
4100 {
4101     int exp_size = 11, frac_size = 52;
4102     bool sign = float64_is_neg(f);
4103 
4104     /* frec7(+-inf) = +-0 */
4105     if (float64_is_infinity(f)) {
4106         return float64_set_sign(float64_zero, sign);
4107     }
4108 
4109     /* frec7(+-0) = +-inf */
4110     if (float64_is_zero(f)) {
4111         s->float_exception_flags |= float_flag_divbyzero;
4112         return float64_set_sign(float64_infinity, sign);
4113     }
4114 
4115     /* frec7(sNaN) = canonical NaN */
4116     if (float64_is_signaling_nan(f, s)) {
4117         s->float_exception_flags |= float_flag_invalid;
4118         return float64_default_nan(s);
4119     }
4120 
4121     /* frec7(qNaN) = canonical NaN */
4122     if (float64_is_quiet_nan(f, s)) {
4123         return float64_default_nan(s);
4124     }
4125 
4126     /* +-normal, +-subnormal */
4127     uint64_t val = frec7(f, exp_size, frac_size, s);
4128     return make_float64(val);
4129 }
4130 
4131 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4132 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4133 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4134 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4135 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4136 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4137 
4138 /* Vector Floating-Point MIN/MAX Instructions */
4139 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4140 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4141 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4142 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4143 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4144 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4145 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4146 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4147 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4148 GEN_VEXT_VF(vfmin_vf_h, 2)
4149 GEN_VEXT_VF(vfmin_vf_w, 4)
4150 GEN_VEXT_VF(vfmin_vf_d, 8)
4151 
4152 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4153 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4154 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4155 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4156 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4157 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4158 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4159 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4160 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4161 GEN_VEXT_VF(vfmax_vf_h, 2)
4162 GEN_VEXT_VF(vfmax_vf_w, 4)
4163 GEN_VEXT_VF(vfmax_vf_d, 8)
4164 
4165 /* Vector Floating-Point Sign-Injection Instructions */
4166 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4167 {
4168     return deposit64(b, 0, 15, a);
4169 }
4170 
4171 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4172 {
4173     return deposit64(b, 0, 31, a);
4174 }
4175 
4176 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4177 {
4178     return deposit64(b, 0, 63, a);
4179 }
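
/*
 * Illustrative example: fsgnj keeps the magnitude of 'a' and takes the
 * sign from 'b'; for float32,
 *
 *   fsgnj32(0x3f800000 (+1.0), 0x80000000 (-0.0)) == 0xbf800000 (-1.0)
 *
 * since deposit64(b, 0, 31, a) copies bits [30:0] from a and keeps
 * bit 31 (the sign) of b.
 */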
4180 
4181 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4182 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4183 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4184 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4185 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4186 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4187 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4188 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4189 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4190 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4191 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4192 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4193 
4194 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4195 {
4196     return deposit64(~b, 0, 15, a);
4197 }
4198 
4199 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4200 {
4201     return deposit64(~b, 0, 31, a);
4202 }
4203 
4204 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4205 {
4206     return deposit64(~b, 0, 63, a);
4207 }
4208 
4209 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4210 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4211 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4212 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4213 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4214 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4215 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4216 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4217 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4218 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4219 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4220 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4221 
4222 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4223 {
4224     return deposit64(b ^ a, 0, 15, a);
4225 }
4226 
4227 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4228 {
4229     return deposit64(b ^ a, 0, 31, a);
4230 }
4231 
4232 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4233 {
4234     return deposit64(b ^ a, 0, 63, a);
4235 }
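
/*
 * Illustrative example: fsgnjx XORs the two sign bits; for float32,
 *
 *   fsgnjx32(0xbf800000 (-1.0), 0xc0000000 (-2.0)) == 0x3f800000 (+1.0)
 *
 * since deposit64(b ^ a, 0, 31, a) keeps a's magnitude and uses
 * sign(a) ^ sign(b) as the result sign.
 */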
4236 
4237 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4238 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4239 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4240 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4241 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4242 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4243 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4244 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4245 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4246 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4247 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4248 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4249 
4250 /* Vector Floating-Point Compare Instructions */
4251 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4252 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4253                   CPURISCVState *env, uint32_t desc)          \
4254 {                                                             \
4255     uint32_t vm = vext_vm(desc);                              \
4256     uint32_t vl = env->vl;                                    \
4257     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4258     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4259     uint32_t vma = vext_vma(desc);                            \
4260     uint32_t i;                                               \
4261                                                               \
4262     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4263                                                               \
4264     for (i = env->vstart; i < vl; i++) {                      \
4265         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4266         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4267         if (!vm && !vext_elem_mask(v0, i)) {                  \
4268             /* set masked-off elements to 1s */               \
4269             if (vma) {                                        \
4270                 vext_set_elem_mask(vd, i, 1);                 \
4271             }                                                 \
4272             continue;                                         \
4273         }                                                     \
4274         vext_set_elem_mask(vd, i,                             \
4275                            DO_OP(s2, s1, &env->fp_status));   \
4276     }                                                         \
4277     env->vstart = 0;                                          \
4278     /*
4279      * mask destination register is always tail-agnostic
4280      * set tail elements to 1s
4281      */                                                       \
4282     if (vta_all_1s) {                                         \
4283         for (; i < total_elems; i++) {                        \
4284             vext_set_elem_mask(vd, i, 1);                     \
4285         }                                                     \
4286     }                                                         \
4287 }
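
/*
 * A rough sketch of the generated comparison helpers (assuming the
 * usual HELPER() name mangling): the element test is DO_OP(s2, s1),
 * i.e. the vs2 operand is on the left, so for example vmflt_vv_h sets
 * a mask bit when vs2[i] < vs1[i], matching the assembly form
 * vmflt.vv vd, vs2, vs1.
 *
 *   void helper_vmfeq_vv_h(void *vd, void *v0, void *vs1, void *vs2,
 *                          CPURISCVState *env, uint32_t desc)
 *   {
 *       ...
 *       vext_set_elem_mask(vd, i,
 *                          float16_eq_quiet(s2, s1, &env->fp_status));
 *       ...
 *   }
 */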
4288 
4289 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4290 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4291 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4292 
4293 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4294 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4295                   CPURISCVState *env, uint32_t desc)                \
4296 {                                                                   \
4297     uint32_t vm = vext_vm(desc);                                    \
4298     uint32_t vl = env->vl;                                          \
4299     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4300     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4301     uint32_t vma = vext_vma(desc);                                  \
4302     uint32_t i;                                                     \
4303                                                                     \
4304     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4305                                                                     \
4306     for (i = env->vstart; i < vl; i++) {                            \
4307         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4308         if (!vm && !vext_elem_mask(v0, i)) {                        \
4309             /* set masked-off elements to 1s */                     \
4310             if (vma) {                                              \
4311                 vext_set_elem_mask(vd, i, 1);                       \
4312             }                                                       \
4313             continue;                                               \
4314         }                                                           \
4315         vext_set_elem_mask(vd, i,                                   \
4316                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4317     }                                                               \
4318     env->vstart = 0;                                                \
4319     /*
4320      * mask destination register is always tail-agnostic
4321      * set tail elements to 1s
4322      */                                                             \
4323     if (vta_all_1s) {                                               \
4324         for (; i < total_elems; i++) {                              \
4325             vext_set_elem_mask(vd, i, 1);                           \
4326         }                                                           \
4327     }                                                               \
4328 }
4329 
4330 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4331 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4332 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4333 
4334 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4335 {
4336     FloatRelation compare = float16_compare_quiet(a, b, s);
4337     return compare != float_relation_equal;
4338 }
4339 
4340 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4341 {
4342     FloatRelation compare = float32_compare_quiet(a, b, s);
4343     return compare != float_relation_equal;
4344 }
4345 
4346 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4347 {
4348     FloatRelation compare = float64_compare_quiet(a, b, s);
4349     return compare != float_relation_equal;
4350 }
4351 
4352 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4353 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4354 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4355 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4356 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4357 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
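
/*
 * Note: vmfne tests for "anything other than equal" on the quiet
 * compare, so an unordered result (either operand NaN) also produces 1,
 * and a quiet NaN does so without raising the invalid flag.
 */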
4358 
4359 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4360 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4361 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4362 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4363 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4364 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4365 
4366 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4367 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4368 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4369 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4370 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4371 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4372 
4373 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4374 {
4375     FloatRelation compare = float16_compare(a, b, s);
4376     return compare == float_relation_greater;
4377 }
4378 
4379 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4380 {
4381     FloatRelation compare = float32_compare(a, b, s);
4382     return compare == float_relation_greater;
4383 }
4384 
4385 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4386 {
4387     FloatRelation compare = float64_compare(a, b, s);
4388     return compare == float_relation_greater;
4389 }
4390 
4391 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4392 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4393 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4394 
4395 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4396 {
4397     FloatRelation compare = float16_compare(a, b, s);
4398     return compare == float_relation_greater ||
4399            compare == float_relation_equal;
4400 }
4401 
4402 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4403 {
4404     FloatRelation compare = float32_compare(a, b, s);
4405     return compare == float_relation_greater ||
4406            compare == float_relation_equal;
4407 }
4408 
4409 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4410 {
4411     FloatRelation compare = float64_compare(a, b, s);
4412     return compare == float_relation_greater ||
4413            compare == float_relation_equal;
4414 }
4415 
4416 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4417 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4418 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
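
/*
 * Note: vmfgt/vmfge use the signalling float*_compare(), so a NaN
 * operand raises the invalid flag, as IEEE requires for ordered
 * comparisons (vmflt/vmfle above rely on float*_lt/le, which are also
 * signalling, while vmfeq/vmfne use the quiet variants).  Only .vf
 * forms are generated here; the spec provides no vmfgt.vv/vmfge.vv
 * encodings, so vector-vector greater-than is expressed with
 * vmflt.vv/vmfle.vv and swapped source operands.
 */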
4419 
4420 /* Vector Floating-Point Classify Instruction */
4421 target_ulong fclass_h(uint64_t frs1)
4422 {
4423     float16 f = frs1;
4424     bool sign = float16_is_neg(f);
4425 
4426     if (float16_is_infinity(f)) {
4427         return sign ? 1 << 0 : 1 << 7;
4428     } else if (float16_is_zero(f)) {
4429         return sign ? 1 << 3 : 1 << 4;
4430     } else if (float16_is_zero_or_denormal(f)) {
4431         return sign ? 1 << 2 : 1 << 5;
4432     } else if (float16_is_any_nan(f)) {
4433         float_status s = { }; /* for snan_bit_is_one */
4434         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4435     } else {
4436         return sign ? 1 << 1 : 1 << 6;
4437     }
4438 }
4439 
4440 target_ulong fclass_s(uint64_t frs1)
4441 {
4442     float32 f = frs1;
4443     bool sign = float32_is_neg(f);
4444 
4445     if (float32_is_infinity(f)) {
4446         return sign ? 1 << 0 : 1 << 7;
4447     } else if (float32_is_zero(f)) {
4448         return sign ? 1 << 3 : 1 << 4;
4449     } else if (float32_is_zero_or_denormal(f)) {
4450         return sign ? 1 << 2 : 1 << 5;
4451     } else if (float32_is_any_nan(f)) {
4452         float_status s = { }; /* for snan_bit_is_one */
4453         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4454     } else {
4455         return sign ? 1 << 1 : 1 << 6;
4456     }
4457 }
4458 
4459 target_ulong fclass_d(uint64_t frs1)
4460 {
4461     float64 f = frs1;
4462     bool sign = float64_is_neg(f);
4463 
4464     if (float64_is_infinity(f)) {
4465         return sign ? 1 << 0 : 1 << 7;
4466     } else if (float64_is_zero(f)) {
4467         return sign ? 1 << 3 : 1 << 4;
4468     } else if (float64_is_zero_or_denormal(f)) {
4469         return sign ? 1 << 2 : 1 << 5;
4470     } else if (float64_is_any_nan(f)) {
4471         float_status s = { }; /* for snan_bit_is_one */
4472         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4473     } else {
4474         return sign ? 1 << 1 : 1 << 6;
4475     }
4476 }
4477 
4478 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4479 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4480 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4481 GEN_VEXT_V(vfclass_v_h, 2)
4482 GEN_VEXT_V(vfclass_v_w, 4)
4483 GEN_VEXT_V(vfclass_v_d, 8)
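
/*
 * Illustrative examples, using the standard RISC-V fclass bit layout
 * implemented above:
 *
 *   fclass_s(0xff800000)  (-inf)        -> 1 << 0  (0x001)
 *   fclass_s(0x00000001)  (+subnormal)  -> 1 << 5  (0x020)
 *   fclass_s(0x7f800000)  (+inf)        -> 1 << 7  (0x080)
 *   fclass_s(0x7fc00000)  (quiet NaN)   -> 1 << 9  (0x200)
 */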
4484 
4485 /* Vector Floating-Point Merge Instruction */
4486 
4487 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4488 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4489                   CPURISCVState *env, uint32_t desc)          \
4490 {                                                             \
4491     uint32_t vm = vext_vm(desc);                              \
4492     uint32_t vl = env->vl;                                    \
4493     uint32_t esz = sizeof(ETYPE);                             \
4494     uint32_t total_elems =                                    \
4495         vext_get_total_elems(env, desc, esz);                 \
4496     uint32_t vta = vext_vta(desc);                            \
4497     uint32_t i;                                               \
4498                                                               \
4499     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4500                                                               \
4501     for (i = env->vstart; i < vl; i++) {                      \
4502         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4503         *((ETYPE *)vd + H(i)) =                               \
4504             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4505     }                                                         \
4506     env->vstart = 0;                                          \
4507     /* set tail elements to 1s */                             \
4508     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4509 }
4510 
4511 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4512 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4513 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
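
/*
 * Illustrative example: for vfmerge.vfm with v0.mask = {1, 0, 1, ...}
 * the loop above selects
 *
 *   vd[0] = f[rs1];   vd[1] = vs2[1];   vd[2] = f[rs1];   ...
 *
 * i.e. a set mask bit picks the scalar and a clear bit keeps the
 * corresponding vs2 element.
 */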
4514 
4515 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4516 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4517 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4518 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4519 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4520 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4521 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4522 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4523 
4524 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4525 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4526 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4527 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4528 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4529 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4530 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4531 
4532 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4533 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4534 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4535 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4536 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4537 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4538 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4539 
4540 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4541 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4542 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4543 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4544 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4545 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4546 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4547 
4548 /* Widening Floating-Point/Integer Type-Convert Instructions */
4549 /* (TD, T2, TX2) */
4550 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4551 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4552 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4553 /*
4554  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4555  */
4556 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4557 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4558 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4559 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4560 
4561 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4562 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4563 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4564 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4565 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4566 
4567 /*
4568  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4569  */
4570 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4571 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4572 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4573 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4574 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4575 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4576 
4577 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4578 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4579 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4580 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4581 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4582 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4583 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4584 
4585 /*
4586  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4587  */
4588 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4589 {
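    /*
     * The boolean argument is softfloat's 'ieee' flag: the half-precision
     * source is interpreted in IEEE format rather than the ARM alternative
     * half-precision encoding.
     */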
4590     return float16_to_float32(a, true, s);
4591 }
4592 
4593 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4594 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4595 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4596 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4597 
4598 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4599 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4600 
4601 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4602 /* (TD, T2, TX2) */
4603 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4604 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4605 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4606 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4607 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4608 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4609 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4610 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4611 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4612 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4613 
4614 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4615 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4616 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4617 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4618 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4619 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4620 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4621 
4622 /*
4623  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4624  */
4625 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4626 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4627 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4628 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4629 
4630 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4631 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4632 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4633 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4634 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4635 
4636 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4637 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4638 {
4639     return float32_to_float16(a, true, s);
4640 }
4641 
4642 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4643 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4644 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4645 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4646 
4647 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4648 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4649 
4650 /*
4651  * Vector Reduction Operations
4652  */
4653 /* Vector Single-Width Integer Reduction Instructions */
4654 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4655 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4656                   void *vs2, CPURISCVState *env,          \
4657                   uint32_t desc)                          \
4658 {                                                         \
4659     uint32_t vm = vext_vm(desc);                          \
4660     uint32_t vl = env->vl;                                \
4661     uint32_t esz = sizeof(TD);                            \
4662     uint32_t vlenb = simd_maxsz(desc);                    \
4663     uint32_t vta = vext_vta(desc);                        \
4664     uint32_t i;                                           \
4665     TD s1 =  *((TD *)vs1 + HD(0));                        \
4666                                                           \
4667     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4668                                                           \
4669     for (i = env->vstart; i < vl; i++) {                  \
4670         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4671         if (!vm && !vext_elem_mask(v0, i)) {              \
4672             continue;                                     \
4673         }                                                 \
4674         s1 = OP(s1, (TD)s2);                              \
4675     }                                                     \
4676     if (vl > 0) {                                         \
4677         *((TD *)vd + HD(0)) = s1;                         \
4678     }                                                     \
4679     env->vstart = 0;                                      \
4680     /* set tail elements to 1s */                         \
4681     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4682 }
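
/*
 * Illustrative example: for vredsum.vs with vl = 3, vs1[0] = 10 and
 * active vs2 elements {1, 2, 3}, the loop above computes
 *
 *   s1 = ((10 + 1) + 2) + 3 = 16
 *
 * and writes it to vd[0]; masked-off elements are skipped, and vd[0]
 * is left unchanged when vl == 0.
 */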
4683 
4684 /* vd[0] = sum(vs1[0], vs2[*]) */
4685 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4686 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4687 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4688 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4689 
4690 /* vd[0] = maxu(vs1[0], vs2[*]) */
4691 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4692 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4693 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4694 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4695 
4696 /* vd[0] = max(vs1[0], vs2[*]) */
4697 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4698 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4699 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4700 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4701 
4702 /* vd[0] = minu(vs1[0], vs2[*]) */
4703 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4704 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4705 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4706 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4707 
4708 /* vd[0] = min(vs1[0], vs2[*]) */
4709 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4710 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4711 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4712 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4713 
4714 /* vd[0] = and(vs1[0], vs2[*]) */
4715 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4716 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4717 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4718 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4719 
4720 /* vd[0] = or(vs1[0], vs2[*]) */
4721 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4722 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4723 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4724 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4725 
4726 /* vd[0] = xor(vs1[0], vs2[*]) */
4727 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4728 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4729 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4730 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4731 
4732 /* Vector Widening Integer Reduction Instructions */
4733 /* signed sum reduction into double-width accumulator */
4734 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4735 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4736 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4737 
4738 /* Unsigned sum reduction into double-width accumulator */
4739 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4740 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4741 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4742 
4743 /* Vector Single-Width Floating-Point Reduction Instructions */
4744 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4745 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4746                   void *vs2, CPURISCVState *env,           \
4747                   uint32_t desc)                           \
4748 {                                                          \
4749     uint32_t vm = vext_vm(desc);                           \
4750     uint32_t vl = env->vl;                                 \
4751     uint32_t esz = sizeof(TD);                             \
4752     uint32_t vlenb = simd_maxsz(desc);                     \
4753     uint32_t vta = vext_vta(desc);                         \
4754     uint32_t i;                                            \
4755     TD s1 =  *((TD *)vs1 + HD(0));                         \
4756                                                            \
4757     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4758                                                            \
4759     for (i = env->vstart; i < vl; i++) {                   \
4760         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4761         if (!vm && !vext_elem_mask(v0, i)) {               \
4762             continue;                                      \
4763         }                                                  \
4764         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4765     }                                                      \
4766     if (vl > 0) {                                          \
4767         *((TD *)vd + HD(0)) = s1;                          \
4768     }                                                      \
4769     env->vstart = 0;                                       \
4770     /* set tail elements to 1s */                          \
4771     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4772 }
4773 
4774 /* Unordered sum */
4775 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4776 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4777 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4778 
4779 /* Ordered sum */
4780 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4781 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4782 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
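
/*
 * Note: the unordered (vfredusum) and ordered (vfredosum) sums share
 * GEN_VEXT_FRED here, so both accumulate strictly in element order;
 * sequential accumulation is one legal implementation of the unordered
 * reduction, whose evaluation order is unspecified.
 */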
4783 
4784 /* Maximum value */
4785 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4786               float16_maximum_number)
4787 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4788               float32_maximum_number)
4789 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4790               float64_maximum_number)
4791 
4792 /* Minimum value */
4793 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4794               float16_minimum_number)
4795 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4796               float32_minimum_number)
4797 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4798               float64_minimum_number)
4799 
4800 /* Vector Widening Floating-Point Add Instructions */
4801 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4802 {
4803     return float32_add(a, float16_to_float32(b, true, s), s);
4804 }
4805 
4806 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4807 {
4808     return float64_add(a, float32_to_float64(b, s), s);
4809 }
4810 
4811 /* Vector Widening Floating-Point Reduction Instructions */
4812 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4813 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4814 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4815 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4816 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4817 
4818 /*
4819  * Vector Mask Operations
4820  */
4821 /* Vector Mask-Register Logical Instructions */
4822 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4823 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4824                   void *vs2, CPURISCVState *env,          \
4825                   uint32_t desc)                          \
4826 {                                                         \
4827     uint32_t vl = env->vl;                                \
4828     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4829     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4830     uint32_t i;                                           \
4831     int a, b;                                             \
4832                                                           \
4833     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4834                                                           \
4835     for (i = env->vstart; i < vl; i++) {                  \
4836         a = vext_elem_mask(vs1, i);                       \
4837         b = vext_elem_mask(vs2, i);                       \
4838         vext_set_elem_mask(vd, i, OP(b, a));              \
4839     }                                                     \
4840     env->vstart = 0;                                      \
4841     /*
4842      * mask destination register is always tail-agnostic
4843      * set tail elements to 1s
4844      */                                                   \
4845     if (vta_all_1s) {                                     \
4846         for (; i < total_elems; i++) {                    \
4847             vext_set_elem_mask(vd, i, 1);                 \
4848         }                                                 \
4849     }                                                     \
4850 }
4851 
4852 #define DO_NAND(N, M)  (!(N & M))
4853 #define DO_ANDNOT(N, M)  (N & !M)
4854 #define DO_NOR(N, M)  (!(N | M))
4855 #define DO_ORNOT(N, M)  (N | !M)
4856 #define DO_XNOR(N, M)  (!(N ^ M))
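
/*
 * Note: a and b are single mask bits (0 or 1), so the logical '!' in
 * the macros above acts as a one-bit complement.  The helpers call
 * OP(b, a), i.e. vs2 OP vs1, so for example vmandn.mm computes
 * vd.mask[i] = vs2.mask[i] & !vs1.mask[i].
 */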
4857 
4858 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4859 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4860 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4861 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4862 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4863 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4864 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4865 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4866 
4867 /* Vector count population in mask vcpop */
4868 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4869                              uint32_t desc)
4870 {
4871     target_ulong cnt = 0;
4872     uint32_t vm = vext_vm(desc);
4873     uint32_t vl = env->vl;
4874     int i;
4875 
4876     for (i = env->vstart; i < vl; i++) {
4877         if (vm || vext_elem_mask(v0, i)) {
4878             if (vext_elem_mask(vs2, i)) {
4879                 cnt++;
4880             }
4881         }
4882     }
4883     env->vstart = 0;
4884     return cnt;
4885 }
4886 
4887 /* vfirst find-first-set mask bit */
4888 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4889                               uint32_t desc)
4890 {
4891     uint32_t vm = vext_vm(desc);
4892     uint32_t vl = env->vl;
4893     int i;
4894 
4895     for (i = env->vstart; i < vl; i++) {
4896         if (vm || vext_elem_mask(v0, i)) {
4897             if (vext_elem_mask(vs2, i)) {
4898                 return i;
4899             }
4900         }
4901     }
4902     env->vstart = 0;
4903     return -1LL;
4904 }
4905 
4906 enum set_mask_type {
4907     ONLY_FIRST = 1,
4908     INCLUDE_FIRST,
4909     BEFORE_FIRST,
4910 };
4911 
4912 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4913                    uint32_t desc, enum set_mask_type type)
4914 {
4915     uint32_t vm = vext_vm(desc);
4916     uint32_t vl = env->vl;
4917     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4918     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4919     uint32_t vma = vext_vma(desc);
4920     int i;
4921     bool first_mask_bit = false;
4922 
4923     VSTART_CHECK_EARLY_EXIT(env, vl);
4924 
4925     for (i = env->vstart; i < vl; i++) {
4926         if (!vm && !vext_elem_mask(v0, i)) {
4927             /* set masked-off elements to 1s */
4928             if (vma) {
4929                 vext_set_elem_mask(vd, i, 1);
4930             }
4931             continue;
4932         }
4933         /* write a zero to all following active elements */
4934         if (first_mask_bit) {
4935             vext_set_elem_mask(vd, i, 0);
4936             continue;
4937         }
4938         if (vext_elem_mask(vs2, i)) {
4939             first_mask_bit = true;
4940             if (type == BEFORE_FIRST) {
4941                 vext_set_elem_mask(vd, i, 0);
4942             } else {
4943                 vext_set_elem_mask(vd, i, 1);
4944             }
4945         } else {
4946             if (type == ONLY_FIRST) {
4947                 vext_set_elem_mask(vd, i, 0);
4948             } else {
4949                 vext_set_elem_mask(vd, i, 1);
4950             }
4951         }
4952     }
4953     env->vstart = 0;
4954     /*
4955      * mask destination register is always tail-agnostic
4956      * set tail elements to 1s
4957      */
4958     if (vta_all_1s) {
4959         for (; i < total_elems; i++) {
4960             vext_set_elem_mask(vd, i, 1);
4961         }
4962     }
4963 }
4964 
4965 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4966                      uint32_t desc)
4967 {
4968     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4969 }
4970 
4971 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4972                      uint32_t desc)
4973 {
4974     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4975 }
4976 
4977 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4978                      uint32_t desc)
4979 {
4980     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4981 }
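
/*
 * Worked example: with all elements active, vl = 8 and
 * vs2.mask = 0b00010100 (first set bit at index 2):
 *
 *   vmsbf.m (BEFORE_FIRST)  -> 0b00000011
 *   vmsif.m (INCLUDE_FIRST) -> 0b00000111
 *   vmsof.m (ONLY_FIRST)    -> 0b00000100
 */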
4982 
4983 /* Vector Iota Instruction */
4984 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4985 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4986                   uint32_t desc)                                          \
4987 {                                                                         \
4988     uint32_t vm = vext_vm(desc);                                          \
4989     uint32_t vl = env->vl;                                                \
4990     uint32_t esz = sizeof(ETYPE);                                         \
4991     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4992     uint32_t vta = vext_vta(desc);                                        \
4993     uint32_t vma = vext_vma(desc);                                        \
4994     uint32_t sum = 0;                                                     \
4995     int i;                                                                \
4996                                                                           \
4997     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
4998                                                                           \
4999     for (i = env->vstart; i < vl; i++) {                                  \
5000         if (!vm && !vext_elem_mask(v0, i)) {                              \
5001             /* set masked-off elements to 1s */                           \
5002             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5003             continue;                                                     \
5004         }                                                                 \
5005         *((ETYPE *)vd + H(i)) = sum;                                      \
5006         if (vext_elem_mask(vs2, i)) {                                     \
5007             sum++;                                                        \
5008         }                                                                 \
5009     }                                                                     \
5010     env->vstart = 0;                                                      \
5011     /* set tail elements to 1s */                                         \
5012     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5013 }
5014 
5015 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5016 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5017 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5018 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
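
/*
 * Worked example: with all elements active, vl = 5 and
 * vs2.mask = {1, 0, 1, 0, 1} (element order), viota.m writes the
 * exclusive prefix sum of the mask:
 *
 *   vd = {0, 1, 1, 2, 2}
 */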
5019 
5020 /* Vector Element Index Instruction */
5021 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5022 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5023 {                                                                         \
5024     uint32_t vm = vext_vm(desc);                                          \
5025     uint32_t vl = env->vl;                                                \
5026     uint32_t esz = sizeof(ETYPE);                                         \
5027     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5028     uint32_t vta = vext_vta(desc);                                        \
5029     uint32_t vma = vext_vma(desc);                                        \
5030     int i;                                                                \
5031                                                                           \
5032     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5033                                                                           \
5034     for (i = env->vstart; i < vl; i++) {                                  \
5035         if (!vm && !vext_elem_mask(v0, i)) {                              \
5036             /* set masked-off elements to 1s */                           \
5037             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5038             continue;                                                     \
5039         }                                                                 \
5040         *((ETYPE *)vd + H(i)) = i;                                        \
5041     }                                                                     \
5042     env->vstart = 0;                                                      \
5043     /* set tail elements to 1s */                                         \
5044     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5045 }
5046 
5047 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5048 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5049 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5050 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5051 
5052 /*
5053  * Vector Permutation Instructions
5054  */
5055 
5056 /* Vector Slide Instructions */
5057 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5058 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5059                   CPURISCVState *env, uint32_t desc)                      \
5060 {                                                                         \
5061     uint32_t vm = vext_vm(desc);                                          \
5062     uint32_t vl = env->vl;                                                \
5063     uint32_t esz = sizeof(ETYPE);                                         \
5064     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5065     uint32_t vta = vext_vta(desc);                                        \
5066     uint32_t vma = vext_vma(desc);                                        \
5067     target_ulong offset = s1, i_min, i;                                   \
5068                                                                           \
5069     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5070                                                                           \
5071     i_min = MAX(env->vstart, offset);                                     \
5072     for (i = i_min; i < vl; i++) {                                        \
5073         if (!vm && !vext_elem_mask(v0, i)) {                              \
5074             /* set masked-off elements to 1s */                           \
5075             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5076             continue;                                                     \
5077         }                                                                 \
5078         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5079     }                                                                     \
5080     env->vstart = 0;                                                      \
5081     /* set tail elements to 1s */                                         \
5082     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5083 }
5084 
5085 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5086 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5087 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5088 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5089 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
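
/*
 * Note: the copy loop starts at i_min = MAX(vstart, OFFSET), so
 * destination elements 0 .. OFFSET-1 are not written and keep their
 * previous contents; e.g. with vl = 4, OFFSET = 1 and vs2 = {a, b, c, d}
 * the result is vd = {vd[0], a, b, c}.
 */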
5090 
5091 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5092 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5093                   CPURISCVState *env, uint32_t desc)                      \
5094 {                                                                         \
5095     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5096     uint32_t vm = vext_vm(desc);                                          \
5097     uint32_t vl = env->vl;                                                \
5098     uint32_t esz = sizeof(ETYPE);                                         \
5099     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5100     uint32_t vta = vext_vta(desc);                                        \
5101     uint32_t vma = vext_vma(desc);                                        \
5102     target_ulong i_max, i_min, i;                                         \
5103                                                                           \
5104     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5105                                                                           \
5106     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5107     i_max = MAX(i_min, env->vstart);                                      \
5108     for (i = env->vstart; i < i_max; ++i) {                               \
5109         if (!vm && !vext_elem_mask(v0, i)) {                              \
5110             /* set masked-off elements to 1s */                           \
5111             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5112             continue;                                                     \
5113         }                                                                 \
5114         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5115     }                                                                     \
5116                                                                           \
5117     for (i = i_max; i < vl; ++i) {                                        \
5118         if (vm || vext_elem_mask(v0, i)) {                                \
5119             *((ETYPE *)vd + H(i)) = 0;                                    \
5120         }                                                                 \
5121     }                                                                     \
5122                                                                           \
5123     env->vstart = 0;                                                      \
5124     /* set tail elements to 1s */                                         \
5125     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5126 }
5127 
5128 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5129 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5130 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5131 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5132 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
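
/*
 * Worked example: with vl = vlmax = 4, OFFSET = 2 and vs2 = {a, b, c, d},
 * the first loop copies vd = {c, d, .., ..} and the second loop zeroes
 * the active elements whose source index would fall beyond vlmax,
 * giving vd = {c, d, 0, 0}.
 */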
5133 
5134 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5135 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5136                                  void *vs2, CPURISCVState *env,             \
5137                                  uint32_t desc)                             \
5138 {                                                                           \
5139     typedef uint##BITWIDTH##_t ETYPE;                                       \
5140     uint32_t vm = vext_vm(desc);                                            \
5141     uint32_t vl = env->vl;                                                  \
5142     uint32_t esz = sizeof(ETYPE);                                           \
5143     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5144     uint32_t vta = vext_vta(desc);                                          \
5145     uint32_t vma = vext_vma(desc);                                          \
5146     uint32_t i;                                                             \
5147                                                                             \
5148     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5149                                                                             \
5150     for (i = env->vstart; i < vl; i++) {                                    \
5151         if (!vm && !vext_elem_mask(v0, i)) {                                \
5152             /* set masked-off elements to 1s */                             \
5153             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5154             continue;                                                       \
5155         }                                                                   \
5156         if (i == 0) {                                                       \
5157             *((ETYPE *)vd + H(i)) = s1;                                     \
5158         } else {                                                            \
5159             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5160         }                                                                   \
5161     }                                                                       \
5162     env->vstart = 0;                                                        \
5163     /* set tail elements to 1s */                                           \
5164     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5165 }
5166 
5167 GEN_VEXT_VSLIDE1UP(8,  H1)
5168 GEN_VEXT_VSLIDE1UP(16, H2)
5169 GEN_VEXT_VSLIDE1UP(32, H4)
5170 GEN_VEXT_VSLIDE1UP(64, H8)
5171 
5172 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5173 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5174                   CPURISCVState *env, uint32_t desc)              \
5175 {                                                                 \
5176     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5177 }
5178 
5179 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5180 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5181 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5182 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5183 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
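/*
 * Illustrative example (assumed values): with SEW = 16, vl = 4,
 * vs2 = {100, 200, 300, 400} and x[rs1] = 7, an unmasked vslide1up.vx
 * produces vd = {7, 100, 200, 300}: the scalar fills element 0 and every
 * other active element i takes vs2[i - 1].
 */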
5184 
5185 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5186 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5187                                    void *vs2, CPURISCVState *env,             \
5188                                    uint32_t desc)                             \
5189 {                                                                             \
5190     typedef uint##BITWIDTH##_t ETYPE;                                         \
5191     uint32_t vm = vext_vm(desc);                                              \
5192     uint32_t vl = env->vl;                                                    \
5193     uint32_t esz = sizeof(ETYPE);                                             \
5194     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5195     uint32_t vta = vext_vta(desc);                                            \
5196     uint32_t vma = vext_vma(desc);                                            \
5197     uint32_t i;                                                               \
5198                                                                               \
5199     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5200                                                                               \
5201     for (i = env->vstart; i < vl; i++) {                                      \
5202         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5203             /* set masked-off elements to 1s */                               \
5204             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5205             continue;                                                         \
5206         }                                                                     \
5207         if (i == vl - 1) {                                                    \
5208             *((ETYPE *)vd + H(i)) = s1;                                       \
5209         } else {                                                              \
5210             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5211         }                                                                     \
5212     }                                                                         \
5213     env->vstart = 0;                                                          \
5214     /* set tail elements to 1s */                                             \
5215     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5216 }
5217 
5218 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5219 GEN_VEXT_VSLIDE1DOWN(16, H2)
5220 GEN_VEXT_VSLIDE1DOWN(32, H4)
5221 GEN_VEXT_VSLIDE1DOWN(64, H8)
5222 
5223 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5224 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5225                   CPURISCVState *env, uint32_t desc)              \
5226 {                                                                 \
5227     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5228 }
5229 
5230 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5231 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5232 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5233 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5234 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
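/*
 * Illustrative example (assumed values): with SEW = 16, vl = 4,
 * vs2 = {100, 200, 300, 400} and x[rs1] = 7, an unmasked vslide1down.vx
 * produces vd = {200, 300, 400, 7}: element i takes vs2[i + 1] and the
 * scalar fills the last active element, vd[vl - 1].
 */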
5235 
5236 /* Vector Floating-Point Slide Instructions */
5237 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5238 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5239                   CPURISCVState *env, uint32_t desc)          \
5240 {                                                             \
5241     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5242 }
5243 
5244 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5245 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5246 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5247 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5248 
5249 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5250 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5251                   CPURISCVState *env, uint32_t desc)          \
5252 {                                                             \
5253     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5254 }
5255 
5256 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5257 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5258 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5259 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
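/*
 * Both floating-point variants reuse the integer slide1up/slide1down
 * bodies: s1 carries the bit pattern taken from f[rs1] and the element
 * store truncates it to the element width, so the value is moved without
 * any floating-point arithmetic.  Illustrative example (assumed values):
 * with SEW = 32, vl = 3, vs2 = {1.0f, 2.0f, 3.0f} and f[rs1] = 9.0f,
 * vfslide1down.vf produces vd = {2.0f, 3.0f, 9.0f}.
 */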
5260 
5261 /* Vector Register Gather Instructions */
5262 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5263 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5264                   CPURISCVState *env, uint32_t desc)                      \
5265 {                                                                         \
5266     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5267     uint32_t vm = vext_vm(desc);                                          \
5268     uint32_t vl = env->vl;                                                \
5269     uint32_t esz = sizeof(TS2);                                           \
5270     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5271     uint32_t vta = vext_vta(desc);                                        \
5272     uint32_t vma = vext_vma(desc);                                        \
5273     uint64_t index;                                                       \
5274     uint32_t i;                                                           \
5275                                                                           \
5276     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5277                                                                           \
5278     for (i = env->vstart; i < vl; i++) {                                  \
5279         if (!vm && !vext_elem_mask(v0, i)) {                              \
5280             /* set masked-off elements to 1s */                           \
5281             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5282             continue;                                                     \
5283         }                                                                 \
5284         index = *((TS1 *)vs1 + HS1(i));                                   \
5285         if (index >= vlmax) {                                             \
5286             *((TS2 *)vd + HS2(i)) = 0;                                    \
5287         } else {                                                          \
5288             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5289         }                                                                 \
5290     }                                                                     \
5291     env->vstart = 0;                                                      \
5292     /* set tail elements to 1s */                                         \
5293     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5294 }
5295 
5296 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5297 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5298 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5299 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5300 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
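/*
 * Illustrative example (assumed values): with SEW = 8, vl = 4, vlmax = 8,
 * vs1 = {3, 0, 9, 1} and vs2 = {10, 20, 30, 40, 50, 60, 70, 80}, an
 * unmasked vrgather.vv yields vd = {40, 10, 0, 20}; the out-of-range index
 * 9 (>= vlmax) selects 0 instead of reading past the register group.
 */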
5301 
5302 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5303 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5304 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5305 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
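/*
 * vrgatherei16.vv always reads 16-bit indices (TS1 = uint16_t, HS1 = H2)
 * while the data elements gathered from vs2 keep the current SEW, which is
 * why the index and data types/accessors are parameterised separately in
 * the macro above.
 */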
5306 
5307 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5308 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5309                   CPURISCVState *env, uint32_t desc)                      \
5310 {                                                                         \
5311     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5312     uint32_t vm = vext_vm(desc);                                          \
5313     uint32_t vl = env->vl;                                                \
5314     uint32_t esz = sizeof(ETYPE);                                         \
5315     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5316     uint32_t vta = vext_vta(desc);                                        \
5317     uint32_t vma = vext_vma(desc);                                        \
5318     uint64_t index = s1;                                                  \
5319     uint32_t i;                                                           \
5320                                                                           \
5321     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5322                                                                           \
5323     for (i = env->vstart; i < vl; i++) {                                  \
5324         if (!vm && !vext_elem_mask(v0, i)) {                              \
5325             /* set masked-off elements to 1s */                           \
5326             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5327             continue;                                                     \
5328         }                                                                 \
5329         if (index >= vlmax) {                                             \
5330             *((ETYPE *)vd + H(i)) = 0;                                    \
5331         } else {                                                          \
5332             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5333         }                                                                 \
5334     }                                                                     \
5335     env->vstart = 0;                                                      \
5336     /* set tail elements to 1s */                                         \
5337     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5338 }
5339 
5340 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5341 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5342 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5343 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5344 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
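/*
 * Illustrative example (assumed values): with SEW = 8, vl = 4, vlmax = 8,
 * x[rs1] = 2 and vs2 = {10, 20, 30, 40, 50, 60, 70, 80}, an unmasked
 * vrgather.vx splats vs2[2], giving vd = {30, 30, 30, 30}; an index
 * >= vlmax would instead write 0 to every active element.
 */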
5345 
5346 /* Vector Compress Instruction */
5347 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5348 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5349                   CPURISCVState *env, uint32_t desc)                      \
5350 {                                                                         \
5351     uint32_t vl = env->vl;                                                \
5352     uint32_t esz = sizeof(ETYPE);                                         \
5353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5354     uint32_t vta = vext_vta(desc);                                        \
5355     uint32_t num = 0, i;                                                  \
5356                                                                           \
5357     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5358                                                                           \
5359     for (i = env->vstart; i < vl; i++) {                                  \
5360         if (!vext_elem_mask(vs1, i)) {                                    \
5361             continue;                                                     \
5362         }                                                                 \
5363         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5364         num++;                                                            \
5365     }                                                                     \
5366     env->vstart = 0;                                                      \
5367     /* set tail elements to 1s */                                         \
5368     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5369 }
5370 
5371 /* Pack elements of vs2 whose bit in mask vs1 is set contiguously into vd */
5372 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5373 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5374 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5375 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
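/*
 * Reference semantics as a scalar sketch (illustrative only):
 *
 *     num = 0;
 *     for (i = 0; i < vl; i++) {
 *         if (vext_elem_mask(vs1, i)) {
 *             vd[num++] = vs2[i];
 *         }
 *     }
 *
 * Example (assumed values): with SEW = 8, vl = 4, vs2 = {10, 20, 30, 40}
 * and vs1 mask bits 0b1010 (elements 1 and 3 selected), the result is
 * vd = {20, 40, ...} with num = 2; positions from num onwards are treated
 * as tail elements.
 */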
5376 
5377 /* Vector Whole Register Move */
5378 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5379 {
5380     /* EEW = SEW */
5381     uint32_t maxsz = simd_maxsz(desc);
5382     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5383     uint32_t startb = env->vstart * sewb;
5384     uint32_t i = startb;
5385 
5386     if (startb >= maxsz) {
5387         env->vstart = 0;
5388         return;
5389     }
5390 
5391     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5392         uint32_t j = ROUND_UP(i, 8);
5393         memcpy((uint8_t *)vd + H1(j - 1),
5394                (uint8_t *)vs2 + H1(j - 1),
5395                j - i);
5396         i = j;
5397     }
5398 
5399     memcpy((uint8_t *)vd + H1(i),
5400            (uint8_t *)vs2 + H1(i),
5401            maxsz - i);
5402 
5403     env->vstart = 0;
5404 }
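/*
 * The number of bytes to copy is encoded in simd_maxsz(desc); for
 * vmv<nr>r.v this should correspond to nr * vlenb.  A non-zero vstart is
 * converted to a byte offset using the current SEW so an interrupted copy
 * can resume.  On big-endian hosts, bytes are stored reversed within each
 * 64-bit lane (H1 flips the low three address bits), so the remainder of a
 * partially copied lane is handled first before the rest is moved with a
 * single memcpy.  Illustrative example (assumed values): vlenb = 16,
 * vmv2r.v, SEW = 32 and vstart = 3 resume the copy at byte offset 12 of
 * the 32-byte register group.
 */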
5405 
5406 /* Vector Integer Extension */
5407 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5408 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5409                   CPURISCVState *env, uint32_t desc)             \
5410 {                                                                \
5411     uint32_t vl = env->vl;                                       \
5412     uint32_t vm = vext_vm(desc);                                 \
5413     uint32_t esz = sizeof(ETYPE);                                \
5414     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5415     uint32_t vta = vext_vta(desc);                               \
5416     uint32_t vma = vext_vma(desc);                               \
5417     uint32_t i;                                                  \
5418                                                                  \
5419     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5420                                                                  \
5421     for (i = env->vstart; i < vl; i++) {                         \
5422         if (!vm && !vext_elem_mask(v0, i)) {                     \
5423             /* set masked-off elements to 1s */                  \
5424             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5425             continue;                                            \
5426         }                                                        \
5427         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5428     }                                                            \
5429     env->vstart = 0;                                             \
5430     /* set tail elements to 1s */                                \
5431     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5432 }
5433 
5434 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5435 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5436 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5437 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5438 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5439 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
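/*
 * ETYPE is the destination (SEW-wide) type and DTYPE the narrower source
 * type, so vzext.vf2/vf4/vf8 read an element of SEW/2, SEW/4 or SEW/8 bits
 * and zero-extend it.  Illustrative example (assumed values): vzext.vf2
 * with SEW = 16 turns a source byte 0xFF into the halfword 0x00FF.
 */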
5440 
5441 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5442 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5443 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5444 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5445 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5446 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
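/*
 * The signed instantiations rely on ordinary C integer conversion for the
 * sign extension.  Illustrative example (assumed values): vsext.vf2 with
 * SEW = 16 turns the source byte 0xFF (-1) into 0xFFFF (-1), while 0x7F
 * stays 0x007F.
 */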
5447