xref: /qemu/target/riscv/vector_helper.c (revision cc944932ecef3b7a56ae62d89dd92fb9e56c5cc8)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "exec/tlb-flags.h"
29 #include "fpu/softfloat.h"
30 #include "tcg/tcg-gvec-desc.h"
31 #include "internals.h"
32 #include "vector_internals.h"
33 #include <math.h>
34 
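/*
 * vsetvl/vsetvli helper: s1 is the requested AVL, s2 the new vtype value.
 * An unsupported vtype (SEW > ELEN, reserved or too-small fractional LMUL,
 * vill, non-zero ediv or reserved bits) only sets vill and clears
 * vtype/vl/vstart.  Otherwise vl becomes min(AVL, VLMAX), or ceil(AVL/2)
 * when VLMAX < AVL < 2 * VLMAX and the rvv_vl_half_avl option is enabled.
 */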
35 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
36                             target_ulong s2)
37 {
38     int vlmax, vl;
39     RISCVCPU *cpu = env_archcpu(env);
40     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
41     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
42     uint16_t sew = 8 << vsew;
43     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
44     int xlen = riscv_cpu_xlen(env);
45     bool vill = (s2 >> (xlen - 1)) & 0x1;
46     target_ulong reserved = s2 &
47                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
48                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
49     uint16_t vlen = cpu->cfg.vlenb << 3;
50     int8_t lmul;
51 
52     if (vlmul & 4) {
53         /*
54          * Fractional LMUL, check:
55          *
56          * VLEN * LMUL >= SEW
57          * VLEN >> (8 - lmul) >= sew
58          * (vlenb << 3) >> (8 - lmul) >= sew
59          */
60         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
61             vill = true;
62         }
63     }
64 
65     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
66         /* only set vill bit. */
67         env->vill = 1;
68         env->vtype = 0;
69         env->vl = 0;
70         env->vstart = 0;
71         return 0;
72     }
73 
74     /* lmul encoded as in DisasContext::lmul */
75     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
76     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
77     if (s1 <= vlmax) {
78         vl = s1;
79     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
80         vl = (s1 + 1) >> 1;
81     } else {
82         vl = vlmax;
83     }
84     env->vl = vl;
85     env->vtype = s2;
86     env->vstart = 0;
87     env->vill = 0;
88     return vl;
89 }
90 
91 /*
92  * Get the maximum number of elements that can be operated on.
93  *
94  * log2_esz: log2 of element size in bytes.
95  */
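/*
 * For example (illustrative values): with VLEN = 128 bits (vlenb = 16),
 * LMUL = 2 (vext_lmul() == 1) and SEW = 16 (log2_esz = 1), scale = 0 and
 * VLMAX = 16, matching VLMAX = LMUL * VLEN / SEW.
 */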
96 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
97 {
98     /*
99      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
100      * so vlen in bytes (vlenb) is encoded as maxsz.
101      */
102     uint32_t vlenb = simd_maxsz(desc);
103 
104     /* Return VLMAX */
105     int scale = vext_lmul(desc) - log2_esz;
106     return scale < 0 ? vlenb >> -scale : vlenb << scale;
107 }
108 
109 /*
110  * This function checks watchpoints before the real load operation.
111  *
112  * In system mode, the TLB API probe_access is enough for the watchpoint check.
113  * In user mode, there is no watchpoint support for now.
114  *
115  * It will trigger an exception if there is no mapping in the TLB
116  * and the page table walk can't fill the TLB entry. Then the guest
117  * software can return here after processing the exception, or never return.
118  */
119 static void probe_pages(CPURISCVState *env, target_ulong addr,
120                         target_ulong len, uintptr_t ra,
121                         MMUAccessType access_type)
122 {
123     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
124     target_ulong curlen = MIN(pagelen, len);
125     int mmu_index = riscv_env_mmu_index(env, false);
126 
127     probe_access(env, adjust_addr(env, addr), curlen, access_type,
128                  mmu_index, ra);
129     if (len > curlen) {
130         addr += curlen;
131         curlen = len - curlen;
132         probe_access(env, adjust_addr(env, addr), curlen, access_type,
133                      mmu_index, ra);
134     }
135 }
136 
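/* Set bit 'index' of the mask register pointed to by v0 to 'value'. */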
137 static inline void vext_set_elem_mask(void *v0, int index,
138                                       uint8_t value)
139 {
140     int idx = index / 64;
141     int pos = index % 64;
142     uint64_t old = ((uint64_t *)v0)[idx];
143     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
144 }
145 
146 /* elements operations for load and store */
147 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
148                                    uint32_t idx, void *vd, uintptr_t retaddr);
149 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
150 
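/*
 * Each element accessor is generated in two flavours: a *_tlb variant that
 * goes through the cpu_ld/st*_data_ra path (MMU translation, faults), and a
 * *_host variant that reads or writes directly through a host pointer
 * obtained from a prior page probe.
 */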
151 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
152 static inline QEMU_ALWAYS_INLINE                            \
153 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
154                 uint32_t idx, void *vd, uintptr_t retaddr)  \
155 {                                                           \
156     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
157     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
158 }                                                           \
159                                                             \
160 static inline QEMU_ALWAYS_INLINE                            \
161 void NAME##_host(void *vd, uint32_t idx, void *host)        \
162 {                                                           \
163     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
164     *cur = (ETYPE)LDSUF##_p(host);                          \
165 }
166 
167 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
168 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
169 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
170 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
171 
172 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
173 static inline QEMU_ALWAYS_INLINE                            \
174 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
175                 uint32_t idx, void *vd, uintptr_t retaddr)  \
176 {                                                           \
177     ETYPE data = *((ETYPE *)vd + H(idx));                   \
178     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
179 }                                                           \
180                                                             \
181 static inline QEMU_ALWAYS_INLINE                            \
182 void NAME##_host(void *vd, uint32_t idx, void *host)        \
183 {                                                           \
184     ETYPE data = *((ETYPE *)vd + H(idx));                   \
185     STSUF##_p(host, data);                                  \
186 }
187 
188 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
189 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
190 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
191 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
192 
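/*
 * Load/store elements [vstart, evl) one at a time through the TLB path,
 * updating env->vstart after each element so that a faulting access can be
 * restarted at the correct element.
 */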
193 static inline QEMU_ALWAYS_INLINE void
194 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
195                        void *vd, uint32_t evl, target_ulong addr,
196                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
197                        bool is_load)
198 {
199     uint32_t i;
200     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
201         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
202     }
203 }
204 
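/*
 * Same as above, but through a host pointer.  On little-endian hosts,
 * byte-sized elements are laid out in the same order as guest memory, so a
 * single memcpy suffices; wider elements go through the per-element
 * accessors.
 */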
205 static inline QEMU_ALWAYS_INLINE void
206 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
207                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
208                         uint32_t esz, bool is_load)
209 {
210 #if HOST_BIG_ENDIAN
211     for (; reg_start < evl; reg_start++, host += esz) {
212         ldst_host(vd, reg_start, host);
213     }
214 #else
215     if (esz == 1) {
216         uint32_t byte_offset = reg_start * esz;
217         uint32_t size = (evl - reg_start) * esz;
218 
219         if (is_load) {
220             memcpy(vd + byte_offset, host, size);
221         } else {
222             memcpy(host, vd + byte_offset, size);
223         }
224     } else {
225         for (; reg_start < evl; reg_start++, host += esz) {
226             ldst_host(vd, reg_start, host);
227         }
228     }
229 #endif
230 }
231 
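/*
 * When the tail-agnostic policy is in effect (vta != 0), fill the tail
 * elements of each of the nf register fields with all 1s.
 */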
232 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
233                                    uint32_t desc, uint32_t nf,
234                                    uint32_t esz, uint32_t max_elems)
235 {
236     uint32_t vta = vext_vta(desc);
237     int k;
238 
239     if (vta == 0) {
240         return;
241     }
242 
243     for (k = 0; k < nf; ++k) {
244         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
245                           (k * max_elems + max_elems) * esz);
246     }
247 }
248 
249 /*
250  * stride: access vector elements from strided memory
251  */
252 static void
253 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
254                  CPURISCVState *env, uint32_t desc, uint32_t vm,
255                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
256                  uintptr_t ra)
257 {
258     uint32_t i, k;
259     uint32_t nf = vext_nf(desc);
260     uint32_t max_elems = vext_max_elems(desc, log2_esz);
261     uint32_t esz = 1 << log2_esz;
262     uint32_t vma = vext_vma(desc);
263 
264     VSTART_CHECK_EARLY_EXIT(env, env->vl);
265 
266     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
267         k = 0;
268         while (k < nf) {
269             if (!vm && !vext_elem_mask(v0, i)) {
270                 /* set masked-off elements to 1s */
271                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
272                                   (i + k * max_elems + 1) * esz);
273                 k++;
274                 continue;
275             }
276             target_ulong addr = base + stride * i + (k << log2_esz);
277             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
278             k++;
279         }
280     }
281     env->vstart = 0;
282 
283     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
284 }
285 
286 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
287 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
288                   target_ulong stride, CPURISCVState *env,              \
289                   uint32_t desc)                                        \
290 {                                                                       \
291     uint32_t vm = vext_vm(desc);                                        \
292     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
293                      ctzl(sizeof(ETYPE)), GETPC());                     \
294 }
295 
296 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
297 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
298 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
299 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
300 
301 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
302 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
303                   target_ulong stride, CPURISCVState *env,              \
304                   uint32_t desc)                                        \
305 {                                                                       \
306     uint32_t vm = vext_vm(desc);                                        \
307     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
308                      ctzl(sizeof(ETYPE)), GETPC());                     \
309 }
310 
311 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
312 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
313 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
314 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
315 
316 /*
317  * unit-stride: access elements stored contiguously in memory
318  */
319 
320 /* unmasked unit-stride load and store operation */
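/*
 * Access a range of elements known to lie within a single page.  If the
 * probe returns no special flags (flags == 0), the page is backed by
 * directly addressable host memory and the *_host accessors are used on the
 * returned host pointer; otherwise each element goes through the *_tlb
 * accessor.
 */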
321 static inline QEMU_ALWAYS_INLINE void
322 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
323                   uint32_t elems, uint32_t nf, uint32_t max_elems,
324                   uint32_t log2_esz, bool is_load, int mmu_index,
325                   vext_ldst_elem_fn_tlb *ldst_tlb,
326                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
327 {
328     void *host;
329     int i, k, flags;
330     uint32_t esz = 1 << log2_esz;
331     uint32_t size = (elems * nf) << log2_esz;
332     uint32_t evl = env->vstart + elems;
333     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
334 
335     /* Check page permission/pmp/watchpoint/etc. */
336     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
337                                mmu_index, true, &host, ra);
338 
339     if (flags == 0) {
340         if (nf == 1) {
341             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
342                                       host, esz, is_load);
343         } else {
344             for (i = env->vstart; i < evl; ++i) {
345                 k = 0;
346                 while (k < nf) {
347                     ldst_host(vd, i + k * max_elems, host);
348                     host += esz;
349                     k++;
350                 }
351             }
352         }
353         env->vstart += elems;
354     } else {
355         if (nf == 1) {
356             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
357                                    ra, esz, is_load);
358         } else {
359             /* load or store elements through the TLB path */
360             for (i = env->vstart; i < evl; env->vstart = ++i) {
361                 k = 0;
362                 while (k < nf) {
363                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
364                              vd, ra);
365                     addr += esz;
366                     k++;
367                 }
368             }
369         }
370     }
371 }
372 
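/*
 * Unit-stride accesses are split at the page boundary: first the elements
 * that fit entirely in the first page, then (if needed) the single element
 * whose nf fields straddle the boundary via the TLB path, and finally the
 * elements in the second page.
 */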
373 static inline QEMU_ALWAYS_INLINE void
374 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
375              vext_ldst_elem_fn_tlb *ldst_tlb,
376              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
377              uint32_t evl, uintptr_t ra, bool is_load)
378 {
379     uint32_t k;
380     target_ulong page_split, elems, addr;
381     uint32_t nf = vext_nf(desc);
382     uint32_t max_elems = vext_max_elems(desc, log2_esz);
383     uint32_t esz = 1 << log2_esz;
384     uint32_t msize = nf * esz;
385     int mmu_index = riscv_env_mmu_index(env, false);
386 
387     VSTART_CHECK_EARLY_EXIT(env, evl);
388 
389 #if defined(CONFIG_USER_ONLY)
390     /*
391      * For data sizes <= 6 bytes we get better performance by simply calling
392      * vext_continuous_ldst_tlb
393      */
394     if (nf == 1 && (evl << log2_esz) <= 6) {
395         addr = base + (env->vstart << log2_esz);
396         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
397                                  esz, is_load);
398 
399         env->vstart = 0;
400         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
401         return;
402     }
403 #endif
404 
405     /* Calculate the page range of first page */
406     addr = base + ((env->vstart * nf) << log2_esz);
407     page_split = -(addr | TARGET_PAGE_MASK);
408     /* Get number of elements */
409     elems = page_split / msize;
410     if (unlikely(env->vstart + elems >= evl)) {
411         elems = evl - env->vstart;
412     }
413 
414     /* Load/store elements in the first page */
415     if (likely(elems)) {
416         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
417                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
418     }
419 
420     /* Load/store elements in the second page */
421     if (unlikely(env->vstart < evl)) {
422         /* Cross page element */
423         if (unlikely(page_split % msize)) {
424             for (k = 0; k < nf; k++) {
425                 addr = base + ((env->vstart * nf + k) << log2_esz);
426                 ldst_tlb(env, adjust_addr(env, addr),
427                         env->vstart + k * max_elems, vd, ra);
428             }
429             env->vstart++;
430         }
431 
432         addr = base + ((env->vstart * nf) << log2_esz);
433         /* Get number of elements of second page */
434         elems = evl - env->vstart;
435 
436         /* Load/store elements in the second page */
437         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
438                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
439     }
440 
441     env->vstart = 0;
442     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
443 }
444 
445 /*
446  * A masked unit-stride load or store operation is handled as a special case
447  * of the strided access, with stride = NF * sizeof(ETYPE)
448  */
449 
450 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
451 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
452                          CPURISCVState *env, uint32_t desc)         \
453 {                                                                   \
454     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
455     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
456                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
457 }                                                                   \
458                                                                     \
459 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
460                   CPURISCVState *env, uint32_t desc)                \
461 {                                                                   \
462     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
463                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
464 }
465 
466 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
467 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
468 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
469 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
470 
471 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
472 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
473                          CPURISCVState *env, uint32_t desc)              \
474 {                                                                        \
475     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
476     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
477                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
478 }                                                                        \
479                                                                          \
480 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
481                   CPURISCVState *env, uint32_t desc)                     \
482 {                                                                        \
483     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
484                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
485 }
486 
487 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
488 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
489 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
490 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
491 
492 /*
493  * unit stride mask load and store, EEW = 1
494  */
495 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
496                     CPURISCVState *env, uint32_t desc)
497 {
498     /* evl = ceil(vl/8) */
499     uint8_t evl = (env->vl + 7) >> 3;
500     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
501                  0, evl, GETPC(), true);
502 }
503 
504 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
505                     CPURISCVState *env, uint32_t desc)
506 {
507     /* evl = ceil(vl/8) */
508     uint8_t evl = (env->vl + 7) >> 3;
509     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
510                  0, evl, GETPC(), false);
511 }
512 
513 /*
514  * index: access vector elements from indexed memory
515  */
516 typedef target_ulong vext_get_index_addr(target_ulong base,
517         uint32_t idx, void *vs2);
518 
519 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
520 static target_ulong NAME(target_ulong base,            \
521                          uint32_t idx, void *vs2)      \
522 {                                                      \
523     return (base + *((ETYPE *)vs2 + H(idx)));          \
524 }
525 
526 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
527 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
528 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
529 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
530 
531 static inline void
532 vext_ldst_index(void *vd, void *v0, target_ulong base,
533                 void *vs2, CPURISCVState *env, uint32_t desc,
534                 vext_get_index_addr get_index_addr,
535                 vext_ldst_elem_fn_tlb *ldst_elem,
536                 uint32_t log2_esz, uintptr_t ra)
537 {
538     uint32_t i, k;
539     uint32_t nf = vext_nf(desc);
540     uint32_t vm = vext_vm(desc);
541     uint32_t max_elems = vext_max_elems(desc, log2_esz);
542     uint32_t esz = 1 << log2_esz;
543     uint32_t vma = vext_vma(desc);
544 
545     VSTART_CHECK_EARLY_EXIT(env, env->vl);
546 
547     /* load or store elements from/to guest memory */
548     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
549         k = 0;
550         while (k < nf) {
551             if (!vm && !vext_elem_mask(v0, i)) {
552                 /* set masked-off elements to 1s */
553                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
554                                   (i + k * max_elems + 1) * esz);
555                 k++;
556                 continue;
557             }
558             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
559             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
560             k++;
561         }
562     }
563     env->vstart = 0;
564 
565     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
566 }
567 
568 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
569 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
570                   void *vs2, CPURISCVState *env, uint32_t desc)            \
571 {                                                                          \
572     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
573                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
574 }
575 
576 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
577 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
578 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
579 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
580 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
581 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
582 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
583 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
584 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
585 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
586 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
587 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
588 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
589 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
590 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
591 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
592 
593 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
594 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
595                   void *vs2, CPURISCVState *env, uint32_t desc)  \
596 {                                                                \
597     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
598                     STORE_FN, ctzl(sizeof(ETYPE)),               \
599                     GETPC());                                    \
600 }
601 
602 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
603 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
604 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
605 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
606 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
607 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
608 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
609 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
610 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
611 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
612 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
613 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
614 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
615 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
616 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
617 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
618 
619 /*
620  * unit-stride fault-only-first load instructions
621  */
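/*
 * Only the access to the first element is allowed to fault.  Subsequent
 * elements are probed non-faulting: if one of them would fault, vl is
 * truncated to the number of elements that can be accessed and only those
 * elements are loaded.
 */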
622 static inline void
623 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
624           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
625           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
626 {
627     uint32_t i, k, vl = 0;
628     uint32_t nf = vext_nf(desc);
629     uint32_t vm = vext_vm(desc);
630     uint32_t max_elems = vext_max_elems(desc, log2_esz);
631     uint32_t esz = 1 << log2_esz;
632     uint32_t msize = nf * esz;
633     uint32_t vma = vext_vma(desc);
634     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
635     int mmu_index = riscv_env_mmu_index(env, false);
636     int flags;
637     void *host;
638 
639     VSTART_CHECK_EARLY_EXIT(env, env->vl);
640 
641     addr = base + ((env->vstart * nf) << log2_esz);
642     page_split = -(addr | TARGET_PAGE_MASK);
643     /* Get number of elements */
644     elems = page_split / msize;
645     if (unlikely(env->vstart + elems >= env->vl)) {
646         elems = env->vl - env->vstart;
647     }
648 
649     /* Check page permission/pmp/watchpoint/etc. */
650     flags = probe_access_flags(env, adjust_addr(env, addr), elems * msize,
651                                MMU_DATA_LOAD, mmu_index, true, &host, ra);
652 
653     /* If we are crossing a page check also the second page. */
654     if (env->vl > elems) {
655         addr_probe = addr + (elems << log2_esz);
656         flags |= probe_access_flags(env, adjust_addr(env, addr_probe),
657                                     elems * msize, MMU_DATA_LOAD, mmu_index,
658                                     true, &host, ra);
659     }
660 
661     if (flags & ~TLB_WATCHPOINT) {
662         /* probe every access */
663         for (i = env->vstart; i < env->vl; i++) {
664             if (!vm && !vext_elem_mask(v0, i)) {
665                 continue;
666             }
667             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
668             if (i == 0) {
669                 /* Allow fault on first element. */
670                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD);
671             } else {
672                 remain = nf << log2_esz;
673                 while (remain > 0) {
674                     offset = -(addr_i | TARGET_PAGE_MASK);
675 
676                     /* Probe nonfault on subsequent elements. */
677                     flags = probe_access_flags(env, addr_i, offset,
678                                                MMU_DATA_LOAD, mmu_index, true,
679                                                &host, 0);
680 
681                     /*
682                      * Stop if invalid (unmapped) or mmio (transaction may
683                      * fail). Do not stop on a watchpoint, as the spec says that
684                      * first-fault loads should continue to access the same
685                      * elements regardless of any watchpoint.
686                      */
687                     if (flags & ~TLB_WATCHPOINT) {
688                         vl = i;
689                         goto ProbeSuccess;
690                     }
691                     if (remain <= offset) {
692                         break;
693                     }
694                     remain -= offset;
695                     addr_i = adjust_addr(env, addr_i + offset);
696                 }
697             }
698         }
699     }
700 ProbeSuccess:
701     /* load bytes from guest memory */
702     if (vl != 0) {
703         env->vl = vl;
704     }
705 
706     if (env->vstart < env->vl) {
707         if (vm) {
708             /* Load/store elements in the first page */
709             if (likely(elems)) {
710                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
711                                   log2_esz, true, mmu_index, ldst_tlb,
712                                   ldst_host, ra);
713             }
714 
715             /* Load/store elements in the second page */
716             if (unlikely(env->vstart < env->vl)) {
717                 /* Cross page element */
718                 if (unlikely(page_split % msize)) {
719                     for (k = 0; k < nf; k++) {
720                         addr = base + ((env->vstart * nf + k) << log2_esz);
721                         ldst_tlb(env, adjust_addr(env, addr),
722                                  env->vstart + k * max_elems, vd, ra);
723                     }
724                     env->vstart++;
725                 }
726 
727                 addr = base + ((env->vstart * nf) << log2_esz);
728                 /* Get number of elements of second page */
729                 elems = env->vl - env->vstart;
730 
731                 /* Load/store elements in the second page */
732                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
733                                   log2_esz, true, mmu_index, ldst_tlb,
734                                   ldst_host, ra);
735             }
736         } else {
737             for (i = env->vstart; i < env->vl; i++) {
738                 k = 0;
739                 while (k < nf) {
740                     if (!vext_elem_mask(v0, i)) {
741                         /* set masked-off elements to 1s */
742                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
743                                           (i + k * max_elems + 1) * esz);
744                         k++;
745                         continue;
746                     }
747                     addr = base + ((i * nf + k) << log2_esz);
748                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
749                              vd, ra);
750                     k++;
751                 }
752             }
753         }
754     }
755     env->vstart = 0;
756 
757     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
758 }
759 
760 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
761 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
762                   CPURISCVState *env, uint32_t desc)            \
763 {                                                               \
764     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
765               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
766 }
767 
768 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
769 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
770 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
771 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
772 
773 #define DO_SWAP(N, M) (M)
774 #define DO_AND(N, M)  (N & M)
775 #define DO_XOR(N, M)  (N ^ M)
776 #define DO_OR(N, M)   (N | M)
777 #define DO_ADD(N, M)  (N + M)
778 
779 /* Signed min/max */
780 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
781 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
782 
783 /*
784  * load and store whole register instructions
785  */
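/*
 * These helpers transfer NF * VLEN bits regardless of vl: the effective
 * element count is nf * (vlenb >> log2_esz).  A non-zero vstart is honoured,
 * using the same first/second page split as the unit-stride helpers.
 */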
786 static inline QEMU_ALWAYS_INLINE void
787 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
788                 vext_ldst_elem_fn_tlb *ldst_tlb,
789                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
790                 uintptr_t ra, bool is_load)
791 {
792     target_ulong page_split, elems, addr;
793     uint32_t nf = vext_nf(desc);
794     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
795     uint32_t max_elems = vlenb >> log2_esz;
796     uint32_t evl = nf * max_elems;
797     uint32_t esz = 1 << log2_esz;
798     int mmu_index = riscv_env_mmu_index(env, false);
799 
800     /* Calculate the page range of first page */
801     addr = base + (env->vstart << log2_esz);
802     page_split = -(addr | TARGET_PAGE_MASK);
803     /* Get number of elements */
804     elems = page_split / esz;
805     if (unlikely(env->vstart + elems >= evl)) {
806         elems = evl - env->vstart;
807     }
808 
809     /* Load/store elements in the first page */
810     if (likely(elems)) {
811         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
812                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
813     }
814 
815     /* Load/store elements in the second page */
816     if (unlikely(env->vstart < evl)) {
817         /* Cross page element */
818         if (unlikely(page_split % esz)) {
819             addr = base + (env->vstart << log2_esz);
820             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
821             env->vstart++;
822         }
823 
824         addr = base + (env->vstart << log2_esz);
825         /* Get number of elements of second page */
826         elems = evl - env->vstart;
827 
828         /* Load/store elements in the second page */
829         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
830                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
831     }
832 
833     env->vstart = 0;
834 }
835 
836 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
837 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
838                   uint32_t desc)                                    \
839 {                                                                   \
840     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
841                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
842 }
843 
844 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
845 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
846 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
847 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
848 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
849 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
850 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
851 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
852 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
853 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
854 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
855 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
856 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
857 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
858 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
859 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
860 
861 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
862 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
863                   uint32_t desc)                                        \
864 {                                                                       \
865     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
866                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
867 }
868 
869 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
870 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
871 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
872 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
873 
874 /*
875  * Vector Integer Arithmetic Instructions
876  */
877 
878 /* (TD, T1, T2, TX1, TX2) */
879 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
880 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
881 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
882 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
883 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
884 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
885 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
886 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
887 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
888 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
889 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
890 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
891 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
892 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
893 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
894 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
895 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
896 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
897 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
898 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
899 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
900 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
901 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
902 
903 #define DO_SUB(N, M) (N - M)
904 #define DO_RSUB(N, M) (M - N)
905 
906 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
907 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
908 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
909 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
910 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
911 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
912 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
913 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
914 
915 GEN_VEXT_VV(vadd_vv_b, 1)
916 GEN_VEXT_VV(vadd_vv_h, 2)
917 GEN_VEXT_VV(vadd_vv_w, 4)
918 GEN_VEXT_VV(vadd_vv_d, 8)
919 GEN_VEXT_VV(vsub_vv_b, 1)
920 GEN_VEXT_VV(vsub_vv_h, 2)
921 GEN_VEXT_VV(vsub_vv_w, 4)
922 GEN_VEXT_VV(vsub_vv_d, 8)
923 
924 
925 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
926 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
927 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
928 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
929 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
930 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
931 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
932 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
933 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
934 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
935 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
936 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
937 
938 GEN_VEXT_VX(vadd_vx_b, 1)
939 GEN_VEXT_VX(vadd_vx_h, 2)
940 GEN_VEXT_VX(vadd_vx_w, 4)
941 GEN_VEXT_VX(vadd_vx_d, 8)
942 GEN_VEXT_VX(vsub_vx_b, 1)
943 GEN_VEXT_VX(vsub_vx_h, 2)
944 GEN_VEXT_VX(vsub_vx_w, 4)
945 GEN_VEXT_VX(vsub_vx_d, 8)
946 GEN_VEXT_VX(vrsub_vx_b, 1)
947 GEN_VEXT_VX(vrsub_vx_h, 2)
948 GEN_VEXT_VX(vrsub_vx_w, 4)
949 GEN_VEXT_VX(vrsub_vx_d, 8)
950 
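/*
 * gvec-style helpers used for the vrsub.vx/vrsub.vi expansions: compute the
 * scalar b minus each element of a over the whole operation size.
 */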
951 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
952 {
953     intptr_t oprsz = simd_oprsz(desc);
954     intptr_t i;
955 
956     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
957         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
958     }
959 }
960 
961 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
962 {
963     intptr_t oprsz = simd_oprsz(desc);
964     intptr_t i;
965 
966     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
967         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
968     }
969 }
970 
971 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
972 {
973     intptr_t oprsz = simd_oprsz(desc);
974     intptr_t i;
975 
976     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
977         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
978     }
979 }
980 
981 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
982 {
983     intptr_t oprsz = simd_oprsz(desc);
984     intptr_t i;
985 
986     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
987         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
988     }
989 }
990 
991 /* Vector Widening Integer Add/Subtract */
992 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
993 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
994 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
995 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
996 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
997 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
998 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
999 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1000 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1001 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1002 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1003 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1004 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1005 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1006 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1007 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1008 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1009 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1010 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1011 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1012 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1013 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1014 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1015 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1016 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1017 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1018 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1019 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1020 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1021 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1022 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1023 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1024 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1025 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1026 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1027 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1028 GEN_VEXT_VV(vwaddu_vv_b, 2)
1029 GEN_VEXT_VV(vwaddu_vv_h, 4)
1030 GEN_VEXT_VV(vwaddu_vv_w, 8)
1031 GEN_VEXT_VV(vwsubu_vv_b, 2)
1032 GEN_VEXT_VV(vwsubu_vv_h, 4)
1033 GEN_VEXT_VV(vwsubu_vv_w, 8)
1034 GEN_VEXT_VV(vwadd_vv_b, 2)
1035 GEN_VEXT_VV(vwadd_vv_h, 4)
1036 GEN_VEXT_VV(vwadd_vv_w, 8)
1037 GEN_VEXT_VV(vwsub_vv_b, 2)
1038 GEN_VEXT_VV(vwsub_vv_h, 4)
1039 GEN_VEXT_VV(vwsub_vv_w, 8)
1040 GEN_VEXT_VV(vwaddu_wv_b, 2)
1041 GEN_VEXT_VV(vwaddu_wv_h, 4)
1042 GEN_VEXT_VV(vwaddu_wv_w, 8)
1043 GEN_VEXT_VV(vwsubu_wv_b, 2)
1044 GEN_VEXT_VV(vwsubu_wv_h, 4)
1045 GEN_VEXT_VV(vwsubu_wv_w, 8)
1046 GEN_VEXT_VV(vwadd_wv_b, 2)
1047 GEN_VEXT_VV(vwadd_wv_h, 4)
1048 GEN_VEXT_VV(vwadd_wv_w, 8)
1049 GEN_VEXT_VV(vwsub_wv_b, 2)
1050 GEN_VEXT_VV(vwsub_wv_h, 4)
1051 GEN_VEXT_VV(vwsub_wv_w, 8)
1052 
1053 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1054 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1055 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1056 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1057 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1058 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1059 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1060 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1061 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1062 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1063 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1064 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1065 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1066 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1067 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1068 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1069 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1070 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1071 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1072 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1073 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1074 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1075 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1076 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1077 GEN_VEXT_VX(vwaddu_vx_b, 2)
1078 GEN_VEXT_VX(vwaddu_vx_h, 4)
1079 GEN_VEXT_VX(vwaddu_vx_w, 8)
1080 GEN_VEXT_VX(vwsubu_vx_b, 2)
1081 GEN_VEXT_VX(vwsubu_vx_h, 4)
1082 GEN_VEXT_VX(vwsubu_vx_w, 8)
1083 GEN_VEXT_VX(vwadd_vx_b, 2)
1084 GEN_VEXT_VX(vwadd_vx_h, 4)
1085 GEN_VEXT_VX(vwadd_vx_w, 8)
1086 GEN_VEXT_VX(vwsub_vx_b, 2)
1087 GEN_VEXT_VX(vwsub_vx_h, 4)
1088 GEN_VEXT_VX(vwsub_vx_w, 8)
1089 GEN_VEXT_VX(vwaddu_wx_b, 2)
1090 GEN_VEXT_VX(vwaddu_wx_h, 4)
1091 GEN_VEXT_VX(vwaddu_wx_w, 8)
1092 GEN_VEXT_VX(vwsubu_wx_b, 2)
1093 GEN_VEXT_VX(vwsubu_wx_h, 4)
1094 GEN_VEXT_VX(vwsubu_wx_w, 8)
1095 GEN_VEXT_VX(vwadd_wx_b, 2)
1096 GEN_VEXT_VX(vwadd_wx_h, 4)
1097 GEN_VEXT_VX(vwadd_wx_w, 8)
1098 GEN_VEXT_VX(vwsub_wx_b, 2)
1099 GEN_VEXT_VX(vwsub_wx_h, 4)
1100 GEN_VEXT_VX(vwsub_wx_w, 8)
1101 
1102 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1103 #define DO_VADC(N, M, C) (N + M + C)
1104 #define DO_VSBC(N, M, C) (N - M - C)
1105 
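/*
 * vadc/vsbc are always unmasked: bit i of v0 supplies the carry/borrow-in
 * for element i and every element of the destination is written.
 */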
1106 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1107 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1108                   CPURISCVState *env, uint32_t desc)          \
1109 {                                                             \
1110     uint32_t vl = env->vl;                                    \
1111     uint32_t esz = sizeof(ETYPE);                             \
1112     uint32_t total_elems =                                    \
1113         vext_get_total_elems(env, desc, esz);                 \
1114     uint32_t vta = vext_vta(desc);                            \
1115     uint32_t i;                                               \
1116                                                               \
1117     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1118                                                               \
1119     for (i = env->vstart; i < vl; i++) {                      \
1120         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1121         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1122         ETYPE carry = vext_elem_mask(v0, i);                  \
1123                                                               \
1124         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1125     }                                                         \
1126     env->vstart = 0;                                          \
1127     /* set tail elements to 1s */                             \
1128     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1129 }
1130 
1131 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1132 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1133 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1134 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1135 
1136 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1137 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1138 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1139 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1140 
1141 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1142 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1143                   CPURISCVState *env, uint32_t desc)                     \
1144 {                                                                        \
1145     uint32_t vl = env->vl;                                               \
1146     uint32_t esz = sizeof(ETYPE);                                        \
1147     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1148     uint32_t vta = vext_vta(desc);                                       \
1149     uint32_t i;                                                          \
1150                                                                          \
1151     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1152                                                                          \
1153     for (i = env->vstart; i < vl; i++) {                                 \
1154         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1155         ETYPE carry = vext_elem_mask(v0, i);                             \
1156                                                                          \
1157         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1158     }                                                                    \
1159     env->vstart = 0;                                                     \
1160     /* set tail elements to 1s */                                        \
1161     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1162 }
1163 
1164 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1165 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1166 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1167 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1168 
1169 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1170 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1171 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1172 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1173 
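/*
 * DO_MADC/DO_MSBC compute the carry-out of N + M + C and the borrow-out of
 * N - M - C respectively, using only comparisons on the (unsigned) element
 * type so that no wider intermediate type is needed.
 */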
1174 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1175                           (__typeof(N))(N + M) < N)
1176 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
1177 
1178 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1179 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1180                   CPURISCVState *env, uint32_t desc)          \
1181 {                                                             \
1182     uint32_t vl = env->vl;                                    \
1183     uint32_t vm = vext_vm(desc);                              \
1184     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1185     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1186     uint32_t i;                                               \
1187                                                               \
1188     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1189                                                               \
1190     for (i = env->vstart; i < vl; i++) {                      \
1191         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1192         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1193         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1194         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1195     }                                                         \
1196     env->vstart = 0;                                          \
1197     /*
1198      * the mask destination register is always tail-agnostic;
1199      * set tail elements to 1s
1200      */                                                       \
1201     if (vta_all_1s) {                                         \
1202         for (; i < total_elems; i++) {                        \
1203             vext_set_elem_mask(vd, i, 1);                     \
1204         }                                                     \
1205     }                                                         \
1206 }
1207 
1208 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1209 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1210 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1211 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1212 
1213 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1214 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1215 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1216 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1217 
1218 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1219 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1220                   void *vs2, CPURISCVState *env, uint32_t desc) \
1221 {                                                               \
1222     uint32_t vl = env->vl;                                      \
1223     uint32_t vm = vext_vm(desc);                                \
1224     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1225     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1226     uint32_t i;                                                 \
1227                                                                 \
1228     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1229                                                                 \
1230     for (i = env->vstart; i < vl; i++) {                        \
1231         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1232         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1233         vext_set_elem_mask(vd, i,                               \
1234                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1235     }                                                           \
1236     env->vstart = 0;                                            \
1237     /*
1238      * the mask destination register is always tail-agnostic;
1239      * set tail elements to 1s
1240      */                                                         \
1241     if (vta_all_1s) {                                           \
1242         for (; i < total_elems; i++) {                          \
1243             vext_set_elem_mask(vd, i, 1);                       \
1244         }                                                       \
1245     }                                                           \
1246 }
1247 
1248 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1249 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1250 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1251 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1252 
1253 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1254 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1255 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1256 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1257 
1258 /* Vector Bitwise Logical Instructions */
1259 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1260 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1261 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1262 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1263 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1264 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1265 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1266 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1267 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1268 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1269 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1270 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1271 GEN_VEXT_VV(vand_vv_b, 1)
1272 GEN_VEXT_VV(vand_vv_h, 2)
1273 GEN_VEXT_VV(vand_vv_w, 4)
1274 GEN_VEXT_VV(vand_vv_d, 8)
1275 GEN_VEXT_VV(vor_vv_b, 1)
1276 GEN_VEXT_VV(vor_vv_h, 2)
1277 GEN_VEXT_VV(vor_vv_w, 4)
1278 GEN_VEXT_VV(vor_vv_d, 8)
1279 GEN_VEXT_VV(vxor_vv_b, 1)
1280 GEN_VEXT_VV(vxor_vv_h, 2)
1281 GEN_VEXT_VV(vxor_vv_w, 4)
1282 GEN_VEXT_VV(vxor_vv_d, 8)
1283 
1284 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1285 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1286 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1287 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1288 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1289 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1290 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1291 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1292 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1293 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1294 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1295 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1296 GEN_VEXT_VX(vand_vx_b, 1)
1297 GEN_VEXT_VX(vand_vx_h, 2)
1298 GEN_VEXT_VX(vand_vx_w, 4)
1299 GEN_VEXT_VX(vand_vx_d, 8)
1300 GEN_VEXT_VX(vor_vx_b, 1)
1301 GEN_VEXT_VX(vor_vx_h, 2)
1302 GEN_VEXT_VX(vor_vx_w, 4)
1303 GEN_VEXT_VX(vor_vx_d, 8)
1304 GEN_VEXT_VX(vxor_vx_b, 1)
1305 GEN_VEXT_VX(vxor_vx_h, 2)
1306 GEN_VEXT_VX(vxor_vx_w, 4)
1307 GEN_VEXT_VX(vxor_vx_d, 8)
1308 
1309 /* Vector Single-Width Bit Shift Instructions */
1310 #define DO_SLL(N, M)  (N << (M))
1311 #define DO_SRL(N, M)  (N >> (M))
1312 
1313 /* generate the helpers for shift instructions with two vector operands */
1314 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1315 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1316                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1317 {                                                                         \
1318     uint32_t vm = vext_vm(desc);                                          \
1319     uint32_t vl = env->vl;                                                \
1320     uint32_t esz = sizeof(TS1);                                           \
1321     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1322     uint32_t vta = vext_vta(desc);                                        \
1323     uint32_t vma = vext_vma(desc);                                        \
1324     uint32_t i;                                                           \
1325                                                                           \
1326     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1327                                                                           \
1328     for (i = env->vstart; i < vl; i++) {                                  \
1329         if (!vm && !vext_elem_mask(v0, i)) {                              \
1330             /* set masked-off elements to 1s */                           \
1331             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1332             continue;                                                     \
1333         }                                                                 \
1334         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1335         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1336         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1337     }                                                                     \
1338     env->vstart = 0;                                                      \
1339     /* set tail elements to 1s */                                         \
1340     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1341 }
1342 
1343 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1344 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1345 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1346 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1347 
1348 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1349 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1350 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1351 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1352 
1353 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1354 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1355 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1356 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
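
/*
 * Note: vsra reuses DO_SRL; the arithmetic behaviour comes from TS2 being a
 * signed type, so OP(s2, s1 & MASK) right-shifts a signed value (the helpers
 * rely on the compiler implementing signed '>>' as an arithmetic shift, as
 * QEMU assumes elsewhere).  MASK keeps only the low log2(SEW) bits of the
 * shift amount, e.g. for SEW=8:
 *
 *     DO_SRL((int8_t)-8, 9 & 0x7)  ->  -8 >> 1  ==  -4
 */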
1357 
1358 /*
1359  * generate the helpers for shift instructions with one vector and one scalar
1360  */
1361 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1362 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1363                   void *vs2, CPURISCVState *env,            \
1364                   uint32_t desc)                            \
1365 {                                                           \
1366     uint32_t vm = vext_vm(desc);                            \
1367     uint32_t vl = env->vl;                                  \
1368     uint32_t esz = sizeof(TD);                              \
1369     uint32_t total_elems =                                  \
1370         vext_get_total_elems(env, desc, esz);               \
1371     uint32_t vta = vext_vta(desc);                          \
1372     uint32_t vma = vext_vma(desc);                          \
1373     uint32_t i;                                             \
1374                                                             \
1375     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1376                                                             \
1377     for (i = env->vstart; i < vl; i++) {                    \
1378         if (!vm && !vext_elem_mask(v0, i)) {                \
1379             /* set masked-off elements to 1s */             \
1380             vext_set_elems_1s(vd, vma, i * esz,             \
1381                               (i + 1) * esz);               \
1382             continue;                                       \
1383         }                                                   \
1384         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1385         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1386     }                                                       \
1387     env->vstart = 0;                                        \
1388     /* set tail elements to 1s */                           \
1389     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1390 }
1391 
1392 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1393 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1394 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1395 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1396 
1397 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1398 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1399 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1400 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1401 
1402 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1403 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1404 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1405 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1406 
1407 /* Vector Narrowing Integer Right Shift Instructions */
1408 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1409 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1410 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1411 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1412 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1413 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1414 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1415 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1416 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1417 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1418 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1419 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
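
/*
 * Illustration: the narrowing shifts reuse the same macros with a source
 * type twice as wide as the destination, and the shift amount is taken
 * modulo 2*SEW (masks 0xf/0x1f/0x3f).  The store through the narrower
 * destination type performs the truncation, e.g. for vnsrl with SEW=8:
 *
 *     (uint8_t)DO_SRL((uint16_t)0x1234, 4 & 0xf)  ==  0x23
 */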
1420 
1421 /* Vector Integer Comparison Instructions */
1422 #define DO_MSEQ(N, M) (N == M)
1423 #define DO_MSNE(N, M) (N != M)
1424 #define DO_MSLT(N, M) (N < M)
1425 #define DO_MSLE(N, M) (N <= M)
1426 #define DO_MSGT(N, M) (N > M)
1427 
1428 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1429 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1430                   CPURISCVState *env, uint32_t desc)          \
1431 {                                                             \
1432     uint32_t vm = vext_vm(desc);                              \
1433     uint32_t vl = env->vl;                                    \
1434     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1435     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1436     uint32_t vma = vext_vma(desc);                            \
1437     uint32_t i;                                               \
1438                                                               \
1439     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1440                                                               \
1441     for (i = env->vstart; i < vl; i++) {                      \
1442         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1443         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1444         if (!vm && !vext_elem_mask(v0, i)) {                  \
1445             /* set masked-off elements to 1s */               \
1446             if (vma) {                                        \
1447                 vext_set_elem_mask(vd, i, 1);                 \
1448             }                                                 \
1449             continue;                                         \
1450         }                                                     \
1451         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1452     }                                                         \
1453     env->vstart = 0;                                          \
1454     /*
1455      * mask destination register is always tail-agnostic
1456      * set tail elements to 1s
1457      */                                                       \
1458     if (vta_all_1s) {                                         \
1459         for (; i < total_elems; i++) {                        \
1460             vext_set_elem_mask(vd, i, 1);                     \
1461         }                                                     \
1462     }                                                         \
1463 }
1464 
1465 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1466 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1467 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1468 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1469 
1470 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1471 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1472 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1473 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1474 
1475 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1476 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1477 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1478 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1479 
1480 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1481 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1482 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1483 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1484 
1485 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1486 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1487 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1488 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1489 
1490 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1491 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1492 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1493 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1494 
1495 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1496 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1497                   CPURISCVState *env, uint32_t desc)                \
1498 {                                                                   \
1499     uint32_t vm = vext_vm(desc);                                    \
1500     uint32_t vl = env->vl;                                          \
1501     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1502     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1503     uint32_t vma = vext_vma(desc);                                  \
1504     uint32_t i;                                                     \
1505                                                                     \
1506     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1507                                                                     \
1508     for (i = env->vstart; i < vl; i++) {                            \
1509         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1510         if (!vm && !vext_elem_mask(v0, i)) {                        \
1511             /* set masked-off elements to 1s */                     \
1512             if (vma) {                                              \
1513                 vext_set_elem_mask(vd, i, 1);                       \
1514             }                                                       \
1515             continue;                                               \
1516         }                                                           \
1517         vext_set_elem_mask(vd, i,                                   \
1518                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1519     }                                                               \
1520     env->vstart = 0;                                                \
1521     /*
1522      * mask destination register is always tail-agnostic
1523      * set tail elements to 1s
1524      */                                                             \
1525     if (vta_all_1s) {                                               \
1526         for (; i < total_elems; i++) {                              \
1527             vext_set_elem_mask(vd, i, 1);                           \
1528         }                                                           \
1529     }                                                               \
1530 }
1531 
1532 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1533 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1534 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1535 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1536 
1537 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1538 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1539 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1540 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1541 
1542 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1543 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1544 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1545 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1546 
1547 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1548 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1549 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1550 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1551 
1552 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1553 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1554 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1555 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1556 
1557 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1558 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1559 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1560 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1561 
1562 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1563 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1564 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1565 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1566 
1567 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1568 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1569 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1570 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1571 
1572 /* Vector Integer Min/Max Instructions */
1573 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1574 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1575 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1576 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1577 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1578 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1579 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1580 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1581 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1582 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1583 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1584 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1585 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1586 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1587 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1588 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1589 GEN_VEXT_VV(vminu_vv_b, 1)
1590 GEN_VEXT_VV(vminu_vv_h, 2)
1591 GEN_VEXT_VV(vminu_vv_w, 4)
1592 GEN_VEXT_VV(vminu_vv_d, 8)
1593 GEN_VEXT_VV(vmin_vv_b, 1)
1594 GEN_VEXT_VV(vmin_vv_h, 2)
1595 GEN_VEXT_VV(vmin_vv_w, 4)
1596 GEN_VEXT_VV(vmin_vv_d, 8)
1597 GEN_VEXT_VV(vmaxu_vv_b, 1)
1598 GEN_VEXT_VV(vmaxu_vv_h, 2)
1599 GEN_VEXT_VV(vmaxu_vv_w, 4)
1600 GEN_VEXT_VV(vmaxu_vv_d, 8)
1601 GEN_VEXT_VV(vmax_vv_b, 1)
1602 GEN_VEXT_VV(vmax_vv_h, 2)
1603 GEN_VEXT_VV(vmax_vv_w, 4)
1604 GEN_VEXT_VV(vmax_vv_d, 8)
1605 
1606 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1607 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1608 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1609 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1610 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1611 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1612 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1613 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1614 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1615 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1616 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1617 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1618 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1619 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1620 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1621 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1622 GEN_VEXT_VX(vminu_vx_b, 1)
1623 GEN_VEXT_VX(vminu_vx_h, 2)
1624 GEN_VEXT_VX(vminu_vx_w, 4)
1625 GEN_VEXT_VX(vminu_vx_d, 8)
1626 GEN_VEXT_VX(vmin_vx_b, 1)
1627 GEN_VEXT_VX(vmin_vx_h, 2)
1628 GEN_VEXT_VX(vmin_vx_w, 4)
1629 GEN_VEXT_VX(vmin_vx_d, 8)
1630 GEN_VEXT_VX(vmaxu_vx_b, 1)
1631 GEN_VEXT_VX(vmaxu_vx_h, 2)
1632 GEN_VEXT_VX(vmaxu_vx_w, 4)
1633 GEN_VEXT_VX(vmaxu_vx_d, 8)
1634 GEN_VEXT_VX(vmax_vx_b, 1)
1635 GEN_VEXT_VX(vmax_vx_h, 2)
1636 GEN_VEXT_VX(vmax_vx_w, 4)
1637 GEN_VEXT_VX(vmax_vx_d, 8)
1638 
1639 /* Vector Single-Width Integer Multiply Instructions */
1640 #define DO_MUL(N, M) (N * M)
1641 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1642 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1643 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1644 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1645 GEN_VEXT_VV(vmul_vv_b, 1)
1646 GEN_VEXT_VV(vmul_vv_h, 2)
1647 GEN_VEXT_VV(vmul_vv_w, 4)
1648 GEN_VEXT_VV(vmul_vv_d, 8)
1649 
1650 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1651 {
1652     return (int16_t)s2 * (int16_t)s1 >> 8;
1653 }
1654 
1655 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1656 {
1657     return (int32_t)s2 * (int32_t)s1 >> 16;
1658 }
1659 
1660 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1661 {
1662     return (int64_t)s2 * (int64_t)s1 >> 32;
1663 }
1664 
1665 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1666 {
1667     uint64_t hi_64, lo_64;
1668 
1669     muls64(&lo_64, &hi_64, s1, s2);
1670     return hi_64;
1671 }
1672 
1673 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1674 {
1675     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1676 }
1677 
1678 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1679 {
1680     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1681 }
1682 
1683 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1684 {
1685     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1686 }
1687 
1688 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1689 {
1690     uint64_t hi_64, lo_64;
1691 
1692     mulu64(&lo_64, &hi_64, s2, s1);
1693     return hi_64;
1694 }
1695 
1696 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1697 {
1698     return (int16_t)s2 * (uint16_t)s1 >> 8;
1699 }
1700 
1701 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1702 {
1703     return (int32_t)s2 * (uint32_t)s1 >> 16;
1704 }
1705 
1706 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1707 {
1708     return (int64_t)s2 * (uint64_t)s1 >> 32;
1709 }
1710 
1711 /*
1712  * Let  A = signed operand (s2),
1713  *      B = unsigned operand (s1),
1714  *      P = mulu64(A, B), the 128-bit product of the unsigned
1715  *          bit patterns of A and B,
1716  *      SP = A * B, the 128-bit signed product.
1717  *
1718  * IF A >= 0
1719  *      the bit pattern of A equals A, so SP = P.
1720  * IF A < 0
1721  *      the bit pattern of A equals A + 2 ** 64, so
1722  *          P  = (A + 2 ** 64) * B = SP + 2 ** 64 * B
1723  *          SP = P - 2 ** 64 * B
1724  *      and only the upper 64 bits differ: HI(SP) = HI(P) - B.
1725  *
1726  * HENCE
1727  *      HI_P -= (A < 0 ? B : 0)
1728  */
1729 
1730 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1731 {
1732     uint64_t hi_64, lo_64;
1733 
1734     mulu64(&lo_64, &hi_64, s2, s1);
1735 
1736     hi_64 -= s2 < 0 ? s1 : 0;
1737     return hi_64;
1738 }
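
/*
 * Worked example of the adjustment above (values chosen for illustration):
 * s2 = -1, s1 = 3.  mulu64 sees the bit pattern 2^64 - 1, giving
 * hi_64 = 2 and lo_64 = 2^64 - 3.  Subtracting s1 gives hi_64 = -1, which
 * together with lo_64 is the 128-bit two's complement representation of -3.
 */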
1739 
1740 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1741 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1742 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1743 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1744 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1745 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1746 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1747 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1748 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1749 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1750 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1751 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1752 GEN_VEXT_VV(vmulh_vv_b, 1)
1753 GEN_VEXT_VV(vmulh_vv_h, 2)
1754 GEN_VEXT_VV(vmulh_vv_w, 4)
1755 GEN_VEXT_VV(vmulh_vv_d, 8)
1756 GEN_VEXT_VV(vmulhu_vv_b, 1)
1757 GEN_VEXT_VV(vmulhu_vv_h, 2)
1758 GEN_VEXT_VV(vmulhu_vv_w, 4)
1759 GEN_VEXT_VV(vmulhu_vv_d, 8)
1760 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1761 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1762 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1763 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1764 
1765 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1766 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1767 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1768 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1769 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1770 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1771 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1772 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1773 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1774 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1775 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1776 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1777 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1778 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1779 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1780 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1781 GEN_VEXT_VX(vmul_vx_b, 1)
1782 GEN_VEXT_VX(vmul_vx_h, 2)
1783 GEN_VEXT_VX(vmul_vx_w, 4)
1784 GEN_VEXT_VX(vmul_vx_d, 8)
1785 GEN_VEXT_VX(vmulh_vx_b, 1)
1786 GEN_VEXT_VX(vmulh_vx_h, 2)
1787 GEN_VEXT_VX(vmulh_vx_w, 4)
1788 GEN_VEXT_VX(vmulh_vx_d, 8)
1789 GEN_VEXT_VX(vmulhu_vx_b, 1)
1790 GEN_VEXT_VX(vmulhu_vx_h, 2)
1791 GEN_VEXT_VX(vmulhu_vx_w, 4)
1792 GEN_VEXT_VX(vmulhu_vx_d, 8)
1793 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1794 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1795 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1796 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1797 
1798 /* Vector Integer Divide Instructions */
1799 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1800 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1801 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1802         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1803 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1804         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
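
/*
 * These follow the RISC-V rules for division corner cases, e.g. for SEW=8
 * (illustrative values):
 *
 *     DO_DIVU(x, 0)        -> 0xff (all ones)
 *     DO_REMU(x, 0)        -> x
 *     DO_DIV(INT8_MIN, -1) -> INT8_MIN   (overflow, detected via N == -N)
 *     DO_REM(INT8_MIN, -1) -> 0
 *
 * N == -N also holds for N == 0, which is harmless since 0 / -1 == 0.
 */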
1805 
1806 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1807 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1808 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1809 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1810 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1811 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1812 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1813 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1814 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1815 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1816 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1817 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1818 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1819 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1820 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1821 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1822 GEN_VEXT_VV(vdivu_vv_b, 1)
1823 GEN_VEXT_VV(vdivu_vv_h, 2)
1824 GEN_VEXT_VV(vdivu_vv_w, 4)
1825 GEN_VEXT_VV(vdivu_vv_d, 8)
1826 GEN_VEXT_VV(vdiv_vv_b, 1)
1827 GEN_VEXT_VV(vdiv_vv_h, 2)
1828 GEN_VEXT_VV(vdiv_vv_w, 4)
1829 GEN_VEXT_VV(vdiv_vv_d, 8)
1830 GEN_VEXT_VV(vremu_vv_b, 1)
1831 GEN_VEXT_VV(vremu_vv_h, 2)
1832 GEN_VEXT_VV(vremu_vv_w, 4)
1833 GEN_VEXT_VV(vremu_vv_d, 8)
1834 GEN_VEXT_VV(vrem_vv_b, 1)
1835 GEN_VEXT_VV(vrem_vv_h, 2)
1836 GEN_VEXT_VV(vrem_vv_w, 4)
1837 GEN_VEXT_VV(vrem_vv_d, 8)
1838 
1839 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1840 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1841 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1842 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1843 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1844 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1845 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1846 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1847 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1848 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1849 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1850 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1851 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1852 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1853 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1854 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1855 GEN_VEXT_VX(vdivu_vx_b, 1)
1856 GEN_VEXT_VX(vdivu_vx_h, 2)
1857 GEN_VEXT_VX(vdivu_vx_w, 4)
1858 GEN_VEXT_VX(vdivu_vx_d, 8)
1859 GEN_VEXT_VX(vdiv_vx_b, 1)
1860 GEN_VEXT_VX(vdiv_vx_h, 2)
1861 GEN_VEXT_VX(vdiv_vx_w, 4)
1862 GEN_VEXT_VX(vdiv_vx_d, 8)
1863 GEN_VEXT_VX(vremu_vx_b, 1)
1864 GEN_VEXT_VX(vremu_vx_h, 2)
1865 GEN_VEXT_VX(vremu_vx_w, 4)
1866 GEN_VEXT_VX(vremu_vx_d, 8)
1867 GEN_VEXT_VX(vrem_vx_b, 1)
1868 GEN_VEXT_VX(vrem_vx_h, 2)
1869 GEN_VEXT_VX(vrem_vx_w, 4)
1870 GEN_VEXT_VX(vrem_vx_d, 8)
1871 
1872 /* Vector Widening Integer Multiply Instructions */
1873 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1874 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1875 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1876 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1877 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1878 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1879 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1880 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1881 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1882 GEN_VEXT_VV(vwmul_vv_b, 2)
1883 GEN_VEXT_VV(vwmul_vv_h, 4)
1884 GEN_VEXT_VV(vwmul_vv_w, 8)
1885 GEN_VEXT_VV(vwmulu_vv_b, 2)
1886 GEN_VEXT_VV(vwmulu_vv_h, 4)
1887 GEN_VEXT_VV(vwmulu_vv_w, 8)
1888 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1889 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1890 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1891 
1892 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1893 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1894 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1895 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1896 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1897 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1898 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1899 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1900 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1901 GEN_VEXT_VX(vwmul_vx_b, 2)
1902 GEN_VEXT_VX(vwmul_vx_h, 4)
1903 GEN_VEXT_VX(vwmul_vx_w, 8)
1904 GEN_VEXT_VX(vwmulu_vx_b, 2)
1905 GEN_VEXT_VX(vwmulu_vx_h, 4)
1906 GEN_VEXT_VX(vwmulu_vx_w, 8)
1907 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1908 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1909 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1910 
1911 /* Vector Single-Width Integer Multiply-Add Instructions */
1912 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1913 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1914 {                                                                  \
1915     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1916     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1917     TD d = *((TD *)vd + HD(i));                                    \
1918     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1919 }
1920 
1921 #define DO_MACC(N, M, D) (M * N + D)
1922 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1923 #define DO_MADD(N, M, D) (M * D + N)
1924 #define DO_NMSUB(N, M, D) (-(M * D) + N)
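
/*
 * In OPIVV3 (and OPIVX3 below) the operands arrive as OP(s2, s1, d), i.e.
 * N = vs2 element, M = vs1 element (or scalar), D = current vd element.
 * So DO_MACC/DO_NMSAC accumulate into vd (vd = +/-(vs1 * vs2) + vd), while
 * DO_MADD/DO_NMSUB overwrite the multiplicand (vd = +/-(vs1 * vd) + vs2),
 * matching the vmacc/vnmsac and vmadd/vnmsub definitions.
 */
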
1925 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1926 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1927 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1928 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1929 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1930 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1931 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1932 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1933 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1934 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1935 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1936 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1937 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1938 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1939 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1940 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1941 GEN_VEXT_VV(vmacc_vv_b, 1)
1942 GEN_VEXT_VV(vmacc_vv_h, 2)
1943 GEN_VEXT_VV(vmacc_vv_w, 4)
1944 GEN_VEXT_VV(vmacc_vv_d, 8)
1945 GEN_VEXT_VV(vnmsac_vv_b, 1)
1946 GEN_VEXT_VV(vnmsac_vv_h, 2)
1947 GEN_VEXT_VV(vnmsac_vv_w, 4)
1948 GEN_VEXT_VV(vnmsac_vv_d, 8)
1949 GEN_VEXT_VV(vmadd_vv_b, 1)
1950 GEN_VEXT_VV(vmadd_vv_h, 2)
1951 GEN_VEXT_VV(vmadd_vv_w, 4)
1952 GEN_VEXT_VV(vmadd_vv_d, 8)
1953 GEN_VEXT_VV(vnmsub_vv_b, 1)
1954 GEN_VEXT_VV(vnmsub_vv_h, 2)
1955 GEN_VEXT_VV(vnmsub_vv_w, 4)
1956 GEN_VEXT_VV(vnmsub_vv_d, 8)
1957 
1958 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1959 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1960 {                                                                   \
1961     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1962     TD d = *((TD *)vd + HD(i));                                     \
1963     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1964 }
1965 
1966 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1967 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1968 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1969 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1970 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1971 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1972 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1973 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1974 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1975 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1976 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1977 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1978 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1979 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1980 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1981 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1982 GEN_VEXT_VX(vmacc_vx_b, 1)
1983 GEN_VEXT_VX(vmacc_vx_h, 2)
1984 GEN_VEXT_VX(vmacc_vx_w, 4)
1985 GEN_VEXT_VX(vmacc_vx_d, 8)
1986 GEN_VEXT_VX(vnmsac_vx_b, 1)
1987 GEN_VEXT_VX(vnmsac_vx_h, 2)
1988 GEN_VEXT_VX(vnmsac_vx_w, 4)
1989 GEN_VEXT_VX(vnmsac_vx_d, 8)
1990 GEN_VEXT_VX(vmadd_vx_b, 1)
1991 GEN_VEXT_VX(vmadd_vx_h, 2)
1992 GEN_VEXT_VX(vmadd_vx_w, 4)
1993 GEN_VEXT_VX(vmadd_vx_d, 8)
1994 GEN_VEXT_VX(vnmsub_vx_b, 1)
1995 GEN_VEXT_VX(vnmsub_vx_h, 2)
1996 GEN_VEXT_VX(vnmsub_vx_w, 4)
1997 GEN_VEXT_VX(vnmsub_vx_d, 8)
1998 
1999 /* Vector Widening Integer Multiply-Add Instructions */
2000 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2001 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2002 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2003 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2004 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2005 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2006 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2007 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2008 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2009 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2010 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2011 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2012 GEN_VEXT_VV(vwmacc_vv_b, 2)
2013 GEN_VEXT_VV(vwmacc_vv_h, 4)
2014 GEN_VEXT_VV(vwmacc_vv_w, 8)
2015 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2016 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2017 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2018 
2019 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2020 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2021 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2022 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2023 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2024 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2025 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2026 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2027 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2028 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2029 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2030 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2031 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2032 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2033 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2034 GEN_VEXT_VX(vwmacc_vx_b, 2)
2035 GEN_VEXT_VX(vwmacc_vx_h, 4)
2036 GEN_VEXT_VX(vwmacc_vx_w, 8)
2037 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2038 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2039 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2040 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2041 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2042 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2043 
2044 /* Vector Integer Merge and Move Instructions */
2045 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2046 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2047                   uint32_t desc)                                     \
2048 {                                                                    \
2049     uint32_t vl = env->vl;                                           \
2050     uint32_t esz = sizeof(ETYPE);                                    \
2051     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2052     uint32_t vta = vext_vta(desc);                                   \
2053     uint32_t i;                                                      \
2054                                                                      \
2055     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2056                                                                      \
2057     for (i = env->vstart; i < vl; i++) {                             \
2058         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2059         *((ETYPE *)vd + H(i)) = s1;                                  \
2060     }                                                                \
2061     env->vstart = 0;                                                 \
2062     /* set tail elements to 1s */                                    \
2063     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2064 }
2065 
2066 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2067 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2068 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2069 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2070 
2071 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2072 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2073                   uint32_t desc)                                     \
2074 {                                                                    \
2075     uint32_t vl = env->vl;                                           \
2076     uint32_t esz = sizeof(ETYPE);                                    \
2077     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2078     uint32_t vta = vext_vta(desc);                                   \
2079     uint32_t i;                                                      \
2080                                                                      \
2081     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2082                                                                      \
2083     for (i = env->vstart; i < vl; i++) {                             \
2084         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2085     }                                                                \
2086     env->vstart = 0;                                                 \
2087     /* set tail elements to 1s */                                    \
2088     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2089 }
2090 
2091 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2092 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2093 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2094 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2095 
2096 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2097 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2098                   CPURISCVState *env, uint32_t desc)                 \
2099 {                                                                    \
2100     uint32_t vl = env->vl;                                           \
2101     uint32_t esz = sizeof(ETYPE);                                    \
2102     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2103     uint32_t vta = vext_vta(desc);                                   \
2104     uint32_t i;                                                      \
2105                                                                      \
2106     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2107                                                                      \
2108     for (i = env->vstart; i < vl; i++) {                             \
2109         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2110         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2111     }                                                                \
2112     env->vstart = 0;                                                 \
2113     /* set tail elements to 1s */                                    \
2114     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2115 }
2116 
2117 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2118 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2119 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2120 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2121 
2122 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2123 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2124                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2125 {                                                                    \
2126     uint32_t vl = env->vl;                                           \
2127     uint32_t esz = sizeof(ETYPE);                                    \
2128     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2129     uint32_t vta = vext_vta(desc);                                   \
2130     uint32_t i;                                                      \
2131                                                                      \
2132     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2133                                                                      \
2134     for (i = env->vstart; i < vl; i++) {                             \
2135         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2136         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2137                    (ETYPE)(target_long)s1);                          \
2138         *((ETYPE *)vd + H(i)) = d;                                   \
2139     }                                                                \
2140     env->vstart = 0;                                                 \
2141     /* set tail elements to 1s */                                    \
2142     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2143 }
2144 
2145 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2146 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2147 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2148 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2149 
2150 /*
2151  * Vector Fixed-Point Arithmetic Instructions
2152  */
2153 
2154 /* Vector Single-Width Saturating Add and Subtract */
2155 
2156 /*
2157  * As fixed-point instructions generally have a rounding mode and saturation,
2158  * define common macros for fixed point here.
2159  */
2160 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2161                           CPURISCVState *env, int vxrm);
2162 
2163 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2164 static inline void                                                  \
2165 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2166           CPURISCVState *env, int vxrm)                             \
2167 {                                                                   \
2168     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2169     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2170     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2171 }
2172 
2173 static inline void
2174 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2175              CPURISCVState *env,
2176              uint32_t vl, uint32_t vm, int vxrm,
2177              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2178 {
2179     for (uint32_t i = env->vstart; i < vl; i++) {
2180         if (!vm && !vext_elem_mask(v0, i)) {
2181             /* set masked-off elements to 1s */
2182             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2183             continue;
2184         }
2185         fn(vd, vs1, vs2, i, env, vxrm);
2186     }
2187     env->vstart = 0;
2188 }
2189 
2190 static inline void
2191 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2192              CPURISCVState *env,
2193              uint32_t desc,
2194              opivv2_rm_fn *fn, uint32_t esz)
2195 {
2196     uint32_t vm = vext_vm(desc);
2197     uint32_t vl = env->vl;
2198     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2199     uint32_t vta = vext_vta(desc);
2200     uint32_t vma = vext_vma(desc);
2201 
2202     VSTART_CHECK_EARLY_EXIT(env, vl);
2203 
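    /*
     * env->vxrm selects the fixed-point rounding mode for the whole loop:
     * 0 = round-to-nearest-up, 1 = round-to-nearest-even, 2 = round-down,
     * 3 = round-to-odd (see get_round() below); it is sampled once here
     * and handed to vext_vv_rm_1() as a constant.
     */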
2204     switch (env->vxrm) {
2205     case 0: /* rnu */
2206         vext_vv_rm_1(vd, v0, vs1, vs2,
2207                      env, vl, vm, 0, fn, vma, esz);
2208         break;
2209     case 1: /* rne */
2210         vext_vv_rm_1(vd, v0, vs1, vs2,
2211                      env, vl, vm, 1, fn, vma, esz);
2212         break;
2213     case 2: /* rdn */
2214         vext_vv_rm_1(vd, v0, vs1, vs2,
2215                      env, vl, vm, 2, fn, vma, esz);
2216         break;
2217     default: /* rod */
2218         vext_vv_rm_1(vd, v0, vs1, vs2,
2219                      env, vl, vm, 3, fn, vma, esz);
2220         break;
2221     }
2222     /* set tail elements to 1s */
2223     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2224 }
2225 
2226 /* generate helpers for fixed point instructions with OPIVV format */
2227 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2228 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2229                   CPURISCVState *env, uint32_t desc)            \
2230 {                                                               \
2231     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2232                  do_##NAME, ESZ);                               \
2233 }
2234 
2235 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2236                              uint8_t b)
2237 {
2238     uint8_t res = a + b;
2239     if (res < a) {
2240         res = UINT8_MAX;
2241         env->vxsat = 0x1;
2242     }
2243     return res;
2244 }
2245 
2246 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2247                                uint16_t b)
2248 {
2249     uint16_t res = a + b;
2250     if (res < a) {
2251         res = UINT16_MAX;
2252         env->vxsat = 0x1;
2253     }
2254     return res;
2255 }
2256 
2257 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2258                                uint32_t b)
2259 {
2260     uint32_t res = a + b;
2261     if (res < a) {
2262         res = UINT32_MAX;
2263         env->vxsat = 0x1;
2264     }
2265     return res;
2266 }
2267 
2268 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2269                                uint64_t b)
2270 {
2271     uint64_t res = a + b;
2272     if (res < a) {
2273         res = UINT64_MAX;
2274         env->vxsat = 0x1;
2275     }
2276     return res;
2277 }
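
/*
 * Unsigned wrap-around is detected by the sum being smaller than either
 * operand, e.g. saddu8(env, vxrm, 200, 100) wraps to 44 < 200, so the
 * result saturates to UINT8_MAX and vxsat is set.
 */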
2278 
2279 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2280 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2281 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2282 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2283 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2284 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2285 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2286 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2287 
2288 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2289                           CPURISCVState *env, int vxrm);
2290 
2291 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2292 static inline void                                                  \
2293 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2294           CPURISCVState *env, int vxrm)                             \
2295 {                                                                   \
2296     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2297     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2298 }
2299 
2300 static inline void
2301 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2302              CPURISCVState *env,
2303              uint32_t vl, uint32_t vm, int vxrm,
2304              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2305 {
2306     for (uint32_t i = env->vstart; i < vl; i++) {
2307         if (!vm && !vext_elem_mask(v0, i)) {
2308             /* set masked-off elements to 1s */
2309             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2310             continue;
2311         }
2312         fn(vd, s1, vs2, i, env, vxrm);
2313     }
2314     env->vstart = 0;
2315 }
2316 
2317 static inline void
2318 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2319              CPURISCVState *env,
2320              uint32_t desc,
2321              opivx2_rm_fn *fn, uint32_t esz)
2322 {
2323     uint32_t vm = vext_vm(desc);
2324     uint32_t vl = env->vl;
2325     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2326     uint32_t vta = vext_vta(desc);
2327     uint32_t vma = vext_vma(desc);
2328 
2329     VSTART_CHECK_EARLY_EXIT(env, vl);
2330 
2331     switch (env->vxrm) {
2332     case 0: /* rnu */
2333         vext_vx_rm_1(vd, v0, s1, vs2,
2334                      env, vl, vm, 0, fn, vma, esz);
2335         break;
2336     case 1: /* rne */
2337         vext_vx_rm_1(vd, v0, s1, vs2,
2338                      env, vl, vm, 1, fn, vma, esz);
2339         break;
2340     case 2: /* rdn */
2341         vext_vx_rm_1(vd, v0, s1, vs2,
2342                      env, vl, vm, 2, fn, vma, esz);
2343         break;
2344     default: /* rod */
2345         vext_vx_rm_1(vd, v0, s1, vs2,
2346                      env, vl, vm, 3, fn, vma, esz);
2347         break;
2348     }
2349     /* set tail elements to 1s */
2350     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2351 }
2352 
2353 /* generate helpers for fixed point instructions with OPIVX format */
2354 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2355 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2356                   void *vs2, CPURISCVState *env,          \
2357                   uint32_t desc)                          \
2358 {                                                         \
2359     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2360                  do_##NAME, ESZ);                         \
2361 }
2362 
2363 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2364 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2365 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2366 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2367 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2368 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2369 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2370 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2371 
2372 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2373 {
2374     int8_t res = a + b;
2375     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2376         res = a > 0 ? INT8_MAX : INT8_MIN;
2377         env->vxsat = 0x1;
2378     }
2379     return res;
2380 }
2381 
2382 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2383                              int16_t b)
2384 {
2385     int16_t res = a + b;
2386     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2387         res = a > 0 ? INT16_MAX : INT16_MIN;
2388         env->vxsat = 0x1;
2389     }
2390     return res;
2391 }
2392 
2393 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2394                              int32_t b)
2395 {
2396     int32_t res = a + b;
2397     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2398         res = a > 0 ? INT32_MAX : INT32_MIN;
2399         env->vxsat = 0x1;
2400     }
2401     return res;
2402 }
2403 
2404 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2405                              int64_t b)
2406 {
2407     int64_t res = a + b;
2408     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2409         res = a > 0 ? INT64_MAX : INT64_MIN;
2410         env->vxsat = 0x1;
2411     }
2412     return res;
2413 }
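
/*
 * (res ^ a) & (res ^ b) has the sign bit set exactly when a and b share a
 * sign and res does not, i.e. on signed overflow.  For example
 * sadd8(env, vxrm, 100, 100) wraps to -56, saturates to INT8_MAX and sets
 * vxsat; the saturation direction follows the sign of a.
 */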
2414 
2415 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2416 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2417 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2418 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2419 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2420 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2421 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2422 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2423 
2424 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2425 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2426 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2427 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2428 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2429 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2430 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2431 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2432 
2433 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2434                              uint8_t b)
2435 {
2436     uint8_t res = a - b;
2437     if (res > a) {
2438         res = 0;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2445                                uint16_t b)
2446 {
2447     uint16_t res = a - b;
2448     if (res > a) {
2449         res = 0;
2450         env->vxsat = 0x1;
2451     }
2452     return res;
2453 }
2454 
2455 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2456                                uint32_t b)
2457 {
2458     uint32_t res = a - b;
2459     if (res > a) {
2460         res = 0;
2461         env->vxsat = 0x1;
2462     }
2463     return res;
2464 }
2465 
2466 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2467                                uint64_t b)
2468 {
2469     uint64_t res = a - b;
2470     if (res > a) {
2471         res = 0;
2472         env->vxsat = 0x1;
2473     }
2474     return res;
2475 }
2476 
2477 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2478 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2479 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2480 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2481 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2482 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2483 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2484 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2485 
2486 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2487 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2488 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2489 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2490 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2491 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2492 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2493 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2494 
2495 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2496 {
2497     int8_t res = a - b;
2498     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2499         res = a >= 0 ? INT8_MAX : INT8_MIN;
2500         env->vxsat = 0x1;
2501     }
2502     return res;
2503 }
2504 
2505 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2506                              int16_t b)
2507 {
2508     int16_t res = a - b;
2509     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2510         res = a >= 0 ? INT16_MAX : INT16_MIN;
2511         env->vxsat = 0x1;
2512     }
2513     return res;
2514 }
2515 
2516 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2517                              int32_t b)
2518 {
2519     int32_t res = a - b;
2520     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2521         res = a >= 0 ? INT32_MAX : INT32_MIN;
2522         env->vxsat = 0x1;
2523     }
2524     return res;
2525 }
2526 
2527 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2528                              int64_t b)
2529 {
2530     int64_t res = a - b;
2531     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2532         res = a >= 0 ? INT64_MAX : INT64_MIN;
2533         env->vxsat = 0x1;
2534     }
2535     return res;
2536 }
2537 
2538 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2539 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2540 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2541 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2542 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2543 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2544 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2545 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2546 
2547 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2548 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2549 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2550 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2551 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2552 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2553 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2554 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2555 
2556 /* Vector Single-Width Averaging Add and Subtract */
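/*
 * get_round() returns the 0/1 increment to apply after an intermediate
 * value has been shifted right by 'shift' bits, based on the discarded
 * bits and the fixed-point rounding mode in vxrm: 0 = rnu
 * (round-to-nearest-up), 1 = rne (round-to-nearest-even), 2 = rdn
 * (truncate), 3 = rod (round-to-odd).  For example, with v = 0b110 and
 * shift = 2 (value 1.5): rnu and rne return 1 (rounding to 2), rdn
 * returns 0, and rod also returns 0 because the truncated result is
 * already odd.
 */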
2557 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2558 {
2559     uint8_t d = extract64(v, shift, 1);
2560     uint8_t d1;
2561     uint64_t D1, D2;
2562 
2563     if (shift == 0 || shift > 64) {
2564         return 0;
2565     }
2566 
2567     d1 = extract64(v, shift - 1, 1);
2568     D1 = extract64(v, 0, shift);
2569     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2570         return d1;
2571     } else if (vxrm == 1) { /* round-to-nearest-even */
2572         if (shift > 1) {
2573             D2 = extract64(v, 0, shift - 1);
2574             return d1 & ((D2 != 0) | d);
2575         } else {
2576             return d1 & d;
2577         }
2578     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2579         return !d & (D1 != 0);
2580     }
2581     return 0; /* round-down (truncate) */
2582 }
2583 
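/*
 * The averaging helpers compute (a +/- b) >> 1 with vxrm rounding.  The
 * 8/16/32-bit element sizes all go through the 32-bit helpers, which
 * widen to 64 bits so the intermediate sum/difference cannot overflow;
 * the 64-bit helpers instead recover the lost carry/overflow bit
 * explicitly before shifting.
 */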
2584 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2585                              int32_t b)
2586 {
2587     int64_t res = (int64_t)a + b;
2588     uint8_t round = get_round(vxrm, res, 1);
2589 
2590     return (res >> 1) + round;
2591 }
2592 
2593 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2594                              int64_t b)
2595 {
2596     int64_t res = a + b;
2597     uint8_t round = get_round(vxrm, res, 1);
2598     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2599 
2600     /* With signed overflow, bit 64 is inverse of bit 63. */
2601     return ((res >> 1) ^ over) + round;
2602 }
2603 
2604 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2605 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2606 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2607 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2608 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2609 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2610 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2611 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2612 
2613 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2614 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2615 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2616 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2617 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2618 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2619 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2620 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2621 
2622 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2623                                uint32_t a, uint32_t b)
2624 {
2625     uint64_t res = (uint64_t)a + b;
2626     uint8_t round = get_round(vxrm, res, 1);
2627 
2628     return (res >> 1) + round;
2629 }
2630 
2631 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2632                                uint64_t a, uint64_t b)
2633 {
2634     uint64_t res = a + b;
2635     uint8_t round = get_round(vxrm, res, 1);
2636     uint64_t over = (uint64_t)(res < a) << 63;
2637 
2638     return ((res >> 1) | over) + round;
2639 }
2640 
2641 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2642 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2643 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2644 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2645 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2646 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2647 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2648 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2649 
2650 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2651 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2652 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2653 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2654 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2655 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2656 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2657 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2658 
2659 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2660                              int32_t b)
2661 {
2662     int64_t res = (int64_t)a - b;
2663     uint8_t round = get_round(vxrm, res, 1);
2664 
2665     return (res >> 1) + round;
2666 }
2667 
2668 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2669                              int64_t b)
2670 {
2671     int64_t res = (int64_t)a - b;
2672     uint8_t round = get_round(vxrm, res, 1);
2673     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2674 
2675     /* With signed overflow, bit 64 is inverse of bit 63. */
2676     return ((res >> 1) ^ over) + round;
2677 }
2678 
2679 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2680 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2681 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2682 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2683 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2684 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2685 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2686 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2687 
2688 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2689 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2690 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2691 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2692 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2693 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2694 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2695 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2696 
2697 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2698                                uint32_t a, uint32_t b)
2699 {
2700     int64_t res = (int64_t)a - b;
2701     uint8_t round = get_round(vxrm, res, 1);
2702 
2703     return (res >> 1) + round;
2704 }
2705 
2706 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2707                                uint64_t a, uint64_t b)
2708 {
2709     uint64_t res = (uint64_t)a - b;
2710     uint8_t round = get_round(vxrm, res, 1);
2711     uint64_t over = (uint64_t)(res > a) << 63;
2712 
2713     return ((res >> 1) | over) + round;
2714 }
2715 
2716 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2717 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2718 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2719 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2720 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2721 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2722 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2723 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2724 
2725 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2726 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2727 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2728 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2729 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2730 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2731 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2732 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2733 
2734 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
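/*
 * vsmul computes (a * b) >> (SEW - 1), rounded according to vxrm and
 * saturated to the signed SEW range.  The classic overflow case is
 * (-1.0) * (-1.0) == INT_MIN * INT_MIN, which saturates to INT_MAX and
 * sets vxsat.
 */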
2735 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2736 {
2737     uint8_t round;
2738     int16_t res;
2739 
2740     res = (int16_t)a * (int16_t)b;
2741     round = get_round(vxrm, res, 7);
2742     res = (res >> 7) + round;
2743 
2744     if (res > INT8_MAX) {
2745         env->vxsat = 0x1;
2746         return INT8_MAX;
2747     } else if (res < INT8_MIN) {
2748         env->vxsat = 0x1;
2749         return INT8_MIN;
2750     } else {
2751         return res;
2752     }
2753 }
2754 
2755 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2756 {
2757     uint8_t round;
2758     int32_t res;
2759 
2760     res = (int32_t)a * (int32_t)b;
2761     round = get_round(vxrm, res, 15);
2762     res = (res >> 15) + round;
2763 
2764     if (res > INT16_MAX) {
2765         env->vxsat = 0x1;
2766         return INT16_MAX;
2767     } else if (res < INT16_MIN) {
2768         env->vxsat = 0x1;
2769         return INT16_MIN;
2770     } else {
2771         return res;
2772     }
2773 }
2774 
2775 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2776 {
2777     uint8_t round;
2778     int64_t res;
2779 
2780     res = (int64_t)a * (int64_t)b;
2781     round = get_round(vxrm, res, 31);
2782     res = (res >> 31) + round;
2783 
2784     if (res > INT32_MAX) {
2785         env->vxsat = 0x1;
2786         return INT32_MAX;
2787     } else if (res < INT32_MIN) {
2788         env->vxsat = 0x1;
2789         return INT32_MIN;
2790     } else {
2791         return res;
2792     }
2793 }
2794 
2795 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2796 {
2797     uint8_t round;
2798     uint64_t hi_64, lo_64;
2799     int64_t res;
2800 
2801     if (a == INT64_MIN && b == INT64_MIN) {
2802         env->vxsat = 1;
2803         return INT64_MAX;
2804     }
2805 
2806     muls64(&lo_64, &hi_64, a, b);
2807     round = get_round(vxrm, lo_64, 63);
2808     /*
2809      * Cannot overflow, as there are always
2810      * 2 sign bits after multiply.
2811      */
2812     res = (hi_64 << 1) | (lo_64 >> 63);
2813     if (round) {
2814         if (res == INT64_MAX) {
2815             env->vxsat = 1;
2816         } else {
2817             res += 1;
2818         }
2819     }
2820     return res;
2821 }
2822 
2823 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2824 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2825 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2826 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2827 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2828 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2829 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2830 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2831 
2832 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2833 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2834 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2835 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2836 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2837 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2838 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2839 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2840 
2841 /* Vector Single-Width Scaling Shift Instructions */
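/*
 * vssrl/vssra shift each element right by the low log2(SEW) bits of the
 * second operand and round the shifted-out bits according to vxrm; no
 * saturation is involved here.
 */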
2842 static inline uint8_t
2843 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2844 {
2845     uint8_t round, shift = b & 0x7;
2846     uint8_t res;
2847 
2848     round = get_round(vxrm, a, shift);
2849     res = (a >> shift) + round;
2850     return res;
2851 }
2852 static inline uint16_t
2853 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2854 {
2855     uint8_t round, shift = b & 0xf;
2856 
2857     round = get_round(vxrm, a, shift);
2858     return (a >> shift) + round;
2859 }
2860 static inline uint32_t
2861 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2862 {
2863     uint8_t round, shift = b & 0x1f;
2864 
2865     round = get_round(vxrm, a, shift);
2866     return (a >> shift) + round;
2867 }
2868 static inline uint64_t
2869 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2870 {
2871     uint8_t round, shift = b & 0x3f;
2872 
2873     round = get_round(vxrm, a, shift);
2874     return (a >> shift) + round;
2875 }
2876 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2877 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2878 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2879 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2880 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2881 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2882 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2883 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2884 
2885 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2886 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2887 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2888 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2889 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2890 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2891 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2892 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2893 
2894 static inline int8_t
2895 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2896 {
2897     uint8_t round, shift = b & 0x7;
2898 
2899     round = get_round(vxrm, a, shift);
2900     return (a >> shift) + round;
2901 }
2902 static inline int16_t
2903 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2904 {
2905     uint8_t round, shift = b & 0xf;
2906 
2907     round = get_round(vxrm, a, shift);
2908     return (a >> shift) + round;
2909 }
2910 static inline int32_t
2911 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2912 {
2913     uint8_t round, shift = b & 0x1f;
2914 
2915     round = get_round(vxrm, a, shift);
2916     return (a >> shift) + round;
2917 }
2918 static inline int64_t
2919 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2920 {
2921     uint8_t round, shift = b & 0x3f;
2922 
2923     round = get_round(vxrm, a, shift);
2924     return (a >> shift) + round;
2925 }
2926 
2927 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2928 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2929 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2930 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2931 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2932 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2933 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2934 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2935 
2936 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2937 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2938 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2939 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2940 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2941 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2942 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2943 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2944 
2945 /* Vector Narrowing Fixed-Point Clip Instructions */
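/*
 * vnclip/vnclipu take a 2*SEW-wide source element, shift it right by the
 * low log2(2*SEW) bits of the second operand with vxrm rounding, and then
 * saturate ("clip") the result to the signed/unsigned SEW range, setting
 * vxsat on saturation.
 */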
2946 static inline int8_t
2947 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2948 {
2949     uint8_t round, shift = b & 0xf;
2950     int16_t res;
2951 
2952     round = get_round(vxrm, a, shift);
2953     res = (a >> shift) + round;
2954     if (res > INT8_MAX) {
2955         env->vxsat = 0x1;
2956         return INT8_MAX;
2957     } else if (res < INT8_MIN) {
2958         env->vxsat = 0x1;
2959         return INT8_MIN;
2960     } else {
2961         return res;
2962     }
2963 }
2964 
2965 static inline int16_t
2966 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2967 {
2968     uint8_t round, shift = b & 0x1f;
2969     int32_t res;
2970 
2971     round = get_round(vxrm, a, shift);
2972     res = (a >> shift) + round;
2973     if (res > INT16_MAX) {
2974         env->vxsat = 0x1;
2975         return INT16_MAX;
2976     } else if (res < INT16_MIN) {
2977         env->vxsat = 0x1;
2978         return INT16_MIN;
2979     } else {
2980         return res;
2981     }
2982 }
2983 
2984 static inline int32_t
2985 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2986 {
2987     uint8_t round, shift = b & 0x3f;
2988     int64_t res;
2989 
2990     round = get_round(vxrm, a, shift);
2991     res = (a >> shift) + round;
2992     if (res > INT32_MAX) {
2993         env->vxsat = 0x1;
2994         return INT32_MAX;
2995     } else if (res < INT32_MIN) {
2996         env->vxsat = 0x1;
2997         return INT32_MIN;
2998     } else {
2999         return res;
3000     }
3001 }
3002 
3003 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3004 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3005 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3006 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3007 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3008 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3009 
3010 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3011 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3012 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3013 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3014 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3015 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3016 
3017 static inline uint8_t
3018 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3019 {
3020     uint8_t round, shift = b & 0xf;
3021     uint16_t res;
3022 
3023     round = get_round(vxrm, a, shift);
3024     res = (a >> shift) + round;
3025     if (res > UINT8_MAX) {
3026         env->vxsat = 0x1;
3027         return UINT8_MAX;
3028     } else {
3029         return res;
3030     }
3031 }
3032 
3033 static inline uint16_t
3034 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3035 {
3036     uint8_t round, shift = b & 0x1f;
3037     uint32_t res;
3038 
3039     round = get_round(vxrm, a, shift);
3040     res = (a >> shift) + round;
3041     if (res > UINT16_MAX) {
3042         env->vxsat = 0x1;
3043         return UINT16_MAX;
3044     } else {
3045         return res;
3046     }
3047 }
3048 
3049 static inline uint32_t
3050 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3051 {
3052     uint8_t round, shift = b & 0x3f;
3053     uint64_t res;
3054 
3055     round = get_round(vxrm, a, shift);
3056     res = (a >> shift) + round;
3057     if (res > UINT32_MAX) {
3058         env->vxsat = 0x1;
3059         return UINT32_MAX;
3060     } else {
3061         return res;
3062     }
3063 }
3064 
3065 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3066 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3067 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3068 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3069 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3070 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3071 
3072 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3073 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3074 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3075 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3076 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3077 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3078 
3079 /*
3080  * Vector Floating-Point Arithmetic Instructions
3081  */
3082 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
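/*
 * OPFVV2 expands to a per-element worker that applies OP to elements of
 * vs2 and vs1 with the guest FP status, and GEN_VEXT_VV_ENV expands to
 * the helper that loops over env->vstart..vl, skipping masked-off
 * elements when vm == 0 and using vext_set_elems_1s() to apply the
 * mask-agnostic (vma) and tail-agnostic (vta) policies.
 */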
3083 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3084 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3085                       CPURISCVState *env)                      \
3086 {                                                              \
3087     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3088     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3089     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3090 }
3091 
3092 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3093 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3094                   void *vs2, CPURISCVState *env,          \
3095                   uint32_t desc)                          \
3096 {                                                         \
3097     uint32_t vm = vext_vm(desc);                          \
3098     uint32_t vl = env->vl;                                \
3099     uint32_t total_elems =                                \
3100         vext_get_total_elems(env, desc, ESZ);             \
3101     uint32_t vta = vext_vta(desc);                        \
3102     uint32_t vma = vext_vma(desc);                        \
3103     uint32_t i;                                           \
3104                                                           \
3105     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3106                                                           \
3107     for (i = env->vstart; i < vl; i++) {                  \
3108         if (!vm && !vext_elem_mask(v0, i)) {              \
3109             /* set masked-off elements to 1s */           \
3110             vext_set_elems_1s(vd, vma, i * ESZ,           \
3111                               (i + 1) * ESZ);             \
3112             continue;                                     \
3113         }                                                 \
3114         do_##NAME(vd, vs1, vs2, i, env);                  \
3115     }                                                     \
3116     env->vstart = 0;                                      \
3117     /* set tail elements to 1s */                         \
3118     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3119                       total_elems * ESZ);                 \
3120 }
3121 
3122 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3123 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3124 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3125 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3126 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3127 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3128 
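/*
 * OPFVF2/GEN_VEXT_VF are the vector-scalar (.vf) counterparts: s1 is the
 * scalar operand passed in as a uint64_t and narrowed to the element
 * type via the (TX1)(T1) cast before being combined with each element
 * of vs2.
 */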
3129 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3130 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3131                       CPURISCVState *env)                      \
3132 {                                                              \
3133     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3134     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3135 }
3136 
3137 #define GEN_VEXT_VF(NAME, ESZ)                            \
3138 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3139                   void *vs2, CPURISCVState *env,          \
3140                   uint32_t desc)                          \
3141 {                                                         \
3142     uint32_t vm = vext_vm(desc);                          \
3143     uint32_t vl = env->vl;                                \
3144     uint32_t total_elems =                                \
3145         vext_get_total_elems(env, desc, ESZ);             \
3146     uint32_t vta = vext_vta(desc);                        \
3147     uint32_t vma = vext_vma(desc);                        \
3148     uint32_t i;                                           \
3149                                                           \
3150     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3151                                                           \
3152     for (i = env->vstart; i < vl; i++) {                  \
3153         if (!vm && !vext_elem_mask(v0, i)) {              \
3154             /* set masked-off elements to 1s */           \
3155             vext_set_elems_1s(vd, vma, i * ESZ,           \
3156                               (i + 1) * ESZ);             \
3157             continue;                                     \
3158         }                                                 \
3159         do_##NAME(vd, s1, vs2, i, env);                   \
3160     }                                                     \
3161     env->vstart = 0;                                      \
3162     /* set tail elements to 1s */                         \
3163     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3164                       total_elems * ESZ);                 \
3165 }
3166 
3167 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3168 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3169 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3170 GEN_VEXT_VF(vfadd_vf_h, 2)
3171 GEN_VEXT_VF(vfadd_vf_w, 4)
3172 GEN_VEXT_VF(vfadd_vf_d, 8)
3173 
3174 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3175 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3176 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3177 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3178 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3179 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3180 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3181 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3182 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3183 GEN_VEXT_VF(vfsub_vf_h, 2)
3184 GEN_VEXT_VF(vfsub_vf_w, 4)
3185 GEN_VEXT_VF(vfsub_vf_d, 8)
3186 
3187 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3188 {
3189     return float16_sub(b, a, s);
3190 }
3191 
3192 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3193 {
3194     return float32_sub(b, a, s);
3195 }
3196 
3197 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3198 {
3199     return float64_sub(b, a, s);
3200 }
3201 
3202 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3203 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3204 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3205 GEN_VEXT_VF(vfrsub_vf_h, 2)
3206 GEN_VEXT_VF(vfrsub_vf_w, 4)
3207 GEN_VEXT_VF(vfrsub_vf_d, 8)
3208 
3209 /* Vector Widening Floating-Point Add/Subtract Instructions */
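/*
 * The widening helpers convert each SEW-wide operand up to 2*SEW
 * (half -> single with IEEE semantics, single -> double) and perform the
 * operation in the wider format.  The .wv/.wf forms (WOP_WUUU_*) take an
 * already 2*SEW-wide vs2 and only widen vs1 (or the scalar).
 */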
3210 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3211 {
3212     return float32_add(float16_to_float32(a, true, s),
3213                        float16_to_float32(b, true, s), s);
3214 }
3215 
3216 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3217 {
3218     return float64_add(float32_to_float64(a, s),
3219                        float32_to_float64(b, s), s);
3221 }
3222 
3223 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3224 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3225 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3226 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3227 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3228 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3229 GEN_VEXT_VF(vfwadd_vf_h, 4)
3230 GEN_VEXT_VF(vfwadd_vf_w, 8)
3231 
3232 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3233 {
3234     return float32_sub(float16_to_float32(a, true, s),
3235                        float16_to_float32(b, true, s), s);
3236 }
3237 
3238 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3239 {
3240     return float64_sub(float32_to_float64(a, s),
3241                        float32_to_float64(b, s), s);
3243 }
3244 
3245 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3246 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3247 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3248 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3249 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3250 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3251 GEN_VEXT_VF(vfwsub_vf_h, 4)
3252 GEN_VEXT_VF(vfwsub_vf_w, 8)
3253 
3254 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3255 {
3256     return float32_add(a, float16_to_float32(b, true, s), s);
3257 }
3258 
3259 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3260 {
3261     return float64_add(a, float32_to_float64(b, s), s);
3262 }
3263 
3264 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3265 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3266 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3267 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3268 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3269 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3270 GEN_VEXT_VF(vfwadd_wf_h, 4)
3271 GEN_VEXT_VF(vfwadd_wf_w, 8)
3272 
3273 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3274 {
3275     return float32_sub(a, float16_to_float32(b, true, s), s);
3276 }
3277 
3278 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3279 {
3280     return float64_sub(a, float32_to_float64(b, s), s);
3281 }
3282 
3283 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3284 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3285 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3286 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3287 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3288 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3289 GEN_VEXT_VF(vfwsub_wf_h, 4)
3290 GEN_VEXT_VF(vfwsub_wf_w, 8)
3291 
3292 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3293 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3294 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3295 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3296 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3297 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3298 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3299 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3300 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3301 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3302 GEN_VEXT_VF(vfmul_vf_h, 2)
3303 GEN_VEXT_VF(vfmul_vf_w, 4)
3304 GEN_VEXT_VF(vfmul_vf_d, 8)
3305 
3306 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3307 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3308 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3309 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3310 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3311 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3312 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3313 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3314 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3315 GEN_VEXT_VF(vfdiv_vf_h, 2)
3316 GEN_VEXT_VF(vfdiv_vf_w, 4)
3317 GEN_VEXT_VF(vfdiv_vf_d, 8)
3318 
3319 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3320 {
3321     return float16_div(b, a, s);
3322 }
3323 
3324 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3325 {
3326     return float32_div(b, a, s);
3327 }
3328 
3329 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3330 {
3331     return float64_div(b, a, s);
3332 }
3333 
3334 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3335 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3336 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3337 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3338 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3339 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3340 
3341 /* Vector Widening Floating-Point Multiply */
3342 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3343 {
3344     return float32_mul(float16_to_float32(a, true, s),
3345                        float16_to_float32(b, true, s), s);
3346 }
3347 
3348 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3349 {
3350     return float64_mul(float32_to_float64(a, s),
3351                        float32_to_float64(b, s), s);
3353 }
3354 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3355 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3356 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3357 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3358 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3359 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3360 GEN_VEXT_VF(vfwmul_vf_h, 4)
3361 GEN_VEXT_VF(vfwmul_vf_w, 8)
3362 
3363 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
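/*
 * The FMA helpers receive (a, b, d) as (vs2, vs1, vd): the *macc/*msac
 * flavours multiply the two source operands and accumulate into vd
 * (vd = +/-(vs1 * vs2) +/- vd), while the *madd/*msub flavours multiply
 * by the destination (vd = +/-(vs1 * vd) +/- vs2), which is why the
 * f*madd/f*msub callbacks pass d as the first multiplicand.
 */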
3364 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3365 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3366                       CPURISCVState *env)                          \
3367 {                                                                  \
3368     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3369     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3370     TD d = *((TD *)vd + HD(i));                                    \
3371     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3372 }
3373 
3374 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3375 {
3376     return float16_muladd(a, b, d, 0, s);
3377 }
3378 
3379 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3380 {
3381     return float32_muladd(a, b, d, 0, s);
3382 }
3383 
3384 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3385 {
3386     return float64_muladd(a, b, d, 0, s);
3387 }
3388 
3389 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3390 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3391 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3392 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3393 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3394 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3395 
3396 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3397 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3398                       CPURISCVState *env)                         \
3399 {                                                                 \
3400     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3401     TD d = *((TD *)vd + HD(i));                                   \
3402     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3403 }
3404 
3405 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3406 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3407 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3408 GEN_VEXT_VF(vfmacc_vf_h, 2)
3409 GEN_VEXT_VF(vfmacc_vf_w, 4)
3410 GEN_VEXT_VF(vfmacc_vf_d, 8)
3411 
3412 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3413 {
3414     return float16_muladd(a, b, d, float_muladd_negate_c |
3415                                    float_muladd_negate_product, s);
3416 }
3417 
3418 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3419 {
3420     return float32_muladd(a, b, d, float_muladd_negate_c |
3421                                    float_muladd_negate_product, s);
3422 }
3423 
3424 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3425 {
3426     return float64_muladd(a, b, d, float_muladd_negate_c |
3427                                    float_muladd_negate_product, s);
3428 }
3429 
3430 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3431 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3432 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3433 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3434 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3435 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3436 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3437 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3438 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3439 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3440 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3441 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3442 
3443 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3444 {
3445     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3446 }
3447 
3448 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3449 {
3450     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3451 }
3452 
3453 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3454 {
3455     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3456 }
3457 
3458 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3459 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3460 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3461 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3462 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3463 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3464 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3465 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3466 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3467 GEN_VEXT_VF(vfmsac_vf_h, 2)
3468 GEN_VEXT_VF(vfmsac_vf_w, 4)
3469 GEN_VEXT_VF(vfmsac_vf_d, 8)
3470 
3471 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3472 {
3473     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3474 }
3475 
3476 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3477 {
3478     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3479 }
3480 
3481 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3482 {
3483     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3484 }
3485 
3486 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3487 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3488 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3489 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3490 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3491 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3492 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3493 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3494 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3495 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3496 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3497 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3498 
3499 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3500 {
3501     return float16_muladd(d, b, a, 0, s);
3502 }
3503 
3504 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3505 {
3506     return float32_muladd(d, b, a, 0, s);
3507 }
3508 
3509 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3510 {
3511     return float64_muladd(d, b, a, 0, s);
3512 }
3513 
3514 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3515 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3516 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3517 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3518 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3519 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3520 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3521 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3522 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3523 GEN_VEXT_VF(vfmadd_vf_h, 2)
3524 GEN_VEXT_VF(vfmadd_vf_w, 4)
3525 GEN_VEXT_VF(vfmadd_vf_d, 8)
3526 
3527 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3528 {
3529     return float16_muladd(d, b, a, float_muladd_negate_c |
3530                                    float_muladd_negate_product, s);
3531 }
3532 
3533 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3534 {
3535     return float32_muladd(d, b, a, float_muladd_negate_c |
3536                                    float_muladd_negate_product, s);
3537 }
3538 
3539 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3540 {
3541     return float64_muladd(d, b, a, float_muladd_negate_c |
3542                                    float_muladd_negate_product, s);
3543 }
3544 
3545 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3546 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3547 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3548 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3549 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3550 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3551 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3552 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3553 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3554 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3555 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3556 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3557 
3558 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3559 {
3560     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3561 }
3562 
3563 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3564 {
3565     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3566 }
3567 
3568 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3569 {
3570     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3571 }
3572 
3573 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3574 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3575 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3576 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3577 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3578 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3579 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3580 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3581 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3582 GEN_VEXT_VF(vfmsub_vf_h, 2)
3583 GEN_VEXT_VF(vfmsub_vf_w, 4)
3584 GEN_VEXT_VF(vfmsub_vf_d, 8)
3585 
3586 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3587 {
3588     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3589 }
3590 
3591 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3592 {
3593     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3594 }
3595 
3596 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3597 {
3598     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3599 }
3600 
3601 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3602 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3603 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3604 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3605 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3606 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3607 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3608 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3609 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3610 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3611 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3612 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3613 
3614 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
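/*
 * The widening FMA helpers convert the SEW-wide multiplicands up to
 * 2*SEW and accumulate into the 2*SEW-wide destination in a single fused
 * muladd; fwmaccbf16 does the same starting from bfloat16 inputs.
 */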
3615 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3616 {
3617     return float32_muladd(float16_to_float32(a, true, s),
3618                           float16_to_float32(b, true, s), d, 0, s);
3619 }
3620 
3621 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3622 {
3623     return float64_muladd(float32_to_float64(a, s),
3624                           float32_to_float64(b, s), d, 0, s);
3625 }
3626 
3627 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3628 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3629 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3630 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3631 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3632 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3633 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3634 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3635 
3636 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3637 {
3638     return float32_muladd(bfloat16_to_float32(a, s),
3639                           bfloat16_to_float32(b, s), d, 0, s);
3640 }
3641 
3642 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3643 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3644 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3645 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3646 
3647 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3648 {
3649     return float32_muladd(float16_to_float32(a, true, s),
3650                           float16_to_float32(b, true, s), d,
3651                           float_muladd_negate_c | float_muladd_negate_product,
3652                           s);
3653 }
3654 
3655 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3656 {
3657     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3658                           d, float_muladd_negate_c |
3659                              float_muladd_negate_product, s);
3660 }
3661 
3662 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3663 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3664 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3665 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3666 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3667 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3668 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3669 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3670 
3671 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3672 {
3673     return float32_muladd(float16_to_float32(a, true, s),
3674                           float16_to_float32(b, true, s), d,
3675                           float_muladd_negate_c, s);
3676 }
3677 
3678 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3679 {
3680     return float64_muladd(float32_to_float64(a, s),
3681                           float32_to_float64(b, s), d,
3682                           float_muladd_negate_c, s);
3683 }
3684 
3685 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3686 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3687 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3688 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3689 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3690 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3691 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3692 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3693 
3694 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3695 {
3696     return float32_muladd(float16_to_float32(a, true, s),
3697                           float16_to_float32(b, true, s), d,
3698                           float_muladd_negate_product, s);
3699 }
3700 
3701 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3702 {
3703     return float64_muladd(float32_to_float64(a, s),
3704                           float32_to_float64(b, s), d,
3705                           float_muladd_negate_product, s);
3706 }
3707 
3708 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3709 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3710 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3711 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3712 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3713 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3714 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3715 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3716 
3717 /* Vector Floating-Point Square-Root Instruction */
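/*
 * OPFVV1/GEN_VEXT_V_ENV are the single-source-operand variants of the
 * OPFVV2/GEN_VEXT_VV_ENV pattern above.
 */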
3718 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3719 static void do_##NAME(void *vd, void *vs2, int i,      \
3720                       CPURISCVState *env)              \
3721 {                                                      \
3722     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3723     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3724 }
3725 
3726 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3727 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3728                   CPURISCVState *env, uint32_t desc)   \
3729 {                                                      \
3730     uint32_t vm = vext_vm(desc);                       \
3731     uint32_t vl = env->vl;                             \
3732     uint32_t total_elems =                             \
3733         vext_get_total_elems(env, desc, ESZ);          \
3734     uint32_t vta = vext_vta(desc);                     \
3735     uint32_t vma = vext_vma(desc);                     \
3736     uint32_t i;                                        \
3737                                                        \
3738     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3739                                                        \
3740     if (vl == 0) {                                     \
3741         return;                                        \
3742     }                                                  \
3743     for (i = env->vstart; i < vl; i++) {               \
3744         if (!vm && !vext_elem_mask(v0, i)) {           \
3745             /* set masked-off elements to 1s */        \
3746             vext_set_elems_1s(vd, vma, i * ESZ,        \
3747                               (i + 1) * ESZ);          \
3748             continue;                                  \
3749         }                                              \
3750         do_##NAME(vd, vs2, i, env);                    \
3751     }                                                  \
3752     env->vstart = 0;                                   \
3753     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3754                       total_elems * ESZ);              \
3755 }
3756 
3757 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3758 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3759 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3760 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3761 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3762 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3763 
3764 /*
3765  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3766  *
3767  * Adapted from riscv-v-spec recip.c:
3768  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3769  */
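/*
 * frsqrt7() produces a 7-bit-precision estimate of 1/sqrt(f): after
 * normalizing subnormal inputs, the lookup table is indexed with the
 * exponent's LSB and the top 6 fraction bits, and the output exponent
 * works out to (3 * bias - 1 - exp) / 2 (since ~exp == -exp - 1).
 */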
3770 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3771 {
3772     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3773     uint64_t exp = extract64(f, frac_size, exp_size);
3774     uint64_t frac = extract64(f, 0, frac_size);
3775 
3776     const uint8_t lookup_table[] = {
3777         52, 51, 50, 48, 47, 46, 44, 43,
3778         42, 41, 40, 39, 38, 36, 35, 34,
3779         33, 32, 31, 30, 30, 29, 28, 27,
3780         26, 25, 24, 23, 23, 22, 21, 20,
3781         19, 19, 18, 17, 16, 16, 15, 14,
3782         14, 13, 12, 12, 11, 10, 10, 9,
3783         9, 8, 7, 7, 6, 6, 5, 4,
3784         4, 3, 3, 2, 2, 1, 1, 0,
3785         127, 125, 123, 121, 119, 118, 116, 114,
3786         113, 111, 109, 108, 106, 105, 103, 102,
3787         100, 99, 97, 96, 95, 93, 92, 91,
3788         90, 88, 87, 86, 85, 84, 83, 82,
3789         80, 79, 78, 77, 76, 75, 74, 73,
3790         72, 71, 70, 70, 69, 68, 67, 66,
3791         65, 64, 63, 63, 62, 61, 60, 59,
3792         59, 58, 57, 56, 56, 55, 54, 53
3793     };
3794     const int precision = 7;
3795 
3796     if (exp == 0 && frac != 0) { /* subnormal */
3797         /* Normalize the subnormal. */
3798         while (extract64(frac, frac_size - 1, 1) == 0) {
3799             exp--;
3800             frac <<= 1;
3801         }
3802 
3803         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3804     }
3805 
3806     int idx = ((exp & 1) << (precision - 1)) |
3807               (frac >> (frac_size - precision + 1));
3808     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3809                         (frac_size - precision);
3810     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3811 
3812     uint64_t val = 0;
3813     val = deposit64(val, 0, frac_size, out_frac);
3814     val = deposit64(val, frac_size, exp_size, out_exp);
3815     val = deposit64(val, frac_size + exp_size, 1, sign);
3816     return val;
3817 }
3818 
3819 static float16 frsqrt7_h(float16 f, float_status *s)
3820 {
3821     int exp_size = 5, frac_size = 10;
3822     bool sign = float16_is_neg(f);
3823 
3824     /*
3825      * frsqrt7(sNaN) = canonical NaN
3826      * frsqrt7(-inf) = canonical NaN
3827      * frsqrt7(-normal) = canonical NaN
3828      * frsqrt7(-subnormal) = canonical NaN
3829      */
3830     if (float16_is_signaling_nan(f, s) ||
3831         (float16_is_infinity(f) && sign) ||
3832         (float16_is_normal(f) && sign) ||
3833         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3834         s->float_exception_flags |= float_flag_invalid;
3835         return float16_default_nan(s);
3836     }
3837 
3838     /* frsqrt7(qNaN) = canonical NaN */
3839     if (float16_is_quiet_nan(f, s)) {
3840         return float16_default_nan(s);
3841     }
3842 
3843     /* frsqrt7(+-0) = +-inf */
3844     if (float16_is_zero(f)) {
3845         s->float_exception_flags |= float_flag_divbyzero;
3846         return float16_set_sign(float16_infinity, sign);
3847     }
3848 
3849     /* frsqrt7(+inf) = +0 */
3850     if (float16_is_infinity(f) && !sign) {
3851         return float16_set_sign(float16_zero, sign);
3852     }
3853 
3854     /* +normal, +subnormal */
3855     uint64_t val = frsqrt7(f, exp_size, frac_size);
3856     return make_float16(val);
3857 }
3858 
3859 static float32 frsqrt7_s(float32 f, float_status *s)
3860 {
3861     int exp_size = 8, frac_size = 23;
3862     bool sign = float32_is_neg(f);
3863 
3864     /*
3865      * frsqrt7(sNaN) = canonical NaN
3866      * frsqrt7(-inf) = canonical NaN
3867      * frsqrt7(-normal) = canonical NaN
3868      * frsqrt7(-subnormal) = canonical NaN
3869      */
3870     if (float32_is_signaling_nan(f, s) ||
3871         (float32_is_infinity(f) && sign) ||
3872         (float32_is_normal(f) && sign) ||
3873         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3874         s->float_exception_flags |= float_flag_invalid;
3875         return float32_default_nan(s);
3876     }
3877 
3878     /* frsqrt7(qNaN) = canonical NaN */
3879     if (float32_is_quiet_nan(f, s)) {
3880         return float32_default_nan(s);
3881     }
3882 
3883     /* frsqrt7(+-0) = +-inf */
3884     if (float32_is_zero(f)) {
3885         s->float_exception_flags |= float_flag_divbyzero;
3886         return float32_set_sign(float32_infinity, sign);
3887     }
3888 
3889     /* frsqrt7(+inf) = +0 */
3890     if (float32_is_infinity(f) && !sign) {
3891         return float32_set_sign(float32_zero, sign);
3892     }
3893 
3894     /* +normal, +subnormal */
3895     uint64_t val = frsqrt7(f, exp_size, frac_size);
3896     return make_float32(val);
3897 }
3898 
3899 static float64 frsqrt7_d(float64 f, float_status *s)
3900 {
3901     int exp_size = 11, frac_size = 52;
3902     bool sign = float64_is_neg(f);
3903 
3904     /*
3905      * frsqrt7(sNaN) = canonical NaN
3906      * frsqrt7(-inf) = canonical NaN
3907      * frsqrt7(-normal) = canonical NaN
3908      * frsqrt7(-subnormal) = canonical NaN
3909      */
3910     if (float64_is_signaling_nan(f, s) ||
3911         (float64_is_infinity(f) && sign) ||
3912         (float64_is_normal(f) && sign) ||
3913         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3914         s->float_exception_flags |= float_flag_invalid;
3915         return float64_default_nan(s);
3916     }
3917 
3918     /* frsqrt7(qNaN) = canonical NaN */
3919     if (float64_is_quiet_nan(f, s)) {
3920         return float64_default_nan(s);
3921     }
3922 
3923     /* frsqrt7(+-0) = +-inf */
3924     if (float64_is_zero(f)) {
3925         s->float_exception_flags |= float_flag_divbyzero;
3926         return float64_set_sign(float64_infinity, sign);
3927     }
3928 
3929     /* frsqrt7(+inf) = +0 */
3930     if (float64_is_infinity(f) && !sign) {
3931         return float64_set_sign(float64_zero, sign);
3932     }
3933 
3934     /* +normal, +subnormal */
3935     uint64_t val = frsqrt7(f, exp_size, frac_size);
3936     return make_float64(val);
3937 }
3938 
3939 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3940 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3941 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3942 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3943 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3944 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3945 
3946 /*
3947  * Vector Floating-Point Reciprocal Estimate Instruction
3948  *
3949  * Adapted from riscv-v-spec recip.c:
3950  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3951  */
3952 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3953                       float_status *s)
3954 {
3955     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3956     uint64_t exp = extract64(f, frac_size, exp_size);
3957     uint64_t frac = extract64(f, 0, frac_size);
3958 
3959     const uint8_t lookup_table[] = {
3960         127, 125, 123, 121, 119, 117, 116, 114,
3961         112, 110, 109, 107, 105, 104, 102, 100,
3962         99, 97, 96, 94, 93, 91, 90, 88,
3963         87, 85, 84, 83, 81, 80, 79, 77,
3964         76, 75, 74, 72, 71, 70, 69, 68,
3965         66, 65, 64, 63, 62, 61, 60, 59,
3966         58, 57, 56, 55, 54, 53, 52, 51,
3967         50, 49, 48, 47, 46, 45, 44, 43,
3968         42, 41, 40, 40, 39, 38, 37, 36,
3969         35, 35, 34, 33, 32, 31, 31, 30,
3970         29, 28, 28, 27, 26, 25, 25, 24,
3971         23, 23, 22, 21, 21, 20, 19, 19,
3972         18, 17, 17, 16, 15, 15, 14, 14,
3973         13, 12, 12, 11, 11, 10, 9, 9,
3974         8, 8, 7, 7, 6, 5, 5, 4,
3975         4, 3, 3, 2, 2, 1, 1, 0
3976     };
3977     const int precision = 7;
3978 
3979     if (exp == 0 && frac != 0) { /* subnormal */
3980         /* Normalize the subnormal. */
3981         while (extract64(frac, frac_size - 1, 1) == 0) {
3982             exp--;
3983             frac <<= 1;
3984         }
3985 
3986         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3987 
3988         if (exp != 0 && exp != UINT64_MAX) {
3989             /*
3990              * Overflow to inf or max value of same sign,
3991              * depending on sign and rounding mode.
3992              */
3993             s->float_exception_flags |= (float_flag_inexact |
3994                                          float_flag_overflow);
3995 
3996             if ((s->float_rounding_mode == float_round_to_zero) ||
3997                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3998                 ((s->float_rounding_mode == float_round_up) && sign)) {
3999                 /* Return the largest-magnitude finite value of the same sign. */
4000                 return (sign << (exp_size + frac_size)) |
4001                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4002             } else {
4003                 /* Return +-inf. */
4004                 return (sign << (exp_size + frac_size)) |
4005                        MAKE_64BIT_MASK(frac_size, exp_size);
4006             }
4007         }
4008     }
4009 
4010     int idx = frac >> (frac_size - precision);
4011     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4012                         (frac_size - precision);
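    /*
     * With unsigned wraparound, ~exp == -exp - 1, so the line below
     * computes (2 * bias - 1) - exp, the biased exponent of the
     * estimate (bias == MAKE_64BIT_MASK(0, exp_size - 1)).
     */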
4013     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4014 
4015     if (out_exp == 0 || out_exp == UINT64_MAX) {
4016         /*
4017          * The result is subnormal, but don't raise the underflow exception,
4018          * because there's no additional loss of precision.
4019          */
4020         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4021         if (out_exp == UINT64_MAX) {
4022             out_frac >>= 1;
4023             out_exp = 0;
4024         }
4025     }
4026 
4027     uint64_t val = 0;
4028     val = deposit64(val, 0, frac_size, out_frac);
4029     val = deposit64(val, frac_size, exp_size, out_exp);
4030     val = deposit64(val, frac_size + exp_size, 1, sign);
4031     return val;
4032 }
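
/*
 * Worked example (hand-traced from the code above, binary32): for
 * f = 2.0f = 0x40000000, exp = 128 and frac = 0, so idx = 0 and
 * lookup_table[0] = 127.  Then out_frac = 127 << 16 and
 * out_exp = 2 * 127 + ~128 = 125 (mod 2^64), giving 0x3eff0000
 * ~= 0.498047, i.e. 1/2.0 with the expected ~2^-7 relative error.
 */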
4033 
4034 static float16 frec7_h(float16 f, float_status *s)
4035 {
4036     int exp_size = 5, frac_size = 10;
4037     bool sign = float16_is_neg(f);
4038 
4039     /* frec7(+-inf) = +-0 */
4040     if (float16_is_infinity(f)) {
4041         return float16_set_sign(float16_zero, sign);
4042     }
4043 
4044     /* frec7(+-0) = +-inf */
4045     if (float16_is_zero(f)) {
4046         s->float_exception_flags |= float_flag_divbyzero;
4047         return float16_set_sign(float16_infinity, sign);
4048     }
4049 
4050     /* frec7(sNaN) = canonical NaN */
4051     if (float16_is_signaling_nan(f, s)) {
4052         s->float_exception_flags |= float_flag_invalid;
4053         return float16_default_nan(s);
4054     }
4055 
4056     /* frec7(qNaN) = canonical NaN */
4057     if (float16_is_quiet_nan(f, s)) {
4058         return float16_default_nan(s);
4059     }
4060 
4061     /* +-normal, +-subnormal */
4062     uint64_t val = frec7(f, exp_size, frac_size, s);
4063     return make_float16(val);
4064 }
4065 
4066 static float32 frec7_s(float32 f, float_status *s)
4067 {
4068     int exp_size = 8, frac_size = 23;
4069     bool sign = float32_is_neg(f);
4070 
4071     /* frec7(+-inf) = +-0 */
4072     if (float32_is_infinity(f)) {
4073         return float32_set_sign(float32_zero, sign);
4074     }
4075 
4076     /* frec7(+-0) = +-inf */
4077     if (float32_is_zero(f)) {
4078         s->float_exception_flags |= float_flag_divbyzero;
4079         return float32_set_sign(float32_infinity, sign);
4080     }
4081 
4082     /* frec7(sNaN) = canonical NaN */
4083     if (float32_is_signaling_nan(f, s)) {
4084         s->float_exception_flags |= float_flag_invalid;
4085         return float32_default_nan(s);
4086     }
4087 
4088     /* frec7(qNaN) = canonical NaN */
4089     if (float32_is_quiet_nan(f, s)) {
4090         return float32_default_nan(s);
4091     }
4092 
4093     /* +-normal, +-subnormal */
4094     uint64_t val = frec7(f, exp_size, frac_size, s);
4095     return make_float32(val);
4096 }
4097 
4098 static float64 frec7_d(float64 f, float_status *s)
4099 {
4100     int exp_size = 11, frac_size = 52;
4101     bool sign = float64_is_neg(f);
4102 
4103     /* frec7(+-inf) = +-0 */
4104     if (float64_is_infinity(f)) {
4105         return float64_set_sign(float64_zero, sign);
4106     }
4107 
4108     /* frec7(+-0) = +-inf */
4109     if (float64_is_zero(f)) {
4110         s->float_exception_flags |= float_flag_divbyzero;
4111         return float64_set_sign(float64_infinity, sign);
4112     }
4113 
4114     /* frec7(sNaN) = canonical NaN */
4115     if (float64_is_signaling_nan(f, s)) {
4116         s->float_exception_flags |= float_flag_invalid;
4117         return float64_default_nan(s);
4118     }
4119 
4120     /* frec7(qNaN) = canonical NaN */
4121     if (float64_is_quiet_nan(f, s)) {
4122         return float64_default_nan(s);
4123     }
4124 
4125     /* +-normal, +-subnormal */
4126     uint64_t val = frec7(f, exp_size, frac_size, s);
4127     return make_float64(val);
4128 }
4129 
4130 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4131 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4132 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4133 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4134 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4135 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4136 
4137 /* Vector Floating-Point MIN/MAX Instructions */
4138 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4139 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4140 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4141 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4142 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4143 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4144 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4145 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4146 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4147 GEN_VEXT_VF(vfmin_vf_h, 2)
4148 GEN_VEXT_VF(vfmin_vf_w, 4)
4149 GEN_VEXT_VF(vfmin_vf_d, 8)
4150 
4151 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4152 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4153 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4154 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4155 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4156 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4157 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4158 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4159 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4160 GEN_VEXT_VF(vfmax_vf_h, 2)
4161 GEN_VEXT_VF(vfmax_vf_w, 4)
4162 GEN_VEXT_VF(vfmax_vf_d, 8)
4163 
4164 /* Vector Floating-Point Sign-Injection Instructions */
4165 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4166 {
4167     return deposit64(b, 0, 15, a);
4168 }
4169 
4170 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4171 {
4172     return deposit64(b, 0, 31, a);
4173 }
4174 
4175 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4176 {
4177     return deposit64(b, 0, 63, a);
4178 }
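
/*
 * deposit64(b, 0, width - 1, a) keeps only the sign bit of b and takes
 * the exponent and mantissa from a, e.g. for binary16:
 * fsgnj16(0x3c00 (+1.0), 0x8000 (-0.0)) == 0xbc00, i.e. -1.0.
 */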
4179 
4180 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4181 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4182 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4183 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4184 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4185 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4186 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4187 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4188 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4189 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4190 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4191 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4192 
4193 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4194 {
4195     return deposit64(~b, 0, 15, a);
4196 }
4197 
4198 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4199 {
4200     return deposit64(~b, 0, 31, a);
4201 }
4202 
4203 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4204 {
4205     return deposit64(~b, 0, 63, a);
4206 }
4207 
4208 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4209 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4210 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4211 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4212 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4213 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4214 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4215 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4216 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4217 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4218 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4219 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4220 
4221 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4222 {
4223     return deposit64(b ^ a, 0, 15, a);
4224 }
4225 
4226 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4227 {
4228     return deposit64(b ^ a, 0, 31, a);
4229 }
4230 
4231 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4232 {
4233     return deposit64(b ^ a, 0, 63, a);
4234 }
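
/*
 * Only bit (width - 1) of (b ^ a) survives into the truncated result,
 * so the sign becomes sign(a) XOR sign(b) while exponent and mantissa
 * come from a, e.g. fsgnjx16(0xbc00 (-1.0), 0xbc00 (-1.0)) == 0x3c00.
 */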
4235 
4236 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4237 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4238 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4239 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4240 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4241 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4242 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4243 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4244 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4245 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4246 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4247 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4248 
4249 /* Vector Floating-Point Compare Instructions */
4250 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4251 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4252                   CPURISCVState *env, uint32_t desc)          \
4253 {                                                             \
4254     uint32_t vm = vext_vm(desc);                              \
4255     uint32_t vl = env->vl;                                    \
4256     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4257     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4258     uint32_t vma = vext_vma(desc);                            \
4259     uint32_t i;                                               \
4260                                                               \
4261     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4262                                                               \
4263     for (i = env->vstart; i < vl; i++) {                      \
4264         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4265         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4266         if (!vm && !vext_elem_mask(v0, i)) {                  \
4267             /* set masked-off elements to 1s */               \
4268             if (vma) {                                        \
4269                 vext_set_elem_mask(vd, i, 1);                 \
4270             }                                                 \
4271             continue;                                         \
4272         }                                                     \
4273         vext_set_elem_mask(vd, i,                             \
4274                            DO_OP(s2, s1, &env->fp_status));   \
4275     }                                                         \
4276     env->vstart = 0;                                          \
4277     /*
4278      * The mask destination register is always tail-agnostic,
4279      * so set the tail elements to 1s.
4280      */                                                       \
4281     if (vta_all_1s) {                                         \
4282         for (; i < total_elems; i++) {                        \
4283             vext_set_elem_mask(vd, i, 1);                     \
4284         }                                                     \
4285     }                                                         \
4286 }
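
/*
 * The destination of these compares is a mask register: one result bit
 * per element.  E.g. vmfeq.vv on elements {1.0, NaN} vs {1.0, NaN}
 * produces mask bits {1, 0}; NaNs compare unequal, and the quiet
 * compare only raises the invalid flag for signaling NaNs.
 */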
4287 
4288 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4289 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4290 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4291 
4292 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4293 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4294                   CPURISCVState *env, uint32_t desc)                \
4295 {                                                                   \
4296     uint32_t vm = vext_vm(desc);                                    \
4297     uint32_t vl = env->vl;                                          \
4298     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4299     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4300     uint32_t vma = vext_vma(desc);                                  \
4301     uint32_t i;                                                     \
4302                                                                     \
4303     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4304                                                                     \
4305     for (i = env->vstart; i < vl; i++) {                            \
4306         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4307         if (!vm && !vext_elem_mask(v0, i)) {                        \
4308             /* set masked-off elements to 1s */                     \
4309             if (vma) {                                              \
4310                 vext_set_elem_mask(vd, i, 1);                       \
4311             }                                                       \
4312             continue;                                               \
4313         }                                                           \
4314         vext_set_elem_mask(vd, i,                                   \
4315                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4316     }                                                               \
4317     env->vstart = 0;                                                \
4318     /*
4319      * The mask destination register is always tail-agnostic,
4320      * so set the tail elements to 1s.
4321      */                                                             \
4322     if (vta_all_1s) {                                               \
4323         for (; i < total_elems; i++) {                              \
4324             vext_set_elem_mask(vd, i, 1);                           \
4325         }                                                           \
4326     }                                                               \
4327 }
4328 
4329 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4330 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4331 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4332 
4333 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4334 {
4335     FloatRelation compare = float16_compare_quiet(a, b, s);
4336     return compare != float_relation_equal;
4337 }
4338 
4339 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4340 {
4341     FloatRelation compare = float32_compare_quiet(a, b, s);
4342     return compare != float_relation_equal;
4343 }
4344 
4345 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4346 {
4347     FloatRelation compare = float64_compare_quiet(a, b, s);
4348     return compare != float_relation_equal;
4349 }
4350 
4351 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4352 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4353 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4354 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4355 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4356 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4357 
4358 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4359 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4360 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4361 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4362 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4363 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4364 
4365 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4366 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4367 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4368 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4369 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4370 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4371 
4372 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4373 {
4374     FloatRelation compare = float16_compare(a, b, s);
4375     return compare == float_relation_greater;
4376 }
4377 
4378 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4379 {
4380     FloatRelation compare = float32_compare(a, b, s);
4381     return compare == float_relation_greater;
4382 }
4383 
4384 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4385 {
4386     FloatRelation compare = float64_compare(a, b, s);
4387     return compare == float_relation_greater;
4388 }
4389 
4390 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4391 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4392 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4393 
4394 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4395 {
4396     FloatRelation compare = float16_compare(a, b, s);
4397     return compare == float_relation_greater ||
4398            compare == float_relation_equal;
4399 }
4400 
4401 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4402 {
4403     FloatRelation compare = float32_compare(a, b, s);
4404     return compare == float_relation_greater ||
4405            compare == float_relation_equal;
4406 }
4407 
4408 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4409 {
4410     FloatRelation compare = float64_compare(a, b, s);
4411     return compare == float_relation_greater ||
4412            compare == float_relation_equal;
4413 }
4414 
4415 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4416 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4417 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4418 
4419 /* Vector Floating-Point Classify Instruction */
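/*
 * The classify result uses the same 10-bit encoding as the scalar
 * fclass instructions:
 *   bit 0: -inf          bit 5: +subnormal
 *   bit 1: -normal       bit 6: +normal
 *   bit 2: -subnormal    bit 7: +inf
 *   bit 3: -0            bit 8: signaling NaN
 *   bit 4: +0            bit 9: quiet NaN
 * e.g. fclass_s(0x7f800000) (+inf) returns 1 << 7.
 */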
4420 target_ulong fclass_h(uint64_t frs1)
4421 {
4422     float16 f = frs1;
4423     bool sign = float16_is_neg(f);
4424 
4425     if (float16_is_infinity(f)) {
4426         return sign ? 1 << 0 : 1 << 7;
4427     } else if (float16_is_zero(f)) {
4428         return sign ? 1 << 3 : 1 << 4;
4429     } else if (float16_is_zero_or_denormal(f)) {
4430         return sign ? 1 << 2 : 1 << 5;
4431     } else if (float16_is_any_nan(f)) {
4432         float_status s = { }; /* for snan_bit_is_one */
4433         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4434     } else {
4435         return sign ? 1 << 1 : 1 << 6;
4436     }
4437 }
4438 
4439 target_ulong fclass_s(uint64_t frs1)
4440 {
4441     float32 f = frs1;
4442     bool sign = float32_is_neg(f);
4443 
4444     if (float32_is_infinity(f)) {
4445         return sign ? 1 << 0 : 1 << 7;
4446     } else if (float32_is_zero(f)) {
4447         return sign ? 1 << 3 : 1 << 4;
4448     } else if (float32_is_zero_or_denormal(f)) {
4449         return sign ? 1 << 2 : 1 << 5;
4450     } else if (float32_is_any_nan(f)) {
4451         float_status s = { }; /* for snan_bit_is_one */
4452         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4453     } else {
4454         return sign ? 1 << 1 : 1 << 6;
4455     }
4456 }
4457 
4458 target_ulong fclass_d(uint64_t frs1)
4459 {
4460     float64 f = frs1;
4461     bool sign = float64_is_neg(f);
4462 
4463     if (float64_is_infinity(f)) {
4464         return sign ? 1 << 0 : 1 << 7;
4465     } else if (float64_is_zero(f)) {
4466         return sign ? 1 << 3 : 1 << 4;
4467     } else if (float64_is_zero_or_denormal(f)) {
4468         return sign ? 1 << 2 : 1 << 5;
4469     } else if (float64_is_any_nan(f)) {
4470         float_status s = { }; /* for snan_bit_is_one */
4471         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4472     } else {
4473         return sign ? 1 << 1 : 1 << 6;
4474     }
4475 }
4476 
4477 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4478 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4479 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4480 GEN_VEXT_V(vfclass_v_h, 2)
4481 GEN_VEXT_V(vfclass_v_w, 4)
4482 GEN_VEXT_V(vfclass_v_d, 8)
4483 
4484 /* Vector Floating-Point Merge Instruction */
4485 
4486 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4487 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4488                   CPURISCVState *env, uint32_t desc)          \
4489 {                                                             \
4490     uint32_t vm = vext_vm(desc);                              \
4491     uint32_t vl = env->vl;                                    \
4492     uint32_t esz = sizeof(ETYPE);                             \
4493     uint32_t total_elems =                                    \
4494         vext_get_total_elems(env, desc, esz);                 \
4495     uint32_t vta = vext_vta(desc);                            \
4496     uint32_t i;                                               \
4497                                                               \
4498     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4499                                                               \
4500     for (i = env->vstart; i < vl; i++) {                      \
4501         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4502         *((ETYPE *)vd + H(i)) =                               \
4503             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4504     }                                                         \
4505     env->vstart = 0;                                          \
4506     /* set tail elements to 1s */                             \
4507     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4508 }
4509 
4510 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4511 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4512 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
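
/*
 * vfmerge.vfm is always masked (vm == 0): vd[i] receives the scalar rs1
 * where the mask bit is set and vs2[i] where it is clear.  It only
 * moves bits and never touches env->fp_status, so no FP exception
 * flags can be raised.
 */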
4513 
4514 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4515 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4516 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4517 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4518 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4519 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4520 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4521 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4522 
4523 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4524 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4525 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4526 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4527 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4528 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4529 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4530 
4531 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4532 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4533 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4534 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4535 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4536 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4537 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4538 
4539 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4540 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4541 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4542 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4543 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4544 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4545 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4546 
4547 /* Widening Floating-Point/Integer Type-Convert Instructions */
4548 /* (TD, T2, TX2) */
4549 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4550 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4551 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4552 /*
4553  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4554  */
4555 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4556 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4557 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4558 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4559 
4560 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4561 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4562 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4563 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4564 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4565 
4566 /*
4567  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4568  */
4569 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4570 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4571 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4572 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4573 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4574 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4575 
4576 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4577 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4578 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4579 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4580 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4581 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4582 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4583 
4584 /*
4585  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4586  */
4587 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4588 {
4589     return float16_to_float32(a, true, s);
4590 }
4591 
4592 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4593 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4594 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4595 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4596 
4597 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4598 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4599 
4600 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4601 /* (TD, T2, TX2) */
4602 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4603 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4604 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4605 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4606 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4607 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4608 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4609 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4610 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4611 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4612 
4613 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4614 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4615 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4616 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4617 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4618 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4619 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4620 
4621 /*
4622  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4623  */
4624 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4625 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4626 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4627 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4628 
4629 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4630 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4631 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4632 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4633 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4634 
4635 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4636 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4637 {
4638     return float32_to_float16(a, true, s);
4639 }
4640 
4641 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4642 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4643 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4644 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4645 
4646 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4647 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4648 
4649 /*
4650  * Vector Reduction Operations
4651  */
4652 /* Vector Single-Width Integer Reduction Instructions */
4653 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4654 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4655                   void *vs2, CPURISCVState *env,          \
4656                   uint32_t desc)                          \
4657 {                                                         \
4658     uint32_t vm = vext_vm(desc);                          \
4659     uint32_t vl = env->vl;                                \
4660     uint32_t esz = sizeof(TD);                            \
4661     uint32_t vlenb = simd_maxsz(desc);                    \
4662     uint32_t vta = vext_vta(desc);                        \
4663     uint32_t i;                                           \
4664     TD s1 =  *((TD *)vs1 + HD(0));                        \
4665                                                           \
4666     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4667                                                           \
4668     for (i = env->vstart; i < vl; i++) {                  \
4669         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4670         if (!vm && !vext_elem_mask(v0, i)) {              \
4671             continue;                                     \
4672         }                                                 \
4673         s1 = OP(s1, (TD)s2);                              \
4674     }                                                     \
4675     if (vl > 0) {                                         \
4676         *((TD *)vd + HD(0)) = s1;                         \
4677     }                                                     \
4678     env->vstart = 0;                                      \
4679     /* set tail elements to 1s */                         \
4680     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4681 }
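
/*
 * Example with vredsum_vs_w, vl = 4 and all elements active:
 * vs1[0] = 10, vs2 = {1, 2, 3, 4}  =>  vd[0] = 10 + 1 + 2 + 3 + 4 = 20.
 * Masked-off elements are skipped, and vd[0] is left untouched when
 * vl == 0.
 */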
4682 
4683 /* vd[0] = sum(vs1[0], vs2[*]) */
4684 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4685 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4686 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4687 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4688 
4689 /* vd[0] = maxu(vs1[0], vs2[*]) */
4690 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4691 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4692 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4693 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4694 
4695 /* vd[0] = max(vs1[0], vs2[*]) */
4696 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4697 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4698 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4699 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4700 
4701 /* vd[0] = minu(vs1[0], vs2[*]) */
4702 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4703 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4704 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4705 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4706 
4707 /* vd[0] = min(vs1[0], vs2[*]) */
4708 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4709 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4710 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4711 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4712 
4713 /* vd[0] = and(vs1[0], vs2[*]) */
4714 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4715 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4716 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4717 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4718 
4719 /* vd[0] = or(vs1[0], vs2[*]) */
4720 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4721 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4722 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4723 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4724 
4725 /* vd[0] = xor(vs1[0], vs2[*]) */
4726 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4727 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4728 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4729 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4730 
4731 /* Vector Widening Integer Reduction Instructions */
4732 /* Signed sum reduction into double-width accumulator */
4733 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4734 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4735 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4736 
4737 /* Unsigned sum reduction into double-width accumulator */
4738 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4739 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4740 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4741 
4742 /* Vector Single-Width Floating-Point Reduction Instructions */
4743 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4744 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4745                   void *vs2, CPURISCVState *env,           \
4746                   uint32_t desc)                           \
4747 {                                                          \
4748     uint32_t vm = vext_vm(desc);                           \
4749     uint32_t vl = env->vl;                                 \
4750     uint32_t esz = sizeof(TD);                             \
4751     uint32_t vlenb = simd_maxsz(desc);                     \
4752     uint32_t vta = vext_vta(desc);                         \
4753     uint32_t i;                                            \
4754     TD s1 =  *((TD *)vs1 + HD(0));                         \
4755                                                            \
4756     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4757                                                            \
4758     for (i = env->vstart; i < vl; i++) {                   \
4759         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4760         if (!vm && !vext_elem_mask(v0, i)) {               \
4761             continue;                                      \
4762         }                                                  \
4763         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4764     }                                                      \
4765     if (vl > 0) {                                          \
4766         *((TD *)vd + HD(0)) = s1;                          \
4767     }                                                      \
4768     env->vstart = 0;                                       \
4769     /* set tail elements to 1s */                          \
4770     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4771 }
4772 
4773 /* Unordered sum */
4774 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4775 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4776 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4777 
4778 /* Ordered sum */
4779 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4780 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4781 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
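
/*
 * Note that the ordered and unordered sums are generated from the same
 * GEN_VEXT_FRED macro, so both accumulate strictly in element order
 * here; the "unordered" form merely permits, but does not require, a
 * different association.
 */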
4782 
4783 /* Maximum value */
4784 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4785               float16_maximum_number)
4786 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4787               float32_maximum_number)
4788 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4789               float64_maximum_number)
4790 
4791 /* Minimum value */
4792 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4793               float16_minimum_number)
4794 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4795               float32_minimum_number)
4796 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4797               float64_minimum_number)
4798 
4799 /* Vector Widening Floating-Point Add Instructions */
4800 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4801 {
4802     return float32_add(a, float16_to_float32(b, true, s), s);
4803 }
4804 
4805 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4806 {
4807     return float64_add(a, float32_to_float64(b, s), s);
4808 }
4809 
4810 /* Vector Widening Floating-Point Reduction Instructions */
4811 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4812 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4813 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4814 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4815 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4816 
4817 /*
4818  * Vector Mask Operations
4819  */
4820 /* Vector Mask-Register Logical Instructions */
4821 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4822 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4823                   void *vs2, CPURISCVState *env,          \
4824                   uint32_t desc)                          \
4825 {                                                         \
4826     uint32_t vl = env->vl;                                \
4827     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4828     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4829     uint32_t i;                                           \
4830     int a, b;                                             \
4831                                                           \
4832     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4833                                                           \
4834     for (i = env->vstart; i < vl; i++) {                  \
4835         a = vext_elem_mask(vs1, i);                       \
4836         b = vext_elem_mask(vs2, i);                       \
4837         vext_set_elem_mask(vd, i, OP(b, a));              \
4838     }                                                     \
4839     env->vstart = 0;                                      \
4840     /*
4841      * The mask destination register is always tail-agnostic,
4842      * so set the tail elements to 1s.
4843      */                                                   \
4844     if (vta_all_1s) {                                     \
4845         for (; i < total_elems; i++) {                    \
4846             vext_set_elem_mask(vd, i, 1);                 \
4847         }                                                 \
4848     }                                                     \
4849 }
4850 
4851 #define DO_NAND(N, M)  (!(N & M))
4852 #define DO_ANDNOT(N, M)  (N & !M)
4853 #define DO_NOR(N, M)  (!(N | M))
4854 #define DO_ORNOT(N, M)  (N | !M)
4855 #define DO_XNOR(N, M)  (!(N ^ M))
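
/*
 * The operands here are single mask bits (0 or 1), so logical '!'
 * doubles as a bitwise NOT, e.g. DO_ANDNOT(1, 0) == (1 & !0) == 1.
 */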
4856 
4857 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4858 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4859 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4860 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4861 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4862 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4863 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4864 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4865 
4866 /* Vector count population in mask vcpop */
4867 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4868                              uint32_t desc)
4869 {
4870     target_ulong cnt = 0;
4871     uint32_t vm = vext_vm(desc);
4872     uint32_t vl = env->vl;
4873     int i;
4874 
4875     for (i = env->vstart; i < vl; i++) {
4876         if (vm || vext_elem_mask(v0, i)) {
4877             if (vext_elem_mask(vs2, i)) {
4878                 cnt++;
4879             }
4880         }
4881     }
4882     env->vstart = 0;
4883     return cnt;
4884 }
4885 
4886 /* vfirst find-first-set mask bit */
4887 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4888                               uint32_t desc)
4889 {
4890     uint32_t vm = vext_vm(desc);
4891     uint32_t vl = env->vl;
4892     int i;
4893 
4894     for (i = env->vstart; i < vl; i++) {
4895         if (vm || vext_elem_mask(v0, i)) {
4896             if (vext_elem_mask(vs2, i)) {
4897                 return i;
4898             }
4899         }
4900     }
4901     env->vstart = 0;
4902     return -1LL;
4903 }
4904 
4905 enum set_mask_type {
4906     ONLY_FIRST = 1,
4907     INCLUDE_FIRST,
4908     BEFORE_FIRST,
4909 };
4910 
4911 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4912                    uint32_t desc, enum set_mask_type type)
4913 {
4914     uint32_t vm = vext_vm(desc);
4915     uint32_t vl = env->vl;
4916     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4917     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4918     uint32_t vma = vext_vma(desc);
4919     int i;
4920     bool first_mask_bit = false;
4921 
4922     VSTART_CHECK_EARLY_EXIT(env, vl);
4923 
4924     for (i = env->vstart; i < vl; i++) {
4925         if (!vm && !vext_elem_mask(v0, i)) {
4926             /* set masked-off elements to 1s */
4927             if (vma) {
4928                 vext_set_elem_mask(vd, i, 1);
4929             }
4930             continue;
4931         }
4932         /* write a zero to all following active elements */
4933         if (first_mask_bit) {
4934             vext_set_elem_mask(vd, i, 0);
4935             continue;
4936         }
4937         if (vext_elem_mask(vs2, i)) {
4938             first_mask_bit = true;
4939             if (type == BEFORE_FIRST) {
4940                 vext_set_elem_mask(vd, i, 0);
4941             } else {
4942                 vext_set_elem_mask(vd, i, 1);
4943             }
4944         } else {
4945             if (type == ONLY_FIRST) {
4946                 vext_set_elem_mask(vd, i, 0);
4947             } else {
4948                 vext_set_elem_mask(vd, i, 1);
4949             }
4950         }
4951     }
4952     env->vstart = 0;
4953     /*
4954      * The mask destination register is always tail-agnostic,
4955      * so set the tail elements to 1s.
4956      */
4957     if (vta_all_1s) {
4958         for (; i < total_elems; i++) {
4959             vext_set_elem_mask(vd, i, 1);
4960         }
4961     }
4962 }
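
/*
 * Example: for an active source mask vs2 = {0, 0, 1, 0, 1} (element 0
 * first), the three variants produce:
 *   vmsbf.m (BEFORE_FIRST)  -> {1, 1, 0, 0, 0}
 *   vmsif.m (INCLUDE_FIRST) -> {1, 1, 1, 0, 0}
 *   vmsof.m (ONLY_FIRST)    -> {0, 0, 1, 0, 0}
 */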
4963 
4964 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4965                      uint32_t desc)
4966 {
4967     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4968 }
4969 
4970 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4971                      uint32_t desc)
4972 {
4973     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4974 }
4975 
4976 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4977                      uint32_t desc)
4978 {
4979     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4980 }
4981 
4982 /* Vector Iota Instruction */
4983 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4984 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4985                   uint32_t desc)                                          \
4986 {                                                                         \
4987     uint32_t vm = vext_vm(desc);                                          \
4988     uint32_t vl = env->vl;                                                \
4989     uint32_t esz = sizeof(ETYPE);                                         \
4990     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4991     uint32_t vta = vext_vta(desc);                                        \
4992     uint32_t vma = vext_vma(desc);                                        \
4993     uint32_t sum = 0;                                                     \
4994     int i;                                                                \
4995                                                                           \
4996     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
4997                                                                           \
4998     for (i = env->vstart; i < vl; i++) {                                  \
4999         if (!vm && !vext_elem_mask(v0, i)) {                              \
5000             /* set masked-off elements to 1s */                           \
5001             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5002             continue;                                                     \
5003         }                                                                 \
5004         *((ETYPE *)vd + H(i)) = sum;                                      \
5005         if (vext_elem_mask(vs2, i)) {                                     \
5006             sum++;                                                        \
5007         }                                                                 \
5008     }                                                                     \
5009     env->vstart = 0;                                                      \
5010     /* set tail elements to 1s */                                         \
5011     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5012 }
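
/*
 * viota writes the running count of set bits seen so far, e.g. with
 * vs2 = {1, 0, 1, 1} and all elements active: vd = {0, 1, 1, 2}.
 */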
5013 
5014 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5015 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5016 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5017 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
5018 
5019 /* Vector Element Index Instruction */
5020 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5021 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5022 {                                                                         \
5023     uint32_t vm = vext_vm(desc);                                          \
5024     uint32_t vl = env->vl;                                                \
5025     uint32_t esz = sizeof(ETYPE);                                         \
5026     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5027     uint32_t vta = vext_vta(desc);                                        \
5028     uint32_t vma = vext_vma(desc);                                        \
5029     int i;                                                                \
5030                                                                           \
5031     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5032                                                                           \
5033     for (i = env->vstart; i < vl; i++) {                                  \
5034         if (!vm && !vext_elem_mask(v0, i)) {                              \
5035             /* set masked-off elements to 1s */                           \
5036             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5037             continue;                                                     \
5038         }                                                                 \
5039         *((ETYPE *)vd + H(i)) = i;                                        \
5040     }                                                                     \
5041     env->vstart = 0;                                                      \
5042     /* set tail elements to 1s */                                         \
5043     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5044 }
5045 
5046 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5047 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5048 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5049 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5050 
5051 /*
5052  * Vector Permutation Instructions
5053  */
5054 
5055 /* Vector Slide Instructions */
5056 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5057 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5058                   CPURISCVState *env, uint32_t desc)                      \
5059 {                                                                         \
5060     uint32_t vm = vext_vm(desc);                                          \
5061     uint32_t vl = env->vl;                                                \
5062     uint32_t esz = sizeof(ETYPE);                                         \
5063     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5064     uint32_t vta = vext_vta(desc);                                        \
5065     uint32_t vma = vext_vma(desc);                                        \
5066     target_ulong offset = s1, i_min, i;                                   \
5067                                                                           \
5068     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5069                                                                           \
5070     i_min = MAX(env->vstart, offset);                                     \
5071     for (i = i_min; i < vl; i++) {                                        \
5072         if (!vm && !vext_elem_mask(v0, i)) {                              \
5073             /* set masked-off elements to 1s */                           \
5074             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5075             continue;                                                     \
5076         }                                                                 \
5077         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5078     }                                                                     \
5079     env->vstart = 0;                                                      \
5080     /* set tail elements to 1s */                                         \
5081     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5082 }
5083 
5084 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5085 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5086 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5087 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5088 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
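
/*
 * Elements below the slide offset are left untouched, e.g. with
 * offset rs1 = 2 and vl = 5 the active elements become
 * vd = {vd[0], vd[1], vs2[0], vs2[1], vs2[2]}.
 */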
5089 
5090 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5091 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5092                   CPURISCVState *env, uint32_t desc)                      \
5093 {                                                                         \
5094     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5095     uint32_t vm = vext_vm(desc);                                          \
5096     uint32_t vl = env->vl;                                                \
5097     uint32_t esz = sizeof(ETYPE);                                         \
5098     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5099     uint32_t vta = vext_vta(desc);                                        \
5100     uint32_t vma = vext_vma(desc);                                        \
5101     target_ulong i_max, i_min, i;                                         \
5102                                                                           \
5103     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5104                                                                           \
5105     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5106     i_max = MAX(i_min, env->vstart);                                      \
5107     for (i = env->vstart; i < i_max; ++i) {                               \
5108         if (!vm && !vext_elem_mask(v0, i)) {                              \
5109             /* set masked-off elements to 1s */                           \
5110             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5111             continue;                                                     \
5112         }                                                                 \
5113         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5114     }                                                                     \
5115                                                                           \
5116     for (i = i_max; i < vl; ++i) {                                        \
5117         if (vm || vext_elem_mask(v0, i)) {                                \
5118             *((ETYPE *)vd + H(i)) = 0;                                    \
5119         }                                                                 \
5120     }                                                                     \
5121                                                                           \
5122     env->vstart = 0;                                                      \
5123     /* set tail elements to 1s */                                         \
5124     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5125 }
5126 
5127 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5128 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5129 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5130 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5131 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
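
/*
 * Elements sourced from at or beyond vlmax read as zero, e.g. with
 * rs1 = 2, vlmax = 4 and vl = 4 the active elements become
 * vd = {vs2[2], vs2[3], 0, 0}.
 */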
5132 
5133 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5134 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5135                                  void *vs2, CPURISCVState *env,             \
5136                                  uint32_t desc)                             \
5137 {                                                                           \
5138     typedef uint##BITWIDTH##_t ETYPE;                                       \
5139     uint32_t vm = vext_vm(desc);                                            \
5140     uint32_t vl = env->vl;                                                  \
5141     uint32_t esz = sizeof(ETYPE);                                           \
5142     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5143     uint32_t vta = vext_vta(desc);                                          \
5144     uint32_t vma = vext_vma(desc);                                          \
5145     uint32_t i;                                                             \
5146                                                                             \
5147     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5148                                                                             \
5149     for (i = env->vstart; i < vl; i++) {                                    \
5150         if (!vm && !vext_elem_mask(v0, i)) {                                \
5151             /* set masked-off elements to 1s */                             \
5152             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5153             continue;                                                       \
5154         }                                                                   \
5155         if (i == 0) {                                                       \
5156             *((ETYPE *)vd + H(i)) = s1;                                     \
5157         } else {                                                            \
5158             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5159         }                                                                   \
5160     }                                                                       \
5161     env->vstart = 0;                                                        \
5162     /* set tail elements to 1s */                                           \
5163     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5164 }
5165 
5166 GEN_VEXT_VSLIDE1UP(8,  H1)
5167 GEN_VEXT_VSLIDE1UP(16, H2)
5168 GEN_VEXT_VSLIDE1UP(32, H4)
5169 GEN_VEXT_VSLIDE1UP(64, H8)
5170 
5171 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5172 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5173                   CPURISCVState *env, uint32_t desc)              \
5174 {                                                                 \
5175     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5176 }
5177 
5178 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5179 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5180 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5181 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5182 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
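
/*
 * Illustrative, standalone reference model of vslide1up.vx for one
 * element width (unmasked, vstart = 0): element 0 receives the scalar
 * and every later element takes its left neighbour from vs2.  The
 * function name is an example only.
 */
static void vslide1up_u32_ref(uint32_t *vd, const uint32_t *vs2,
                              uint32_t scalar, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (i == 0) ? scalar : vs2[i - 1];
    }
}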
5183 
5184 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5185 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5186                                    void *vs2, CPURISCVState *env,             \
5187                                    uint32_t desc)                             \
5188 {                                                                             \
5189     typedef uint##BITWIDTH##_t ETYPE;                                         \
5190     uint32_t vm = vext_vm(desc);                                              \
5191     uint32_t vl = env->vl;                                                    \
5192     uint32_t esz = sizeof(ETYPE);                                             \
5193     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5194     uint32_t vta = vext_vta(desc);                                            \
5195     uint32_t vma = vext_vma(desc);                                            \
5196     uint32_t i;                                                               \
5197                                                                               \
5198     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5199                                                                               \
5200     for (i = env->vstart; i < vl; i++) {                                      \
5201         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5202             /* set masked-off elements to 1s */                               \
5203             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5204             continue;                                                         \
5205         }                                                                     \
5206         if (i == vl - 1) {                                                    \
5207             *((ETYPE *)vd + H(i)) = s1;                                       \
5208         } else {                                                              \
5209             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5210         }                                                                     \
5211     }                                                                         \
5212     env->vstart = 0;                                                          \
5213     /* set tail elements to 1s */                                             \
5214     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5215 }
5216 
5217 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5218 GEN_VEXT_VSLIDE1DOWN(16, H2)
5219 GEN_VEXT_VSLIDE1DOWN(32, H4)
5220 GEN_VEXT_VSLIDE1DOWN(64, H8)
5221 
5222 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5223 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5224                   CPURISCVState *env, uint32_t desc)              \
5225 {                                                                 \
5226     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5227 }
5228 
5229 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5230 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5231 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5232 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5233 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
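
/*
 * Illustrative counterpart for vslide1down.vx (unmasked, vstart = 0):
 * every element takes its right neighbour from vs2 and the last active
 * element, vl - 1, receives the scalar.  This is the pattern that makes
 * vslide1down convenient for shift-register style loops.  Example name
 * only.
 */
static void vslide1down_u32_ref(uint32_t *vd, const uint32_t *vs2,
                                uint32_t scalar, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (i == vl - 1) ? scalar : vs2[i + 1];
    }
}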
5234 
5235 /* Vector Floating-Point Slide Instructions */
5236 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5237 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5238                   CPURISCVState *env, uint32_t desc)          \
5239 {                                                             \
5240     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5241 }
5242 
5243 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5244 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5245 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5246 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5247 
5248 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5249 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5250                   CPURISCVState *env, uint32_t desc)          \
5251 {                                                             \
5252     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5253 }
5254 
5255 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5256 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5257 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5258 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5259 
5260 /* Vector Register Gather Instructions */
5261 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5262 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5263                   CPURISCVState *env, uint32_t desc)                      \
5264 {                                                                         \
5265     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5266     uint32_t vm = vext_vm(desc);                                          \
5267     uint32_t vl = env->vl;                                                \
5268     uint32_t esz = sizeof(TS2);                                           \
5269     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5270     uint32_t vta = vext_vta(desc);                                        \
5271     uint32_t vma = vext_vma(desc);                                        \
5272     uint64_t index;                                                       \
5273     uint32_t i;                                                           \
5274                                                                           \
5275     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5276                                                                           \
5277     for (i = env->vstart; i < vl; i++) {                                  \
5278         if (!vm && !vext_elem_mask(v0, i)) {                              \
5279             /* set masked-off elements to 1s */                           \
5280             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5281             continue;                                                     \
5282         }                                                                 \
5283         index = *((TS1 *)vs1 + HS1(i));                                   \
5284         if (index >= vlmax) {                                             \
5285             *((TS2 *)vd + HS2(i)) = 0;                                    \
5286         } else {                                                          \
5287             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5288         }                                                                 \
5289     }                                                                     \
5290     env->vstart = 0;                                                      \
5291     /* set tail elements to 1s */                                         \
5292     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5293 }
5294 
5295 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5296 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5297 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5298 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5299 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5300 
5301 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5302 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5303 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5304 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
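
/*
 * Illustrative, standalone reference model of vrgatherei16.vv with
 * 32-bit data: the index vector always has 16-bit elements (EEW = 16)
 * regardless of SEW, which is why the macro takes separate index
 * (TS1/HS1) and data (TS2/HS2) types.  Out-of-range indices read as 0.
 * Names are examples only.
 */
static void vrgatherei16_u32_ref(uint32_t *vd, const uint16_t *idx,
                                 const uint32_t *vs2, uint32_t vl,
                                 uint32_t vlmax)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (idx[i] >= vlmax) ? 0 : vs2[idx[i]];
    }
}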
5305 
5306 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5307 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5308                   CPURISCVState *env, uint32_t desc)                      \
5309 {                                                                         \
5310     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5311     uint32_t vm = vext_vm(desc);                                          \
5312     uint32_t vl = env->vl;                                                \
5313     uint32_t esz = sizeof(ETYPE);                                         \
5314     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5315     uint32_t vta = vext_vta(desc);                                        \
5316     uint32_t vma = vext_vma(desc);                                        \
5317     uint64_t index = s1;                                                  \
5318     uint32_t i;                                                           \
5319                                                                           \
5320     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5321                                                                           \
5322     for (i = env->vstart; i < vl; i++) {                                  \
5323         if (!vm && !vext_elem_mask(v0, i)) {                              \
5324             /* set masked-off elements to 1s */                           \
5325             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5326             continue;                                                     \
5327         }                                                                 \
5328         if (index >= vlmax) {                                             \
5329             *((ETYPE *)vd + H(i)) = 0;                                    \
5330         } else {                                                          \
5331             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5332         }                                                                 \
5333     }                                                                     \
5334     env->vstart = 0;                                                      \
5335     /* set tail elements to 1s */                                         \
5336     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5337 }
5338 
5339 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5340 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5341 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5342 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5343 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
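
/*
 * Illustrative sketch of vrgather.vx (unmasked): the index is a single
 * scalar, so the result is a splat of vs2[x[rs1]] across the active
 * elements, or all zeroes when the index is >= VLMAX.  With index 0
 * this broadcasts element 0 of vs2.  The function name is an example
 * only.
 */
static void vrgather_vx_u32_ref(uint32_t *vd, const uint32_t *vs2,
                                uint64_t index, uint32_t vl,
                                uint32_t vlmax)
{
    uint32_t splat = (index >= vlmax) ? 0 : vs2[index];
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = splat;
    }
}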
5344 
5345 /* Vector Compress Instruction */
5346 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5347 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5348                   CPURISCVState *env, uint32_t desc)                      \
5349 {                                                                         \
5350     uint32_t vl = env->vl;                                                \
5351     uint32_t esz = sizeof(ETYPE);                                         \
5352     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5353     uint32_t vta = vext_vta(desc);                                        \
5354     uint32_t num = 0, i;                                                  \
5355                                                                           \
5356     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5357                                                                           \
5358     for (i = env->vstart; i < vl; i++) {                                  \
5359         if (!vext_elem_mask(vs1, i)) {                                    \
5360             continue;                                                     \
5361         }                                                                 \
5362         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5363         num++;                                                            \
5364     }                                                                     \
5365     env->vstart = 0;                                                      \
5366     /* set tail elements to 1s */                                         \
5367     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5368 }
5369 
5370 /* vcompress.vm vd, vs2, vs1 # compress vs2 elements enabled by vs1 into vd */
5371 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5372 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5373 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5374 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
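
/*
 * Illustrative, standalone reference model of vcompress.vm: source
 * elements whose mask bit in vs1 is set are packed, in order, into the
 * lowest-numbered destination elements, and the packed count is where
 * tail handling starts (cf. the num * esz argument above).  The mask is
 * modelled as one byte per element here; the real helper reads packed
 * mask bits via vext_elem_mask().  Names are examples only.
 */
static uint32_t vcompress_u32_ref(uint32_t *vd, const uint8_t *mask,
                                  const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (mask[i]) {
            vd[num++] = vs2[i];
        }
    }
    return num; /* number of elements written */
}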
5375 
5376 /* Vector Whole Register Move */
5377 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5378 {
5379     /* EEW = SEW */
5380     uint32_t maxsz = simd_maxsz(desc);
5381     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5382     uint32_t startb = env->vstart * sewb;
5383     uint32_t i = startb;
5384 
5385     if (startb >= maxsz) {
5386         env->vstart = 0;
5387         return;
5388     }
5389 
5390     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5391         uint32_t j = ROUND_UP(i, 8);
5392         memcpy((uint8_t *)vd + H1(j - 1),
5393                (uint8_t *)vs2 + H1(j - 1),
5394                j - i);
5395         i = j;
5396     }
5397 
5398     memcpy((uint8_t *)vd + H1(i),
5399            (uint8_t *)vs2 + H1(i),
5400            maxsz - i);
5401 
5402     env->vstart = 0;
5403 }
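
/*
 * Illustrative little-endian-only sketch of the whole-register move
 * above: the operation is a plain byte copy of maxsz - startb bytes,
 * where maxsz (simd_maxsz(desc)) covers all NF registers of the move
 * and startb resumes from a mid-instruction vstart.  The extra
 * HOST_BIG_ENDIAN block in the real helper exists because H1() swizzles
 * byte order within each 8-byte unit, so a resume point that is not
 * 8-byte aligned must finish its partial unit first.  The function name
 * is an example only.
 */
static void vmvr_le_ref(uint8_t *vd, const uint8_t *vs2,
                        uint32_t maxsz, uint32_t startb)
{
    if (startb >= maxsz) {
        return;
    }
    memcpy(vd + startb, vs2 + startb, maxsz - startb);
}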
5404 
5405 /* Vector Integer Extension */
5406 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5407 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5408                   CPURISCVState *env, uint32_t desc)             \
5409 {                                                                \
5410     uint32_t vl = env->vl;                                       \
5411     uint32_t vm = vext_vm(desc);                                 \
5412     uint32_t esz = sizeof(ETYPE);                                \
5413     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5414     uint32_t vta = vext_vta(desc);                               \
5415     uint32_t vma = vext_vma(desc);                               \
5416     uint32_t i;                                                  \
5417                                                                  \
5418     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5419                                                                  \
5420     for (i = env->vstart; i < vl; i++) {                         \
5421         if (!vm && !vext_elem_mask(v0, i)) {                     \
5422             /* set masked-off elements to 1s */                  \
5423             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5424             continue;                                            \
5425         }                                                        \
5426         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5427     }                                                            \
5428     env->vstart = 0;                                             \
5429     /* set tail elements to 1s */                                \
5430     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5431 }
5432 
5433 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5434 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5435 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5436 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5437 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5438 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5439 
5440 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5441 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5442 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5443 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5444 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5445 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
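
/*
 * Illustrative, standalone reference of one instantiation above,
 * vsext.vf4 with SEW = 32: each source element is SEW/4 = 8 bits wide
 * and is widened purely by the C integer conversion in the assignment,
 * exactly as the macro's element store does.  The function name is an
 * example only.
 */
static void vsext_vf4_w_ref(int32_t *vd, const int8_t *vs2, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = vs2[i]; /* int8_t -> int32_t sign-extends */
    }
}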
5446