xref: /qemu/target/riscv/vector_helper.c (revision 42fa9665e598c268a7ccfab5b92636618d9574ec)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "accel/tcg/cpu-ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "exec/tlb-flags.h"
29 #include "exec/target_page.h"
30 #include "exec/tswap.h"
31 #include "fpu/softfloat.h"
32 #include "tcg/tcg-gvec-desc.h"
33 #include "internals.h"
34 #include "vector_internals.h"
35 #include <math.h>
36 
37 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
38                             target_ulong s2)
39 {
40     int vlmax, vl;
41     RISCVCPU *cpu = env_archcpu(env);
42     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
43     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
44     uint16_t sew = 8 << vsew;
45     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
46     int xlen = riscv_cpu_xlen(env);
47     bool vill = (s2 >> (xlen - 1)) & 0x1;
48     target_ulong reserved = s2 &
49                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
50                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
51     uint16_t vlen = cpu->cfg.vlenb << 3;
52     int8_t lmul;
53 
54     if (vlmul & 4) {
55         /*
56          * Fractional LMUL, check:
57          *
58          * VLEN * LMUL >= SEW
59          * VLEN >> (8 - lmul) >= sew
60          * (vlenb << 3) >> (8 - lmul) >= sew
61          */
62         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
63             vill = true;
64         }
65     }
66 
67     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
68         /* only set vill bit. */
69         env->vill = 1;
70         env->vtype = 0;
71         env->vl = 0;
72         env->vstart = 0;
73         return 0;
74     }
75 
76     /* lmul encoded as in DisasContext::lmul */
77     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
78     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
79     if (s1 <= vlmax) {
80         vl = s1;
81     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
82         vl = (s1 + 1) >> 1;
83     } else {
84         vl = vlmax;
85     }
86     env->vl = vl;
87     env->vtype = s2;
88     env->vstart = 0;
89     env->vill = 0;
90     return vl;
91 }
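
/*
 * Worked example (illustrative values only): assume VLEN = 128 bits
 * (cpu->cfg.vlenb = 16) and a vtype requesting SEW = 32 (vsew = 2) and
 * LMUL = 1, so that vlmax = VLEN / SEW * LMUL = 128 / 32 * 1 = 4.  For an
 * AVL of s1 = 6 with rvv_vl_half_avl enabled, vlmax < 6 < 2 * vlmax and
 * therefore vl = (6 + 1) >> 1 = 3; without that option, vl = vlmax = 4.
 */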
92 
93 /*
94  * Get the maximum number of elements that can be operated on.
95  *
96  * log2_esz: log2 of element size in bytes.
97  */
98 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
99 {
100     /*
101      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
102      * so vlen in bytes (vlenb) is encoded as maxsz.
103      */
104     uint32_t vlenb = simd_maxsz(desc);
105 
106     /* Return VLMAX */
107     int scale = vext_lmul(desc) - log2_esz;
108     return scale < 0 ? vlenb >> -scale : vlenb << scale;
109 }
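
/*
 * For instance (illustrative numbers): with vlenb = 16 encoded as maxsz,
 * LMUL = 2 (vext_lmul(desc) = 1) and 32-bit elements (log2_esz = 2),
 * scale = 1 - 2 = -1 and VLMAX = 16 >> 1 = 8 elements.
 */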
110 
111 /*
112  * This function checks watchpoints before the real load operation.
113  *
114  * In system mode, the TLB API probe_access is enough for the watchpoint check.
115  * In user mode, there is no watchpoint support for now.
116  *
117  * It will trigger an exception if there is no mapping in the TLB and the
118  * page table walk can't fill the TLB entry. The guest software can then
119  * return here after processing the exception, or never return.
120  */
121 static void probe_pages(CPURISCVState *env, target_ulong addr,
122                         target_ulong len, uintptr_t ra,
123                         MMUAccessType access_type)
124 {
125     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
126     target_ulong curlen = MIN(pagelen, len);
127     int mmu_index = riscv_env_mmu_index(env, false);
128 
129     probe_access(env, adjust_addr(env, addr), curlen, access_type,
130                  mmu_index, ra);
131     if (len > curlen) {
132         addr += curlen;
133         curlen = len - curlen;
134         probe_access(env, adjust_addr(env, addr), curlen, access_type,
135                      mmu_index, ra);
136     }
137 }
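
/*
 * Hypothetical example (assuming 4 KiB pages): probing 16 bytes starting at
 * addr = 0xffc first probes the 4 bytes left on the current page
 * (pagelen = 0x1000 - 0xffc) and then the 12 bytes spilling into the next
 * page, so both translations are validated before any data is accessed.
 */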
138 
139 static inline void vext_set_elem_mask(void *v0, int index,
140                                       uint8_t value)
141 {
142     int idx = index / 64;
143     int pos = index % 64;
144     uint64_t old = ((uint64_t *)v0)[idx];
145     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
146 }
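
/*
 * E.g. vext_set_elem_mask(v0, 70, 1) updates word idx = 70 / 64 = 1 at bit
 * position pos = 70 % 64 = 6, depositing a single bit into the packed mask
 * register without disturbing the other 63 bits of that word.
 */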
147 
148 /* elements operations for load and store */
149 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
150                                    uint32_t idx, void *vd, uintptr_t retaddr);
151 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
152 
153 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
154 static inline QEMU_ALWAYS_INLINE                            \
155 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
156                 uint32_t idx, void *vd, uintptr_t retaddr)  \
157 {                                                           \
158     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
159     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
160 }                                                           \
161                                                             \
162 static inline QEMU_ALWAYS_INLINE                            \
163 void NAME##_host(void *vd, uint32_t idx, void *host)        \
164 {                                                           \
165     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
166     *cur = (ETYPE)LDSUF##_p(host);                          \
167 }
168 
169 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
170 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
171 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
172 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
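
/*
 * For reference, GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl) expands to
 * roughly the following (inline attributes omitted):
 *
 *     void lde_w_tlb(CPURISCVState *env, abi_ptr addr,
 *                    uint32_t idx, void *vd, uintptr_t retaddr)
 *     {
 *         uint32_t *cur = ((uint32_t *)vd + H4(idx));
 *         *cur = cpu_ldl_data_ra(env, addr, retaddr);
 *     }
 *
 *     void lde_w_host(void *vd, uint32_t idx, void *host)
 *     {
 *         uint32_t *cur = ((uint32_t *)vd + H4(idx));
 *         *cur = (uint32_t)ldl_p(host);
 *     }
 */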
173 
174 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
175 static inline QEMU_ALWAYS_INLINE                            \
176 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
177                 uint32_t idx, void *vd, uintptr_t retaddr)  \
178 {                                                           \
179     ETYPE data = *((ETYPE *)vd + H(idx));                   \
180     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
181 }                                                           \
182                                                             \
183 static inline QEMU_ALWAYS_INLINE                            \
184 void NAME##_host(void *vd, uint32_t idx, void *host)        \
185 {                                                           \
186     ETYPE data = *((ETYPE *)vd + H(idx));                   \
187     STSUF##_p(host, data);                                  \
188 }
189 
190 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
191 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
192 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
193 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
194 
195 static inline QEMU_ALWAYS_INLINE void
196 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
197                        void *vd, uint32_t evl, target_ulong addr,
198                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
199                        bool is_load)
200 {
201     uint32_t i;
202     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
203         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
204     }
205 }
206 
207 static inline QEMU_ALWAYS_INLINE void
208 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
209                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
210                         uint32_t esz, bool is_load)
211 {
212 #if HOST_BIG_ENDIAN
213     for (; reg_start < evl; reg_start++, host += esz) {
214         ldst_host(vd, reg_start, host);
215     }
216 #else
217     if (esz == 1) {
218         uint32_t byte_offset = reg_start * esz;
219         uint32_t size = (evl - reg_start) * esz;
220 
221         if (is_load) {
222             memcpy(vd + byte_offset, host, size);
223         } else {
224             memcpy(host, vd + byte_offset, size);
225         }
226     } else {
227         for (; reg_start < evl; reg_start++, host += esz) {
228             ldst_host(vd, reg_start, host);
229         }
230     }
231 #endif
232 }
233 
234 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
235                                    uint32_t desc, uint32_t nf,
236                                    uint32_t esz, uint32_t max_elems)
237 {
238     uint32_t vta = vext_vta(desc);
239     int k;
240 
241     if (vta == 0) {
242         return;
243     }
244 
245     for (k = 0; k < nf; ++k) {
246         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
247                           (k * max_elems + max_elems) * esz);
248     }
249 }
250 
251 /*
252  * stride: access vector elements from strided memory
253  */
254 static void
255 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
256                  CPURISCVState *env, uint32_t desc, uint32_t vm,
257                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
258                  uintptr_t ra)
259 {
260     uint32_t i, k;
261     uint32_t nf = vext_nf(desc);
262     uint32_t max_elems = vext_max_elems(desc, log2_esz);
263     uint32_t esz = 1 << log2_esz;
264     uint32_t vma = vext_vma(desc);
265 
266     VSTART_CHECK_EARLY_EXIT(env, env->vl);
267 
268     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
269         k = 0;
270         while (k < nf) {
271             if (!vm && !vext_elem_mask(v0, i)) {
272                 /* set masked-off elements to 1s */
273                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
274                                   (i + k * max_elems + 1) * esz);
275                 k++;
276                 continue;
277             }
278             target_ulong addr = base + stride * i + (k << log2_esz);
279             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
280             k++;
281         }
282     }
283     env->vstart = 0;
284 
285     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
286 }
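
/*
 * As an illustration (made-up operands): a vlse32.v with base = 0x1000,
 * stride = 12 and nf = 1 loads element i from address 0x1000 + 12 * i,
 * i.e. 0x1000, 0x100c, 0x1018, ...; for the segment variant vlsseg2e32.v
 * (nf = 2), the second field of segment i comes from 0x1000 + 12 * i + 4.
 */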
287 
288 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
289 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
290                   target_ulong stride, CPURISCVState *env,              \
291                   uint32_t desc)                                        \
292 {                                                                       \
293     uint32_t vm = vext_vm(desc);                                        \
294     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
295                      ctzl(sizeof(ETYPE)), GETPC());                     \
296 }
297 
298 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
299 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
300 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
301 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
302 
303 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
304 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
305                   target_ulong stride, CPURISCVState *env,              \
306                   uint32_t desc)                                        \
307 {                                                                       \
308     uint32_t vm = vext_vm(desc);                                        \
309     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
310                      ctzl(sizeof(ETYPE)), GETPC());                     \
311 }
312 
313 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
314 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
315 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
316 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
317 
318 /*
319  * unit-stride: access elements stored contiguously in memory
320  */
321 
322 /* unmasked unit-stride load and store operation */
323 static inline QEMU_ALWAYS_INLINE void
324 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
325                   uint32_t elems, uint32_t nf, uint32_t max_elems,
326                   uint32_t log2_esz, bool is_load, int mmu_index,
327                   vext_ldst_elem_fn_tlb *ldst_tlb,
328                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
329 {
330     void *host;
331     int i, k, flags;
332     uint32_t esz = 1 << log2_esz;
333     uint32_t size = (elems * nf) << log2_esz;
334     uint32_t evl = env->vstart + elems;
335     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
336 
337     /* Check page permission/pmp/watchpoint/etc. */
338     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
339                                mmu_index, true, &host, ra);
340 
341     if (flags == 0) {
342         if (nf == 1) {
343             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
344                                       host, esz, is_load);
345         } else {
346             for (i = env->vstart; i < evl; ++i) {
347                 k = 0;
348                 while (k < nf) {
349                     ldst_host(vd, i + k * max_elems, host);
350                     host += esz;
351                     k++;
352                 }
353             }
354         }
355         env->vstart += elems;
356     } else {
357         if (nf == 1) {
358             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
359                                    ra, esz, is_load);
360         } else {
361             /* load bytes from guest memory */
362             for (i = env->vstart; i < evl; env->vstart = ++i) {
363                 k = 0;
364                 while (k < nf) {
365                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
366                              vd, ra);
367                     addr += esz;
368                     k++;
369                 }
370             }
371         }
372     }
373 }
374 
375 static inline QEMU_ALWAYS_INLINE void
376 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
377              vext_ldst_elem_fn_tlb *ldst_tlb,
378              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
379              uint32_t evl, uintptr_t ra, bool is_load)
380 {
381     uint32_t k;
382     target_ulong page_split, elems, addr;
383     uint32_t nf = vext_nf(desc);
384     uint32_t max_elems = vext_max_elems(desc, log2_esz);
385     uint32_t esz = 1 << log2_esz;
386     uint32_t msize = nf * esz;
387     int mmu_index = riscv_env_mmu_index(env, false);
388 
389     VSTART_CHECK_EARLY_EXIT(env, evl);
390 
391 #if defined(CONFIG_USER_ONLY)
392     /*
393      * For data sizes <= 6 bytes we get better performance by simply calling
394      * vext_continuous_ldst_tlb
395      */
396     if (nf == 1 && (evl << log2_esz) <= 6) {
397         addr = base + (env->vstart << log2_esz);
398         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
399                                  esz, is_load);
400 
401         env->vstart = 0;
402         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
403         return;
404     }
405 #endif
406 
407     /* Calculate the page range of first page */
408     addr = base + ((env->vstart * nf) << log2_esz);
409     page_split = -(addr | TARGET_PAGE_MASK);
410     /* Get number of elements */
411     elems = page_split / msize;
412     if (unlikely(env->vstart + elems >= evl)) {
413         elems = evl - env->vstart;
414     }
415 
416     /* Load/store elements in the first page */
417     if (likely(elems)) {
418         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
419                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
420     }
421 
422     /* Load/store elements in the second page */
423     if (unlikely(env->vstart < evl)) {
424         /* Cross page element */
425         if (unlikely(page_split % msize)) {
426             for (k = 0; k < nf; k++) {
427                 addr = base + ((env->vstart * nf + k) << log2_esz);
428                 ldst_tlb(env, adjust_addr(env, addr),
429                         env->vstart + k * max_elems, vd, ra);
430             }
431             env->vstart++;
432         }
433 
434         addr = base + ((env->vstart * nf) << log2_esz);
435         /* Get number of elements of second page */
436         elems = evl - env->vstart;
437 
438         /* Load/store elements in the second page */
439         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
440                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
441     }
442 
443     env->vstart = 0;
444     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
445 }
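
/*
 * Illustrative page-split arithmetic (4 KiB pages assumed): for a vle32.v
 * with base = 0xff8, vstart = 0 and nf = 1, page_split = 0x1000 - 0xff8 = 8,
 * so elems = 8 / 4 = 2 elements are handled on the first page and the rest
 * on the second; when msize does not divide page_split, the single element
 * straddling the boundary is additionally handled field by field via
 * ldst_tlb.
 */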
446 
447 /*
448  * A masked unit-stride load or store operation is a special case of a
449  * strided operation, with stride = NF * sizeof(ETYPE)
450  */
451 
452 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
453 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
454                          CPURISCVState *env, uint32_t desc)         \
455 {                                                                   \
456     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
457     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
458                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
459 }                                                                   \
460                                                                     \
461 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
462                   CPURISCVState *env, uint32_t desc)                \
463 {                                                                   \
464     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
465                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
466 }
467 
468 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
469 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
470 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
471 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
472 
473 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
474 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
475                          CPURISCVState *env, uint32_t desc)              \
476 {                                                                        \
477     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
478     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
479                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
480 }                                                                        \
481                                                                          \
482 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
483                   CPURISCVState *env, uint32_t desc)                     \
484 {                                                                        \
485     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
486                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
487 }
488 
489 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
490 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
491 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
492 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
493 
494 /*
495  * unit stride mask load and store, EEW = 1
496  */
497 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
498                     CPURISCVState *env, uint32_t desc)
499 {
500     /* evl = ceil(vl/8) */
501     uint8_t evl = (env->vl + 7) >> 3;
502     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
503                  0, evl, GETPC(), true);
504 }
505 
506 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
507                     CPURISCVState *env, uint32_t desc)
508 {
509     /* evl = ceil(vl/8) */
510     uint8_t evl = (env->vl + 7) >> 3;
511     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
512                  0, evl, GETPC(), false);
513 }
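
/*
 * E.g. with vl = 17 the mask occupies ceil(17 / 8) = 3 bytes, so vlm.v and
 * vsm.v transfer evl = 3 byte elements regardless of the SEW/LMUL currently
 * programmed in vtype.
 */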
514 
515 /*
516  * index: access vector elements from indexed memory
517  */
518 typedef target_ulong vext_get_index_addr(target_ulong base,
519         uint32_t idx, void *vs2);
520 
521 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
522 static target_ulong NAME(target_ulong base,            \
523                          uint32_t idx, void *vs2)      \
524 {                                                      \
525     return (base + *((ETYPE *)vs2 + H(idx)));          \
526 }
527 
528 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
529 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
530 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
531 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
532 
533 static inline void
534 vext_ldst_index(void *vd, void *v0, target_ulong base,
535                 void *vs2, CPURISCVState *env, uint32_t desc,
536                 vext_get_index_addr get_index_addr,
537                 vext_ldst_elem_fn_tlb *ldst_elem,
538                 uint32_t log2_esz, uintptr_t ra)
539 {
540     uint32_t i, k;
541     uint32_t nf = vext_nf(desc);
542     uint32_t vm = vext_vm(desc);
543     uint32_t max_elems = vext_max_elems(desc, log2_esz);
544     uint32_t esz = 1 << log2_esz;
545     uint32_t vma = vext_vma(desc);
546 
547     VSTART_CHECK_EARLY_EXIT(env, env->vl);
548 
549     /* load bytes from guest memory */
550     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
551         k = 0;
552         while (k < nf) {
553             if (!vm && !vext_elem_mask(v0, i)) {
554                 /* set masked-off elements to 1s */
555                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
556                                   (i + k * max_elems + 1) * esz);
557                 k++;
558                 continue;
559             }
560             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
561             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
562             k++;
563         }
564     }
565     env->vstart = 0;
566 
567     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
568 }
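
/*
 * Sketch of the address computation (example values only): for the
 * vlxei16_32_v helper, element i of vs2 holds a 16-bit unsigned byte
 * offset, so with base = 0x2000 and vs2[i] = 0x30 the 32-bit element i is
 * loaded from 0x2030; segment fields (k > 0) follow at 0x2034, 0x2038, ...
 */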
569 
570 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
571 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
572                   void *vs2, CPURISCVState *env, uint32_t desc)            \
573 {                                                                          \
574     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
575                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
576 }
577 
578 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
579 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
580 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
581 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
582 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
583 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
584 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
585 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
586 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
587 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
588 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
589 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
590 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
591 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
592 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
593 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
594 
595 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
596 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
597                   void *vs2, CPURISCVState *env, uint32_t desc)  \
598 {                                                                \
599     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
600                     STORE_FN, ctzl(sizeof(ETYPE)),               \
601                     GETPC());                                    \
602 }
603 
604 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
605 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
606 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
607 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
608 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
609 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
610 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
611 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
612 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
613 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
614 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
615 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
616 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
617 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
618 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
619 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
620 
621 /*
622  * unit-stride fault-only-first load instructions
623  */
624 static inline void
625 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
626           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
627           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
628 {
629     uint32_t i, k, vl = 0;
630     uint32_t nf = vext_nf(desc);
631     uint32_t vm = vext_vm(desc);
632     uint32_t max_elems = vext_max_elems(desc, log2_esz);
633     uint32_t esz = 1 << log2_esz;
634     uint32_t msize = nf * esz;
635     uint32_t vma = vext_vma(desc);
636     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
637     int mmu_index = riscv_env_mmu_index(env, false);
638     int flags;
639     void *host;
640 
641     VSTART_CHECK_EARLY_EXIT(env, env->vl);
642 
643     addr = base + ((env->vstart * nf) << log2_esz);
644     page_split = -(addr | TARGET_PAGE_MASK);
645     /* Get number of elements */
646     elems = page_split / msize;
647     if (unlikely(env->vstart + elems >= env->vl)) {
648         elems = env->vl - env->vstart;
649     }
650 
651     /* Check page permission/pmp/watchpoint/etc. */
652     flags = probe_access_flags(env, adjust_addr(env, addr), elems * msize,
653                                MMU_DATA_LOAD, mmu_index, true, &host, ra);
654 
655     /* If we are crossing a page check also the second page. */
656     if (env->vl > elems) {
657         addr_probe = addr + (elems << log2_esz);
658         flags |= probe_access_flags(env, adjust_addr(env, addr_probe),
659                                     elems * msize, MMU_DATA_LOAD, mmu_index,
660                                     true, &host, ra);
661     }
662 
663     if (flags & ~TLB_WATCHPOINT) {
664         /* probe every access */
665         for (i = env->vstart; i < env->vl; i++) {
666             if (!vm && !vext_elem_mask(v0, i)) {
667                 continue;
668             }
669             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
670             if (i == 0) {
671                 /* Allow fault on first element. */
672                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD);
673             } else {
674                 remain = nf << log2_esz;
675                 while (remain > 0) {
676                     offset = -(addr_i | TARGET_PAGE_MASK);
677 
678                     /* Probe nonfault on subsequent elements. */
679                     flags = probe_access_flags(env, addr_i, offset,
680                                                MMU_DATA_LOAD, mmu_index, true,
681                                                &host, 0);
682 
683                     /*
684                      * Stop if invalid (unmapped) or mmio (transaction may
685                      * fail). Do not stop if watchpoint, as the spec says that
686                      * first-fault should continue to access the same
687                      * elements regardless of any watchpoint.
688                      */
689                     if (flags & ~TLB_WATCHPOINT) {
690                         vl = i;
691                         goto ProbeSuccess;
692                     }
693                     if (remain <= offset) {
694                         break;
695                     }
696                     remain -= offset;
697                     addr_i = adjust_addr(env, addr_i + offset);
698                 }
699             }
700         }
701     }
702 ProbeSuccess:
703     /* load bytes from guest memory */
704     if (vl != 0) {
705         env->vl = vl;
706     }
707 
708     if (env->vstart < env->vl) {
709         if (vm) {
710             /* Load/store elements in the first page */
711             if (likely(elems)) {
712                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
713                                   log2_esz, true, mmu_index, ldst_tlb,
714                                   ldst_host, ra);
715             }
716 
717             /* Load/store elements in the second page */
718             if (unlikely(env->vstart < env->vl)) {
719                 /* Cross page element */
720                 if (unlikely(page_split % msize)) {
721                     for (k = 0; k < nf; k++) {
722                         addr = base + ((env->vstart * nf + k) << log2_esz);
723                         ldst_tlb(env, adjust_addr(env, addr),
724                                  env->vstart + k * max_elems, vd, ra);
725                     }
726                     env->vstart++;
727                 }
728 
729                 addr = base + ((env->vstart * nf) << log2_esz);
730                 /* Get number of elements of second page */
731                 elems = env->vl - env->vstart;
732 
733                 /* Load/store elements in the second page */
734                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
735                                   log2_esz, true, mmu_index, ldst_tlb,
736                                   ldst_host, ra);
737             }
738         } else {
739             for (i = env->vstart; i < env->vl; i++) {
740                 k = 0;
741                 while (k < nf) {
742                     if (!vext_elem_mask(v0, i)) {
743                         /* set masked-off elements to 1s */
744                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
745                                           (i + k * max_elems + 1) * esz);
746                         k++;
747                         continue;
748                     }
749                     addr = base + ((i * nf + k) << log2_esz);
750                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
751                              vd, ra);
752                     k++;
753                 }
754             }
755         }
756     }
757     env->vstart = 0;
758 
759     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
760 }
761 
762 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
763 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
764                   CPURISCVState *env, uint32_t desc)            \
765 {                                                               \
766     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
767               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
768 }
769 
770 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
771 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
772 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
773 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
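
/*
 * Behavioural sketch (hypothetical fault): if a vle32ff.v with vl = 8 hits
 * an unmapped page at element 5, element 0 is always allowed to fault
 * normally; for the later elements the nonfault probe detects the bad
 * mapping, vl is trimmed to 5, elements 0..4 are loaded and the tail is
 * handled as usual.
 */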
774 
775 #define DO_SWAP(N, M) (M)
776 #define DO_AND(N, M)  (N & M)
777 #define DO_XOR(N, M)  (N ^ M)
778 #define DO_OR(N, M)   (N | M)
779 #define DO_ADD(N, M)  (N + M)
780 
781 /* Signed min/max */
782 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
783 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
784 
785 /*
786  * load and store whole register instructions
787  */
788 static inline QEMU_ALWAYS_INLINE void
789 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
790                 vext_ldst_elem_fn_tlb *ldst_tlb,
791                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
792                 uintptr_t ra, bool is_load)
793 {
794     target_ulong page_split, elems, addr;
795     uint32_t nf = vext_nf(desc);
796     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
797     uint32_t max_elems = vlenb >> log2_esz;
798     uint32_t evl = nf * max_elems;
799     uint32_t esz = 1 << log2_esz;
800     int mmu_index = riscv_env_mmu_index(env, false);
801 
802     /* Calculate the page range of first page */
803     addr = base + (env->vstart << log2_esz);
804     page_split = -(addr | TARGET_PAGE_MASK);
805     /* Get number of elements */
806     elems = page_split / esz;
807     if (unlikely(env->vstart + elems >= evl)) {
808         elems = evl - env->vstart;
809     }
810 
811     /* Load/store elements in the first page */
812     if (likely(elems)) {
813         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
814                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
815     }
816 
817     /* Load/store elements in the second page */
818     if (unlikely(env->vstart < evl)) {
819         /* Cross page element */
820         if (unlikely(page_split % esz)) {
821             addr = base + (env->vstart << log2_esz);
822             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
823             env->vstart++;
824         }
825 
826         addr = base + (env->vstart << log2_esz);
827         /* Get number of elements of second page */
828         elems = evl - env->vstart;
829 
830         /* Load/store elements in the second page */
831         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
832                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
833     }
834 
835     env->vstart = 0;
836 }
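
/*
 * By way of example (assuming VLEN = 128, vlenb = 16): vl2re32_v has nf = 2
 * and max_elems = 16 >> 2 = 4, so evl = 8 and the helper always transfers
 * the full 2 * 16 = 32 bytes, independent of the current vl and vtype.
 */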
837 
838 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
839 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
840                   uint32_t desc)                                    \
841 {                                                                   \
842     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
843                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
844 }
845 
846 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
847 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
848 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
849 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
850 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
851 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
852 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
853 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
854 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
855 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
856 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
857 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
858 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
859 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
860 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
861 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
862 
863 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
864 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
865                   uint32_t desc)                                        \
866 {                                                                       \
867     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
868                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
869 }
870 
871 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
872 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
873 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
874 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
875 
876 /*
877  * Vector Integer Arithmetic Instructions
878  */
879 
880 /* (TD, T1, T2, TX1, TX2) */
881 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
882 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
883 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
884 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
885 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
886 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
887 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
888 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
889 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
890 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
891 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
892 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
893 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
894 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
895 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
896 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
897 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
898 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
899 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
900 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
901 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
902 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
903 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
904 
905 #define DO_SUB(N, M) (N - M)
906 #define DO_RSUB(N, M) (M - N)
907 
908 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
909 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
910 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
911 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
912 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
913 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
914 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
915 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
916 
917 GEN_VEXT_VV(vadd_vv_b, 1)
918 GEN_VEXT_VV(vadd_vv_h, 2)
919 GEN_VEXT_VV(vadd_vv_w, 4)
920 GEN_VEXT_VV(vadd_vv_d, 8)
921 GEN_VEXT_VV(vsub_vv_b, 1)
922 GEN_VEXT_VV(vsub_vv_h, 2)
923 GEN_VEXT_VV(vsub_vv_w, 4)
924 GEN_VEXT_VV(vsub_vv_d, 8)
925 
926 
927 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
928 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
929 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
930 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
931 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
932 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
933 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
934 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
935 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
936 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
937 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
938 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
939 
940 GEN_VEXT_VX(vadd_vx_b, 1)
941 GEN_VEXT_VX(vadd_vx_h, 2)
942 GEN_VEXT_VX(vadd_vx_w, 4)
943 GEN_VEXT_VX(vadd_vx_d, 8)
944 GEN_VEXT_VX(vsub_vx_b, 1)
945 GEN_VEXT_VX(vsub_vx_h, 2)
946 GEN_VEXT_VX(vsub_vx_w, 4)
947 GEN_VEXT_VX(vsub_vx_d, 8)
948 GEN_VEXT_VX(vrsub_vx_b, 1)
949 GEN_VEXT_VX(vrsub_vx_h, 2)
950 GEN_VEXT_VX(vrsub_vx_w, 4)
951 GEN_VEXT_VX(vrsub_vx_d, 8)
952 
953 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
954 {
955     intptr_t oprsz = simd_oprsz(desc);
956     intptr_t i;
957 
958     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
959         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
960     }
961 }
962 
963 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
964 {
965     intptr_t oprsz = simd_oprsz(desc);
966     intptr_t i;
967 
968     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
969         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
970     }
971 }
972 
973 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
974 {
975     intptr_t oprsz = simd_oprsz(desc);
976     intptr_t i;
977 
978     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
979         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
980     }
981 }
982 
983 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
984 {
985     intptr_t oprsz = simd_oprsz(desc);
986     intptr_t i;
987 
988     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
989         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
990     }
991 }
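
/*
 * These gvec-style helpers compute d[i] = b - a[i] over the whole operand,
 * i.e. the reversed subtraction used by vrsub; for instance, with b = 10
 * and a[i] = 3 the result element is 7.
 */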
992 
993 /* Vector Widening Integer Add/Subtract */
994 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
995 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
996 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
997 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
998 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
999 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
1000 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1001 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1002 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1003 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1004 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1005 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1006 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1009 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1012 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1015 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1018 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1019 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1020 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1021 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1022 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1023 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1024 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1025 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1026 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1027 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1028 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1029 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1030 GEN_VEXT_VV(vwaddu_vv_b, 2)
1031 GEN_VEXT_VV(vwaddu_vv_h, 4)
1032 GEN_VEXT_VV(vwaddu_vv_w, 8)
1033 GEN_VEXT_VV(vwsubu_vv_b, 2)
1034 GEN_VEXT_VV(vwsubu_vv_h, 4)
1035 GEN_VEXT_VV(vwsubu_vv_w, 8)
1036 GEN_VEXT_VV(vwadd_vv_b, 2)
1037 GEN_VEXT_VV(vwadd_vv_h, 4)
1038 GEN_VEXT_VV(vwadd_vv_w, 8)
1039 GEN_VEXT_VV(vwsub_vv_b, 2)
1040 GEN_VEXT_VV(vwsub_vv_h, 4)
1041 GEN_VEXT_VV(vwsub_vv_w, 8)
1042 GEN_VEXT_VV(vwaddu_wv_b, 2)
1043 GEN_VEXT_VV(vwaddu_wv_h, 4)
1044 GEN_VEXT_VV(vwaddu_wv_w, 8)
1045 GEN_VEXT_VV(vwsubu_wv_b, 2)
1046 GEN_VEXT_VV(vwsubu_wv_h, 4)
1047 GEN_VEXT_VV(vwsubu_wv_w, 8)
1048 GEN_VEXT_VV(vwadd_wv_b, 2)
1049 GEN_VEXT_VV(vwadd_wv_h, 4)
1050 GEN_VEXT_VV(vwadd_wv_w, 8)
1051 GEN_VEXT_VV(vwsub_wv_b, 2)
1052 GEN_VEXT_VV(vwsub_wv_h, 4)
1053 GEN_VEXT_VV(vwsub_wv_w, 8)
1054 
1055 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1058 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1061 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1064 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1067 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1068 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1069 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1070 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1071 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1072 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1073 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1074 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1075 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1076 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1077 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1078 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1079 GEN_VEXT_VX(vwaddu_vx_b, 2)
1080 GEN_VEXT_VX(vwaddu_vx_h, 4)
1081 GEN_VEXT_VX(vwaddu_vx_w, 8)
1082 GEN_VEXT_VX(vwsubu_vx_b, 2)
1083 GEN_VEXT_VX(vwsubu_vx_h, 4)
1084 GEN_VEXT_VX(vwsubu_vx_w, 8)
1085 GEN_VEXT_VX(vwadd_vx_b, 2)
1086 GEN_VEXT_VX(vwadd_vx_h, 4)
1087 GEN_VEXT_VX(vwadd_vx_w, 8)
1088 GEN_VEXT_VX(vwsub_vx_b, 2)
1089 GEN_VEXT_VX(vwsub_vx_h, 4)
1090 GEN_VEXT_VX(vwsub_vx_w, 8)
1091 GEN_VEXT_VX(vwaddu_wx_b, 2)
1092 GEN_VEXT_VX(vwaddu_wx_h, 4)
1093 GEN_VEXT_VX(vwaddu_wx_w, 8)
1094 GEN_VEXT_VX(vwsubu_wx_b, 2)
1095 GEN_VEXT_VX(vwsubu_wx_h, 4)
1096 GEN_VEXT_VX(vwsubu_wx_w, 8)
1097 GEN_VEXT_VX(vwadd_wx_b, 2)
1098 GEN_VEXT_VX(vwadd_wx_h, 4)
1099 GEN_VEXT_VX(vwadd_wx_w, 8)
1100 GEN_VEXT_VX(vwsub_wx_b, 2)
1101 GEN_VEXT_VX(vwsub_wx_h, 4)
1102 GEN_VEXT_VX(vwsub_wx_w, 8)
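
/*
 * Numeric illustration: vwadd_vv_b adds two int8_t sources into an int16_t
 * destination, so 100 + 100 yields 200 without wrapping; the "w" forms
 * (.wv/.wx) instead take an already-widened 2*SEW vs2 operand and widen
 * only vs1 (or the scalar).
 */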
1103 
1104 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1105 #define DO_VADC(N, M, C) (N + M + C)
1106 #define DO_VSBC(N, M, C) (N - M - C)
1107 
1108 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1109 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1110                   CPURISCVState *env, uint32_t desc)          \
1111 {                                                             \
1112     uint32_t vl = env->vl;                                    \
1113     uint32_t esz = sizeof(ETYPE);                             \
1114     uint32_t total_elems =                                    \
1115         vext_get_total_elems(env, desc, esz);                 \
1116     uint32_t vta = vext_vta(desc);                            \
1117     uint32_t i;                                               \
1118                                                               \
1119     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1120                                                               \
1121     for (i = env->vstart; i < vl; i++) {                      \
1122         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1123         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1124         ETYPE carry = vext_elem_mask(v0, i);                  \
1125                                                               \
1126         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1127     }                                                         \
1128     env->vstart = 0;                                          \
1129     /* set tail elements to 1s */                             \
1130     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1131 }
1132 
1133 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1134 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1135 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1136 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1137 
1138 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1139 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1140 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1141 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1142 
1143 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1144 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1145                   CPURISCVState *env, uint32_t desc)                     \
1146 {                                                                        \
1147     uint32_t vl = env->vl;                                               \
1148     uint32_t esz = sizeof(ETYPE);                                        \
1149     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1150     uint32_t vta = vext_vta(desc);                                       \
1151     uint32_t i;                                                          \
1152                                                                          \
1153     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1154                                                                          \
1155     for (i = env->vstart; i < vl; i++) {                                 \
1156         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1157         ETYPE carry = vext_elem_mask(v0, i);                             \
1158                                                                          \
1159         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1160     }                                                                    \
1161     env->vstart = 0;                                                     \
1162     /* set tail elements to 1s */                                        \
1163     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1164 }
1165 
1166 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1167 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1168 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1169 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1170 
1171 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1172 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1173 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1174 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1175 
1176 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1177                           (__typeof(N))(N + M) < N)
1178 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
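
/*
 * DO_MADC relies on unsigned wrap-around to detect the carry-out: for
 * uint8_t operands, 200 + 100 wraps to 44 and 44 < 200 signals a carry;
 * with an incoming carry the comparison uses <=, so e.g.
 * 255 + 0 + 1 == 0 <= 255 is flagged as well.  DO_MSBC likewise reports
 * the borrow of N - M - C.
 */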
1179 
1180 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1181 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1182                   CPURISCVState *env, uint32_t desc)          \
1183 {                                                             \
1184     uint32_t vl = env->vl;                                    \
1185     uint32_t vm = vext_vm(desc);                              \
1186     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1187     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1188     uint32_t i;                                               \
1189                                                               \
1190     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1191                                                               \
1192     for (i = env->vstart; i < vl; i++) {                      \
1193         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1194         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1195         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1196         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1197     }                                                         \
1198     env->vstart = 0;                                          \
1199     /*
1200      * The mask destination register is always tail-agnostic:
1201      * set tail elements to 1s
1202      */                                                       \
1203     if (vta_all_1s) {                                         \
1204         for (; i < total_elems; i++) {                        \
1205             vext_set_elem_mask(vd, i, 1);                     \
1206         }                                                     \
1207     }                                                         \
1208 }
1209 
1210 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1211 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1212 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1213 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1214 
1215 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1216 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1217 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1218 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1219 
1220 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1221 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1222                   void *vs2, CPURISCVState *env, uint32_t desc) \
1223 {                                                               \
1224     uint32_t vl = env->vl;                                      \
1225     uint32_t vm = vext_vm(desc);                                \
1226     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1227     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1228     uint32_t i;                                                 \
1229                                                                 \
1230     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1231                                                                 \
1232     for (i = env->vstart; i < vl; i++) {                        \
1233         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1234         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1235         vext_set_elem_mask(vd, i,                               \
1236                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1237     }                                                           \
1238     env->vstart = 0;                                            \
1239     /*
1240      * The mask destination register is always tail-agnostic:
1241      * set tail elements to 1s
1242      */                                                         \
1243     if (vta_all_1s) {                                           \
1244         for (; i < total_elems; i++) {                          \
1245             vext_set_elem_mask(vd, i, 1);                       \
1246         }                                                       \
1247     }                                                           \
1248 }
1249 
1250 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1251 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1252 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1253 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1254 
1255 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1256 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1257 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1258 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1259 
1260 /* Vector Bitwise Logical Instructions */
1261 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1262 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1263 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1264 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1265 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1266 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1267 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1268 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1269 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1270 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1271 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1272 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1273 GEN_VEXT_VV(vand_vv_b, 1)
1274 GEN_VEXT_VV(vand_vv_h, 2)
1275 GEN_VEXT_VV(vand_vv_w, 4)
1276 GEN_VEXT_VV(vand_vv_d, 8)
1277 GEN_VEXT_VV(vor_vv_b, 1)
1278 GEN_VEXT_VV(vor_vv_h, 2)
1279 GEN_VEXT_VV(vor_vv_w, 4)
1280 GEN_VEXT_VV(vor_vv_d, 8)
1281 GEN_VEXT_VV(vxor_vv_b, 1)
1282 GEN_VEXT_VV(vxor_vv_h, 2)
1283 GEN_VEXT_VV(vxor_vv_w, 4)
1284 GEN_VEXT_VV(vxor_vv_d, 8)
1285 
1286 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1287 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1288 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1289 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1290 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1291 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1292 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1293 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1294 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1295 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1296 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1297 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1298 GEN_VEXT_VX(vand_vx_b, 1)
1299 GEN_VEXT_VX(vand_vx_h, 2)
1300 GEN_VEXT_VX(vand_vx_w, 4)
1301 GEN_VEXT_VX(vand_vx_d, 8)
1302 GEN_VEXT_VX(vor_vx_b, 1)
1303 GEN_VEXT_VX(vor_vx_h, 2)
1304 GEN_VEXT_VX(vor_vx_w, 4)
1305 GEN_VEXT_VX(vor_vx_d, 8)
1306 GEN_VEXT_VX(vxor_vx_b, 1)
1307 GEN_VEXT_VX(vxor_vx_h, 2)
1308 GEN_VEXT_VX(vxor_vx_w, 4)
1309 GEN_VEXT_VX(vxor_vx_d, 8)
1310 
1311 /* Vector Single-Width Bit Shift Instructions */
1312 #define DO_SLL(N, M)  (N << (M))
1313 #define DO_SRL(N, M)  (N >> (M))
1314 
1315 /* generate the helpers for shift instructions with two vector operands */
1316 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1317 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1318                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1319 {                                                                         \
1320     uint32_t vm = vext_vm(desc);                                          \
1321     uint32_t vl = env->vl;                                                \
1322     uint32_t esz = sizeof(TS1);                                           \
1323     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1324     uint32_t vta = vext_vta(desc);                                        \
1325     uint32_t vma = vext_vma(desc);                                        \
1326     uint32_t i;                                                           \
1327                                                                           \
1328     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1329                                                                           \
1330     for (i = env->vstart; i < vl; i++) {                                  \
1331         if (!vm && !vext_elem_mask(v0, i)) {                              \
1332             /* set masked-off elements to 1s */                           \
1333             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1334             continue;                                                     \
1335         }                                                                 \
1336         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1337         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1338         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1339     }                                                                     \
1340     env->vstart = 0;                                                      \
1341     /* set tail elements to 1s */                                         \
1342     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1343 }
1344 
1345 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1346 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1347 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1348 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1349 
1350 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1351 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1352 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1353 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1354 
1355 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1356 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1357 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1358 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
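
/*
 * Illustrative sketch only: one expansion of GEN_VEXT_SHIFT_VV, here
 * for vsra_vv_b, computes roughly the following per active element
 * (H() host-endian element indexing omitted).  TS2 is signed so that
 * ">>" is an arithmetic shift, and MASK = 0x7 limits the shift amount
 * to log2(SEW) bits:
 *
 *     uint8_t s1 = ((uint8_t *)vs1)[i];
 *     int8_t  s2 = ((int8_t *)vs2)[i];
 *     ((uint8_t *)vd)[i] = (uint8_t)(s2 >> (s1 & 0x7));
 */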
1359 
1360 /*
1361  * generate the helpers for shift instructions with one vector and one scalar
1362  */
1363 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1364 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1365                   void *vs2, CPURISCVState *env,            \
1366                   uint32_t desc)                            \
1367 {                                                           \
1368     uint32_t vm = vext_vm(desc);                            \
1369     uint32_t vl = env->vl;                                  \
1370     uint32_t esz = sizeof(TD);                              \
1371     uint32_t total_elems =                                  \
1372         vext_get_total_elems(env, desc, esz);               \
1373     uint32_t vta = vext_vta(desc);                          \
1374     uint32_t vma = vext_vma(desc);                          \
1375     uint32_t i;                                             \
1376                                                             \
1377     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1378                                                             \
1379     for (i = env->vstart; i < vl; i++) {                    \
1380         if (!vm && !vext_elem_mask(v0, i)) {                \
1381             /* set masked-off elements to 1s */             \
1382             vext_set_elems_1s(vd, vma, i * esz,             \
1383                               (i + 1) * esz);               \
1384             continue;                                       \
1385         }                                                   \
1386         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1387         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1388     }                                                       \
1389     env->vstart = 0;                                        \
1390     /* set tail elements to 1s */                           \
1391     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1392 }
1393 
1394 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1395 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1396 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1397 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1398 
1399 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1400 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1401 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1402 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1403 
1404 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1405 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1406 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1407 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1408 
1409 /* Vector Narrowing Integer Right Shift Instructions */
1410 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1411 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1412 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1413 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1414 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1415 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1416 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1417 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1418 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1419 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1420 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1421 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
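
/*
 * The narrowing right shifts reuse the same macros with a source type
 * twice as wide as the destination and a shift-amount mask of
 * 2 * SEW - 1, e.g. vnsrl.wv with a byte destination shifts a 16-bit
 * source element by s1 & 0xf and truncates to 8 bits on the store.
 */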
1422 
1423 /* Vector Integer Comparison Instructions */
1424 #define DO_MSEQ(N, M) (N == M)
1425 #define DO_MSNE(N, M) (N != M)
1426 #define DO_MSLT(N, M) (N < M)
1427 #define DO_MSLE(N, M) (N <= M)
1428 #define DO_MSGT(N, M) (N > M)
1429 
1430 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1431 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1432                   CPURISCVState *env, uint32_t desc)          \
1433 {                                                             \
1434     uint32_t vm = vext_vm(desc);                              \
1435     uint32_t vl = env->vl;                                    \
1436     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1437     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1438     uint32_t vma = vext_vma(desc);                            \
1439     uint32_t i;                                               \
1440                                                               \
1441     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1442                                                               \
1443     for (i = env->vstart; i < vl; i++) {                      \
1444         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1445         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1446         if (!vm && !vext_elem_mask(v0, i)) {                  \
1447             /* set masked-off elements to 1s */               \
1448             if (vma) {                                        \
1449                 vext_set_elem_mask(vd, i, 1);                 \
1450             }                                                 \
1451             continue;                                         \
1452         }                                                     \
1453         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1454     }                                                         \
1455     env->vstart = 0;                                          \
1456     /*
1457      * mask destination registers are always tail-agnostic
1458      * set tail elements to 1s
1459      */                                                       \
1460     if (vta_all_1s) {                                         \
1461         for (; i < total_elems; i++) {                        \
1462             vext_set_elem_mask(vd, i, 1);                     \
1463         }                                                     \
1464     }                                                         \
1465 }
1466 
1467 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1468 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1469 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1470 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1471 
1472 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1473 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1474 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1475 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1476 
1477 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1478 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1479 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1480 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1481 
1482 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1483 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1484 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1485 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1486 
1487 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1488 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1489 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1490 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1491 
1492 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1493 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1494 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1495 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1496 
1497 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1498 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1499                   CPURISCVState *env, uint32_t desc)                \
1500 {                                                                   \
1501     uint32_t vm = vext_vm(desc);                                    \
1502     uint32_t vl = env->vl;                                          \
1503     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1504     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1505     uint32_t vma = vext_vma(desc);                                  \
1506     uint32_t i;                                                     \
1507                                                                     \
1508     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1509                                                                     \
1510     for (i = env->vstart; i < vl; i++) {                            \
1511         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1512         if (!vm && !vext_elem_mask(v0, i)) {                        \
1513             /* set masked-off elements to 1s */                     \
1514             if (vma) {                                              \
1515                 vext_set_elem_mask(vd, i, 1);                       \
1516             }                                                       \
1517             continue;                                               \
1518         }                                                           \
1519         vext_set_elem_mask(vd, i,                                   \
1520                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1521     }                                                               \
1522     env->vstart = 0;                                                \
1523     /*
1524      * mask destination registers are always tail-agnostic
1525      * set tail elements to 1s
1526      */                                                             \
1527     if (vta_all_1s) {                                               \
1528         for (; i < total_elems; i++) {                              \
1529             vext_set_elem_mask(vd, i, 1);                           \
1530         }                                                           \
1531     }                                                               \
1532 }
1533 
1534 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1535 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1536 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1537 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1538 
1539 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1540 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1541 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1542 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1543 
1544 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1545 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1546 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1547 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1548 
1549 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1550 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1551 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1552 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1553 
1554 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1555 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1556 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1557 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1558 
1559 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1560 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1561 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1562 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1563 
1564 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1565 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1566 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1567 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1568 
1569 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1570 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1571 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1572 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1573 
1574 /* Vector Integer Min/Max Instructions */
1575 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1576 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1577 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1578 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1579 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1580 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1581 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1582 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1583 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1584 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1585 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1586 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1587 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1588 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1589 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1590 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1591 GEN_VEXT_VV(vminu_vv_b, 1)
1592 GEN_VEXT_VV(vminu_vv_h, 2)
1593 GEN_VEXT_VV(vminu_vv_w, 4)
1594 GEN_VEXT_VV(vminu_vv_d, 8)
1595 GEN_VEXT_VV(vmin_vv_b, 1)
1596 GEN_VEXT_VV(vmin_vv_h, 2)
1597 GEN_VEXT_VV(vmin_vv_w, 4)
1598 GEN_VEXT_VV(vmin_vv_d, 8)
1599 GEN_VEXT_VV(vmaxu_vv_b, 1)
1600 GEN_VEXT_VV(vmaxu_vv_h, 2)
1601 GEN_VEXT_VV(vmaxu_vv_w, 4)
1602 GEN_VEXT_VV(vmaxu_vv_d, 8)
1603 GEN_VEXT_VV(vmax_vv_b, 1)
1604 GEN_VEXT_VV(vmax_vv_h, 2)
1605 GEN_VEXT_VV(vmax_vv_w, 4)
1606 GEN_VEXT_VV(vmax_vv_d, 8)
1607 
1608 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1609 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1610 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1611 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1612 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1613 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1614 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1615 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1616 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1617 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1618 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1619 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1620 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1621 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1622 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1623 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1624 GEN_VEXT_VX(vminu_vx_b, 1)
1625 GEN_VEXT_VX(vminu_vx_h, 2)
1626 GEN_VEXT_VX(vminu_vx_w, 4)
1627 GEN_VEXT_VX(vminu_vx_d, 8)
1628 GEN_VEXT_VX(vmin_vx_b, 1)
1629 GEN_VEXT_VX(vmin_vx_h, 2)
1630 GEN_VEXT_VX(vmin_vx_w, 4)
1631 GEN_VEXT_VX(vmin_vx_d, 8)
1632 GEN_VEXT_VX(vmaxu_vx_b, 1)
1633 GEN_VEXT_VX(vmaxu_vx_h, 2)
1634 GEN_VEXT_VX(vmaxu_vx_w, 4)
1635 GEN_VEXT_VX(vmaxu_vx_d, 8)
1636 GEN_VEXT_VX(vmax_vx_b, 1)
1637 GEN_VEXT_VX(vmax_vx_h, 2)
1638 GEN_VEXT_VX(vmax_vx_w, 4)
1639 GEN_VEXT_VX(vmax_vx_d, 8)
1640 
1641 /* Vector Single-Width Integer Multiply Instructions */
1642 #define DO_MUL(N, M) (N * M)
1643 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1644 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1645 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1646 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1647 GEN_VEXT_VV(vmul_vv_b, 1)
1648 GEN_VEXT_VV(vmul_vv_h, 2)
1649 GEN_VEXT_VV(vmul_vv_w, 4)
1650 GEN_VEXT_VV(vmul_vv_d, 8)
1651 
1652 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1653 {
1654     return (int16_t)s2 * (int16_t)s1 >> 8;
1655 }
1656 
1657 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1658 {
1659     return (int32_t)s2 * (int32_t)s1 >> 16;
1660 }
1661 
1662 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1663 {
1664     return (int64_t)s2 * (int64_t)s1 >> 32;
1665 }
1666 
1667 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1668 {
1669     uint64_t hi_64, lo_64;
1670 
1671     muls64(&lo_64, &hi_64, s1, s2);
1672     return hi_64;
1673 }
1674 
1675 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1676 {
1677     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1678 }
1679 
1680 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1681 {
1682     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1683 }
1684 
1685 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1686 {
1687     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1688 }
1689 
1690 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1691 {
1692     uint64_t hi_64, lo_64;
1693 
1694     mulu64(&lo_64, &hi_64, s2, s1);
1695     return hi_64;
1696 }
1697 
1698 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1699 {
1700     return (int16_t)s2 * (uint16_t)s1 >> 8;
1701 }
1702 
1703 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1704 {
1705     return (int32_t)s2 * (uint32_t)s1 >> 16;
1706 }
1707 
1708 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1709 {
1710     return (int64_t)s2 * (uint64_t)s1 >> 32;
1711 }
1712 
1713 /*
1714  * Let  A = signed operand (s2),
1715  *      B = unsigned operand (s1),
1716  *      P = mulu64(A, B), the full 128-bit unsigned product,
1717  *
1718  *      where mulu64 reads the bit pattern of A as the unsigned value
1719  *      A_u, and SP is the desired signed-by-unsigned product.
1720  * THEN
1721  *      IF A < 0 (so A_u = 2 ** 64 + A)
1722  *          SP = A * B
1723  *             = (A_u - 2 ** 64) * B
1724  *             = A_u * B - 2 ** 64 * B
1725  *             = P - 2 ** 64 * B
1726  *      ELSE
1727  *          SP = P
1728  * THEN
1729  *      HI_P -= (A < 0 ? B : 0)
1730  */
1731 
1732 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1733 {
1734     uint64_t hi_64, lo_64;
1735 
1736     mulu64(&lo_64, &hi_64, s2, s1);
1737 
1738     hi_64 -= s2 < 0 ? s1 : 0;
1739     return hi_64;
1740 }
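
/*
 * Worked example (8-bit analogue of the identity above, illustration
 * only): A = -2 has the bit pattern 0xfe = 254, B = 3.
 * P = 254 * 3 = 762 = 0x02fa, so HI_P = 0x02.  Since A < 0, subtract
 * B: 0x02 - 3 = 0xff, which is indeed the high byte of the true
 * signed product -6 = 0xfffa.
 */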
1741 
1742 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1743 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1744 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1745 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1746 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1747 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1748 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1749 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1750 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1751 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1752 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1753 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1754 GEN_VEXT_VV(vmulh_vv_b, 1)
1755 GEN_VEXT_VV(vmulh_vv_h, 2)
1756 GEN_VEXT_VV(vmulh_vv_w, 4)
1757 GEN_VEXT_VV(vmulh_vv_d, 8)
1758 GEN_VEXT_VV(vmulhu_vv_b, 1)
1759 GEN_VEXT_VV(vmulhu_vv_h, 2)
1760 GEN_VEXT_VV(vmulhu_vv_w, 4)
1761 GEN_VEXT_VV(vmulhu_vv_d, 8)
1762 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1763 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1764 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1765 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1766 
1767 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1768 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1769 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1770 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1771 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1772 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1773 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1774 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1775 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1776 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1777 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1778 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1779 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1780 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1781 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1782 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1783 GEN_VEXT_VX(vmul_vx_b, 1)
1784 GEN_VEXT_VX(vmul_vx_h, 2)
1785 GEN_VEXT_VX(vmul_vx_w, 4)
1786 GEN_VEXT_VX(vmul_vx_d, 8)
1787 GEN_VEXT_VX(vmulh_vx_b, 1)
1788 GEN_VEXT_VX(vmulh_vx_h, 2)
1789 GEN_VEXT_VX(vmulh_vx_w, 4)
1790 GEN_VEXT_VX(vmulh_vx_d, 8)
1791 GEN_VEXT_VX(vmulhu_vx_b, 1)
1792 GEN_VEXT_VX(vmulhu_vx_h, 2)
1793 GEN_VEXT_VX(vmulhu_vx_w, 4)
1794 GEN_VEXT_VX(vmulhu_vx_d, 8)
1795 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1796 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1797 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1798 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1799 
1800 /* Vector Integer Divide Instructions */
1801 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1802 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1803 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1804         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1805 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1806         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
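
/*
 * Division special cases follow the RISC-V rules: dividing by zero
 * yields all ones for div/divu and returns the dividend for rem/remu;
 * signed overflow (most-negative value divided by -1) yields the
 * dividend for div and 0 for rem.  "N == -N" is true only for zero and
 * for the most-negative value of the type, so combined with "M == -1"
 * it catches the overflow case before the division is evaluated.
 */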
1807 
1808 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1809 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1810 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1811 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1812 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1813 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1814 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1815 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1816 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1817 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1818 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1819 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1820 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1821 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1822 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1823 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1824 GEN_VEXT_VV(vdivu_vv_b, 1)
1825 GEN_VEXT_VV(vdivu_vv_h, 2)
1826 GEN_VEXT_VV(vdivu_vv_w, 4)
1827 GEN_VEXT_VV(vdivu_vv_d, 8)
1828 GEN_VEXT_VV(vdiv_vv_b, 1)
1829 GEN_VEXT_VV(vdiv_vv_h, 2)
1830 GEN_VEXT_VV(vdiv_vv_w, 4)
1831 GEN_VEXT_VV(vdiv_vv_d, 8)
1832 GEN_VEXT_VV(vremu_vv_b, 1)
1833 GEN_VEXT_VV(vremu_vv_h, 2)
1834 GEN_VEXT_VV(vremu_vv_w, 4)
1835 GEN_VEXT_VV(vremu_vv_d, 8)
1836 GEN_VEXT_VV(vrem_vv_b, 1)
1837 GEN_VEXT_VV(vrem_vv_h, 2)
1838 GEN_VEXT_VV(vrem_vv_w, 4)
1839 GEN_VEXT_VV(vrem_vv_d, 8)
1840 
1841 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1842 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1843 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1844 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1845 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1846 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1847 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1848 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1849 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1850 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1851 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1852 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1853 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1854 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1855 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1856 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1857 GEN_VEXT_VX(vdivu_vx_b, 1)
1858 GEN_VEXT_VX(vdivu_vx_h, 2)
1859 GEN_VEXT_VX(vdivu_vx_w, 4)
1860 GEN_VEXT_VX(vdivu_vx_d, 8)
1861 GEN_VEXT_VX(vdiv_vx_b, 1)
1862 GEN_VEXT_VX(vdiv_vx_h, 2)
1863 GEN_VEXT_VX(vdiv_vx_w, 4)
1864 GEN_VEXT_VX(vdiv_vx_d, 8)
1865 GEN_VEXT_VX(vremu_vx_b, 1)
1866 GEN_VEXT_VX(vremu_vx_h, 2)
1867 GEN_VEXT_VX(vremu_vx_w, 4)
1868 GEN_VEXT_VX(vremu_vx_d, 8)
1869 GEN_VEXT_VX(vrem_vx_b, 1)
1870 GEN_VEXT_VX(vrem_vx_h, 2)
1871 GEN_VEXT_VX(vrem_vx_w, 4)
1872 GEN_VEXT_VX(vrem_vx_d, 8)
1873 
1874 /* Vector Widening Integer Multiply Instructions */
1875 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1876 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1877 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1878 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1879 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1880 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1881 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1882 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1883 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1884 GEN_VEXT_VV(vwmul_vv_b, 2)
1885 GEN_VEXT_VV(vwmul_vv_h, 4)
1886 GEN_VEXT_VV(vwmul_vv_w, 8)
1887 GEN_VEXT_VV(vwmulu_vv_b, 2)
1888 GEN_VEXT_VV(vwmulu_vv_h, 4)
1889 GEN_VEXT_VV(vwmulu_vv_w, 8)
1890 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1891 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1892 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1893 
1894 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1895 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1896 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1897 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1898 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1899 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1900 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1901 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1902 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1903 GEN_VEXT_VX(vwmul_vx_b, 2)
1904 GEN_VEXT_VX(vwmul_vx_h, 4)
1905 GEN_VEXT_VX(vwmul_vx_w, 8)
1906 GEN_VEXT_VX(vwmulu_vx_b, 2)
1907 GEN_VEXT_VX(vwmulu_vx_h, 4)
1908 GEN_VEXT_VX(vwmulu_vx_w, 8)
1909 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1910 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1911 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1912 
1913 /* Vector Single-Width Integer Multiply-Add Instructions */
1914 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1915 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1916 {                                                                  \
1917     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1918     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1919     TD d = *((TD *)vd + HD(i));                                    \
1920     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1921 }
1922 
1923 #define DO_MACC(N, M, D) (M * N + D)
1924 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1925 #define DO_MADD(N, M, D) (M * D + N)
1926 #define DO_NMSUB(N, M, D) (-(M * D) + N)
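
/*
 * With N = vs2[i], M = vs1[i] (or the scalar) and D = vd[i], these map
 * to the instruction semantics as:
 *   vmacc:  vd[i] =  (vs1 * vs2[i]) + vd[i]
 *   vnmsac: vd[i] = -(vs1 * vs2[i]) + vd[i]
 *   vmadd:  vd[i] =  (vs1 * vd[i])  + vs2[i]
 *   vnmsub: vd[i] = -(vs1 * vd[i])  + vs2[i]
 */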
1927 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1928 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1929 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1930 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1931 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1932 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1933 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1934 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1935 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1936 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1937 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1938 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1939 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1940 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1941 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1942 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1943 GEN_VEXT_VV(vmacc_vv_b, 1)
1944 GEN_VEXT_VV(vmacc_vv_h, 2)
1945 GEN_VEXT_VV(vmacc_vv_w, 4)
1946 GEN_VEXT_VV(vmacc_vv_d, 8)
1947 GEN_VEXT_VV(vnmsac_vv_b, 1)
1948 GEN_VEXT_VV(vnmsac_vv_h, 2)
1949 GEN_VEXT_VV(vnmsac_vv_w, 4)
1950 GEN_VEXT_VV(vnmsac_vv_d, 8)
1951 GEN_VEXT_VV(vmadd_vv_b, 1)
1952 GEN_VEXT_VV(vmadd_vv_h, 2)
1953 GEN_VEXT_VV(vmadd_vv_w, 4)
1954 GEN_VEXT_VV(vmadd_vv_d, 8)
1955 GEN_VEXT_VV(vnmsub_vv_b, 1)
1956 GEN_VEXT_VV(vnmsub_vv_h, 2)
1957 GEN_VEXT_VV(vnmsub_vv_w, 4)
1958 GEN_VEXT_VV(vnmsub_vv_d, 8)
1959 
1960 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1961 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1962 {                                                                   \
1963     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1964     TD d = *((TD *)vd + HD(i));                                     \
1965     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1966 }
1967 
1968 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1969 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1970 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1971 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1972 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1973 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1974 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1975 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1976 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1977 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1978 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1979 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1980 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1981 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1982 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1983 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1984 GEN_VEXT_VX(vmacc_vx_b, 1)
1985 GEN_VEXT_VX(vmacc_vx_h, 2)
1986 GEN_VEXT_VX(vmacc_vx_w, 4)
1987 GEN_VEXT_VX(vmacc_vx_d, 8)
1988 GEN_VEXT_VX(vnmsac_vx_b, 1)
1989 GEN_VEXT_VX(vnmsac_vx_h, 2)
1990 GEN_VEXT_VX(vnmsac_vx_w, 4)
1991 GEN_VEXT_VX(vnmsac_vx_d, 8)
1992 GEN_VEXT_VX(vmadd_vx_b, 1)
1993 GEN_VEXT_VX(vmadd_vx_h, 2)
1994 GEN_VEXT_VX(vmadd_vx_w, 4)
1995 GEN_VEXT_VX(vmadd_vx_d, 8)
1996 GEN_VEXT_VX(vnmsub_vx_b, 1)
1997 GEN_VEXT_VX(vnmsub_vx_h, 2)
1998 GEN_VEXT_VX(vnmsub_vx_w, 4)
1999 GEN_VEXT_VX(vnmsub_vx_d, 8)
2000 
2001 /* Vector Widening Integer Multiply-Add Instructions */
2002 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2003 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2004 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2005 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2006 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2007 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2008 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2009 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2010 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2011 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2012 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2013 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2014 GEN_VEXT_VV(vwmacc_vv_b, 2)
2015 GEN_VEXT_VV(vwmacc_vv_h, 4)
2016 GEN_VEXT_VV(vwmacc_vv_w, 8)
2017 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2018 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2019 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2020 
2021 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2022 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2023 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2024 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2025 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2026 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2027 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2028 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2029 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2030 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2031 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2032 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2033 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2034 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2035 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2036 GEN_VEXT_VX(vwmacc_vx_b, 2)
2037 GEN_VEXT_VX(vwmacc_vx_h, 4)
2038 GEN_VEXT_VX(vwmacc_vx_w, 8)
2039 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2040 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2041 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2042 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2043 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2044 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2045 
2046 /* Vector Integer Merge and Move Instructions */
2047 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2048 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2049                   uint32_t desc)                                     \
2050 {                                                                    \
2051     uint32_t vl = env->vl;                                           \
2052     uint32_t esz = sizeof(ETYPE);                                    \
2053     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2054     uint32_t vta = vext_vta(desc);                                   \
2055     uint32_t i;                                                      \
2056                                                                      \
2057     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2058                                                                      \
2059     for (i = env->vstart; i < vl; i++) {                             \
2060         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2061         *((ETYPE *)vd + H(i)) = s1;                                  \
2062     }                                                                \
2063     env->vstart = 0;                                                 \
2064     /* set tail elements to 1s */                                    \
2065     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2066 }
2067 
2068 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2069 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2070 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2071 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2072 
2073 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2074 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2075                   uint32_t desc)                                     \
2076 {                                                                    \
2077     uint32_t vl = env->vl;                                           \
2078     uint32_t esz = sizeof(ETYPE);                                    \
2079     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2080     uint32_t vta = vext_vta(desc);                                   \
2081     uint32_t i;                                                      \
2082                                                                      \
2083     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2084                                                                      \
2085     for (i = env->vstart; i < vl; i++) {                             \
2086         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2087     }                                                                \
2088     env->vstart = 0;                                                 \
2089     /* set tail elements to 1s */                                    \
2090     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2091 }
2092 
2093 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2094 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2095 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2096 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2097 
2098 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2099 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2100                   CPURISCVState *env, uint32_t desc)                 \
2101 {                                                                    \
2102     uint32_t vl = env->vl;                                           \
2103     uint32_t esz = sizeof(ETYPE);                                    \
2104     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2105     uint32_t vta = vext_vta(desc);                                   \
2106     uint32_t i;                                                      \
2107                                                                      \
2108     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2109                                                                      \
2110     for (i = env->vstart; i < vl; i++) {                             \
2111         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2112         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2113     }                                                                \
2114     env->vstart = 0;                                                 \
2115     /* set tail elements to 1s */                                    \
2116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2117 }
2118 
2119 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2120 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2121 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2122 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2123 
2124 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2125 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2126                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2127 {                                                                    \
2128     uint32_t vl = env->vl;                                           \
2129     uint32_t esz = sizeof(ETYPE);                                    \
2130     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2131     uint32_t vta = vext_vta(desc);                                   \
2132     uint32_t i;                                                      \
2133                                                                      \
2134     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2135                                                                      \
2136     for (i = env->vstart; i < vl; i++) {                             \
2137         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2138         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2139                    (ETYPE)(target_long)s1);                          \
2140         *((ETYPE *)vd + H(i)) = d;                                   \
2141     }                                                                \
2142     env->vstart = 0;                                                 \
2143     /* set tail elements to 1s */                                    \
2144     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2145 }
2146 
2147 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2148 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2149 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2150 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2151 
2152 /*
2153  * Vector Fixed-Point Arithmetic Instructions
2154  */
2155 
2156 /* Vector Single-Width Saturating Add and Subtract */
2157 
2158 /*
2159  * Fixed-point instructions use a rounding mode and may saturate, so
2160  * define the common fixed-point macros here.
2161  */
2162 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2163                           CPURISCVState *env, int vxrm);
2164 
2165 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2166 static inline void                                                  \
2167 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2168           CPURISCVState *env, int vxrm)                             \
2169 {                                                                   \
2170     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2171     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2172     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2173 }
2174 
2175 static inline void
2176 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2177              CPURISCVState *env,
2178              uint32_t vl, uint32_t vm, int vxrm,
2179              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2180 {
2181     for (uint32_t i = env->vstart; i < vl; i++) {
2182         if (!vm && !vext_elem_mask(v0, i)) {
2183             /* set masked-off elements to 1s */
2184             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2185             continue;
2186         }
2187         fn(vd, vs1, vs2, i, env, vxrm);
2188     }
2189     env->vstart = 0;
2190 }
2191 
2192 static inline void
2193 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2194              CPURISCVState *env,
2195              uint32_t desc,
2196              opivv2_rm_fn *fn, uint32_t esz)
2197 {
2198     uint32_t vm = vext_vm(desc);
2199     uint32_t vl = env->vl;
2200     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2201     uint32_t vta = vext_vta(desc);
2202     uint32_t vma = vext_vma(desc);
2203 
2204     VSTART_CHECK_EARLY_EXIT(env, vl);
2205 
2206     switch (env->vxrm) {
2207     case 0: /* rnu */
2208         vext_vv_rm_1(vd, v0, vs1, vs2,
2209                      env, vl, vm, 0, fn, vma, esz);
2210         break;
2211     case 1: /* rne */
2212         vext_vv_rm_1(vd, v0, vs1, vs2,
2213                      env, vl, vm, 1, fn, vma, esz);
2214         break;
2215     case 2: /* rdn */
2216         vext_vv_rm_1(vd, v0, vs1, vs2,
2217                      env, vl, vm, 2, fn, vma, esz);
2218         break;
2219     default: /* rod */
2220         vext_vv_rm_1(vd, v0, vs1, vs2,
2221                      env, vl, vm, 3, fn, vma, esz);
2222         break;
2223     }
2224     /* set tail elements to 1s */
2225     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2226 }
2227 
2228 /* generate helpers for fixed point instructions with OPIVV format */
2229 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2230 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2231                   CPURISCVState *env, uint32_t desc)            \
2232 {                                                               \
2233     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2234                  do_##NAME, ESZ);                               \
2235 }
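
/*
 * Rough sketch of the generated call chain (illustration only), e.g.
 * for vsaddu.vv with SEW=8:
 *
 *   HELPER(vsaddu_vv_b)
 *     -> vext_vv_rm_2(..., do_vsaddu_vv_b, 1)   resolve vxrm once
 *       -> vext_vv_rm_1(..., vxrm, ...)         loop over the elements
 *         -> do_vsaddu_vv_b(vd, vs1, vs2, i, env, vxrm)
 *           -> saddu8(env, vxrm, s2, s1)        per-element operation
 *
 * The switch in vext_vv_rm_2 passes the rounding mode as a literal,
 * presumably so the compiler can specialize the inlined per-element op
 * for each mode.
 */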
2236 
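/*
 * Unsigned saturating add: unsigned addition wrapped around iff the
 * result is smaller than an operand, so "res < a" detects overflow;
 * saturate to the type's maximum and record it in vxsat.
 */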
2237 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2238                              uint8_t b)
2239 {
2240     uint8_t res = a + b;
2241     if (res < a) {
2242         res = UINT8_MAX;
2243         env->vxsat = 0x1;
2244     }
2245     return res;
2246 }
2247 
2248 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2249                                uint16_t b)
2250 {
2251     uint16_t res = a + b;
2252     if (res < a) {
2253         res = UINT16_MAX;
2254         env->vxsat = 0x1;
2255     }
2256     return res;
2257 }
2258 
2259 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2260                                uint32_t b)
2261 {
2262     uint32_t res = a + b;
2263     if (res < a) {
2264         res = UINT32_MAX;
2265         env->vxsat = 0x1;
2266     }
2267     return res;
2268 }
2269 
2270 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2271                                uint64_t b)
2272 {
2273     uint64_t res = a + b;
2274     if (res < a) {
2275         res = UINT64_MAX;
2276         env->vxsat = 0x1;
2277     }
2278     return res;
2279 }
2280 
2281 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2282 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2283 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2284 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2285 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2286 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2287 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2288 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2289 
2290 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2291                           CPURISCVState *env, int vxrm);
2292 
2293 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2294 static inline void                                                  \
2295 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2296           CPURISCVState *env, int vxrm)                             \
2297 {                                                                   \
2298     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2299     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2300 }
2301 
2302 static inline void
2303 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2304              CPURISCVState *env,
2305              uint32_t vl, uint32_t vm, int vxrm,
2306              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2307 {
2308     for (uint32_t i = env->vstart; i < vl; i++) {
2309         if (!vm && !vext_elem_mask(v0, i)) {
2310             /* set masked-off elements to 1s */
2311             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2312             continue;
2313         }
2314         fn(vd, s1, vs2, i, env, vxrm);
2315     }
2316     env->vstart = 0;
2317 }
2318 
2319 static inline void
2320 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2321              CPURISCVState *env,
2322              uint32_t desc,
2323              opivx2_rm_fn *fn, uint32_t esz)
2324 {
2325     uint32_t vm = vext_vm(desc);
2326     uint32_t vl = env->vl;
2327     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2328     uint32_t vta = vext_vta(desc);
2329     uint32_t vma = vext_vma(desc);
2330 
2331     VSTART_CHECK_EARLY_EXIT(env, vl);
2332 
2333     switch (env->vxrm) {
2334     case 0: /* rnu */
2335         vext_vx_rm_1(vd, v0, s1, vs2,
2336                      env, vl, vm, 0, fn, vma, esz);
2337         break;
2338     case 1: /* rne */
2339         vext_vx_rm_1(vd, v0, s1, vs2,
2340                      env, vl, vm, 1, fn, vma, esz);
2341         break;
2342     case 2: /* rdn */
2343         vext_vx_rm_1(vd, v0, s1, vs2,
2344                      env, vl, vm, 2, fn, vma, esz);
2345         break;
2346     default: /* rod */
2347         vext_vx_rm_1(vd, v0, s1, vs2,
2348                      env, vl, vm, 3, fn, vma, esz);
2349         break;
2350     }
2351     /* set tail elements to 1s */
2352     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2353 }
2354 
2355 /* generate helpers for fixed point instructions with OPIVX format */
2356 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2357 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2358                   void *vs2, CPURISCVState *env,          \
2359                   uint32_t desc)                          \
2360 {                                                         \
2361     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2362                  do_##NAME, ESZ);                         \
2363 }
2364 
2365 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2366 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2367 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2368 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2369 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2370 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2371 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2372 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2373 
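/*
 * Signed saturating add: overflow is only possible when both operands
 * have the same sign and the result's sign differs from theirs, which
 * is what the sign bit of (res ^ a) & (res ^ b) flags.  Saturate
 * towards the sign of the operands.
 */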
2374 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2375 {
2376     int8_t res = a + b;
2377     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2378         res = a > 0 ? INT8_MAX : INT8_MIN;
2379         env->vxsat = 0x1;
2380     }
2381     return res;
2382 }
2383 
2384 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2385                              int16_t b)
2386 {
2387     int16_t res = a + b;
2388     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2389         res = a > 0 ? INT16_MAX : INT16_MIN;
2390         env->vxsat = 0x1;
2391     }
2392     return res;
2393 }
2394 
2395 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2396                              int32_t b)
2397 {
2398     int32_t res = a + b;
2399     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2400         res = a > 0 ? INT32_MAX : INT32_MIN;
2401         env->vxsat = 0x1;
2402     }
2403     return res;
2404 }
2405 
2406 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2407                              int64_t b)
2408 {
2409     int64_t res = a + b;
2410     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2411         res = a > 0 ? INT64_MAX : INT64_MIN;
2412         env->vxsat = 0x1;
2413     }
2414     return res;
2415 }
2416 
2417 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2418 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2419 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2420 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2421 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2422 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2423 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2424 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2425 
2426 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2427 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2428 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2429 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2430 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2431 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2432 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2433 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2434 
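/*
 * Unsigned saturating subtract: the subtraction borrowed iff the result
 * is larger than the minuend, so "res > a" detects it; clamp to 0.
 */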
2435 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2436                              uint8_t b)
2437 {
2438     uint8_t res = a - b;
2439     if (res > a) {
2440         res = 0;
2441         env->vxsat = 0x1;
2442     }
2443     return res;
2444 }
2445 
2446 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2447                                uint16_t b)
2448 {
2449     uint16_t res = a - b;
2450     if (res > a) {
2451         res = 0;
2452         env->vxsat = 0x1;
2453     }
2454     return res;
2455 }
2456 
2457 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2458                                uint32_t b)
2459 {
2460     uint32_t res = a - b;
2461     if (res > a) {
2462         res = 0;
2463         env->vxsat = 0x1;
2464     }
2465     return res;
2466 }
2467 
2468 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2469                                uint64_t b)
2470 {
2471     uint64_t res = a - b;
2472     if (res > a) {
2473         res = 0;
2474         env->vxsat = 0x1;
2475     }
2476     return res;
2477 }
2478 
2479 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2480 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2481 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2482 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2483 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2484 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2485 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2486 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2487 
2488 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2489 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2490 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2491 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2492 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2493 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2494 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2495 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2496 
2497 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2498 {
2499     int8_t res = a - b;
2500     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2501         res = a >= 0 ? INT8_MAX : INT8_MIN;
2502         env->vxsat = 0x1;
2503     }
2504     return res;
2505 }
2506 
2507 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2508                              int16_t b)
2509 {
2510     int16_t res = a - b;
2511     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2512         res = a >= 0 ? INT16_MAX : INT16_MIN;
2513         env->vxsat = 0x1;
2514     }
2515     return res;
2516 }
2517 
2518 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2519                              int32_t b)
2520 {
2521     int32_t res = a - b;
2522     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2523         res = a >= 0 ? INT32_MAX : INT32_MIN;
2524         env->vxsat = 0x1;
2525     }
2526     return res;
2527 }
2528 
2529 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2530                              int64_t b)
2531 {
2532     int64_t res = a - b;
2533     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2534         res = a >= 0 ? INT64_MAX : INT64_MIN;
2535         env->vxsat = 0x1;
2536     }
2537     return res;
2538 }
2539 
2540 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2541 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2542 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2543 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2544 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2545 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2546 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2547 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2548 
2549 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2550 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2551 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2552 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2553 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2554 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2555 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2556 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2557 
2558 /* Vector Single-Width Averaging Add and Subtract */
2559 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2560 {
2561     uint8_t d = extract64(v, shift, 1);
2562     uint8_t d1;
2563     uint64_t D1, D2;
2564 
2565     if (shift == 0 || shift > 64) {
2566         return 0;
2567     }
2568 
2569     d1 = extract64(v, shift - 1, 1);
2570     D1 = extract64(v, 0, shift);
2571     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2572         return d1;
2573     } else if (vxrm == 1) { /* round-to-nearest-even */
2574         if (shift > 1) {
2575             D2 = extract64(v, 0, shift - 1);
2576             return d1 & ((D2 != 0) | d);
2577         } else {
2578             return d1 & d;
2579         }
2580     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2581         return !d & (D1 != 0);
2582     }
2583     return 0; /* round-down (truncate) */
2584 }
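/*
 * Worked example for get_round() (illustration only): shifting v = 22
 * (0b10110) right by shift = 2 keeps 0b101 = 5 and discards D1 = 0b10:
 *   d  = bit 2 of v = 1   (LSB of the shifted result)
 *   d1 = bit 1 of v = 1   (MSB of the discarded bits)
 *   D2 = bit 0 of v = 0   (the remaining discarded bits)
 *   vxrm = 0 (rnu): round = d1 = 1                -> 5 + 1 = 6
 *   vxrm = 1 (rne): round = d1 & ((D2 != 0) | d)  -> 6 (5.5 ties to even)
 *   vxrm = 2 (rdn): round = 0                     -> 5 (truncate)
 *   vxrm = 3 (rod): round = !d & (D1 != 0) = 0    -> 5 (LSB already 1)
 */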
2585 
2586 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2587                              int32_t b)
2588 {
2589     int64_t res = (int64_t)a + b;
2590     uint8_t round = get_round(vxrm, res, 1);
2591 
2592     return (res >> 1) + round;
2593 }
2594 
2595 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2596                              int64_t b)
2597 {
2598     int64_t res = a + b;
2599     uint8_t round = get_round(vxrm, res, 1);
2600     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2601 
2602     /* With signed overflow, bit 64 of the real sum is the inverse of bit 63. */
2603     return ((res >> 1) ^ over) + round;
2604 }
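/*
 * Worked example for the 'over' correction above (illustration only):
 * with a = b = INT64_MAX the wrapped sum is res = -2 (0xffff...fffe)
 * and over = INT64_MIN, so (res >> 1) ^ over = -1 ^ INT64_MIN =
 * INT64_MAX, the true average; the rounding increment is 0 because the
 * single discarded bit (bit 0 of res) is 0.
 *
 * An equivalent reference sketch, not used here; the name
 * example_aadd64 is hypothetical and it assumes the compiler provides
 * __int128, which the helper above deliberately does not rely on:
 */
static inline int64_t example_aadd64(CPURISCVState *env, int vxrm,
                                     int64_t a, int64_t b)
{
    __int128 sum = (__int128)a + b;     /* exact, never overflows */
    uint8_t round = get_round(vxrm, (uint64_t)sum, 1);

    return (int64_t)(sum >> 1) + round;
}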
2605 
2606 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2607 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2608 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2609 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2610 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2611 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2612 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2613 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2614 
2615 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2616 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2617 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2618 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2619 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2620 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2621 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2622 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2623 
2624 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2625                                uint32_t a, uint32_t b)
2626 {
2627     uint64_t res = (uint64_t)a + b;
2628     uint8_t round = get_round(vxrm, res, 1);
2629 
2630     return (res >> 1) + round;
2631 }
2632 
2633 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2634                                uint64_t a, uint64_t b)
2635 {
2636     uint64_t res = a + b;
2637     uint8_t round = get_round(vxrm, res, 1);
2638     uint64_t over = (uint64_t)(res < a) << 63;
2639 
2640     return ((res >> 1) | over) + round;
2641 }
2642 
2643 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2644 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2645 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2646 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2647 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2648 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2649 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2650 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2651 
2652 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2653 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2654 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2655 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2656 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2657 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2658 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2659 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2660 
2661 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2662                              int32_t b)
2663 {
2664     int64_t res = (int64_t)a - b;
2665     uint8_t round = get_round(vxrm, res, 1);
2666 
2667     return (res >> 1) + round;
2668 }
2669 
2670 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2671                              int64_t b)
2672 {
2673     int64_t res = (int64_t)a - b;
2674     uint8_t round = get_round(vxrm, res, 1);
2675     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2676 
2677     /* With signed overflow, bit 64 of the real difference is the inverse of bit 63. */
2678     return ((res >> 1) ^ over) + round;
2679 }
2680 
2681 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2682 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2683 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2684 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2685 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2686 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2687 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2688 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2689 
2690 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2691 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2692 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2693 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2694 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2695 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2696 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2697 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2698 
2699 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2700                                uint32_t a, uint32_t b)
2701 {
2702     int64_t res = (int64_t)a - b;
2703     uint8_t round = get_round(vxrm, res, 1);
2704 
2705     return (res >> 1) + round;
2706 }
2707 
2708 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2709                                uint64_t a, uint64_t b)
2710 {
2711     uint64_t res = (uint64_t)a - b;
2712     uint8_t round = get_round(vxrm, res, 1);
2713     uint64_t over = (uint64_t)(res > a) << 63;
2714 
2715     return ((res >> 1) | over) + round;
2716 }
2717 
2718 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2719 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2720 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2721 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2722 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2723 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2724 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2725 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2726 
2727 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2728 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2729 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2730 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2731 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2732 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2733 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2734 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2735 
2736 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2737 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2738 {
2739     uint8_t round;
2740     int16_t res;
2741 
2742     res = (int16_t)a * (int16_t)b;
2743     round = get_round(vxrm, res, 7);
2744     res = (res >> 7) + round;
2745 
2746     if (res > INT8_MAX) {
2747         env->vxsat = 0x1;
2748         return INT8_MAX;
2749     } else if (res < INT8_MIN) {
2750         env->vxsat = 0x1;
2751         return INT8_MIN;
2752     } else {
2753         return res;
2754     }
2755 }
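/*
 * vsmul treats its operands as signed fixed-point values with SEW - 1
 * fractional bits, so the double-width product is shifted right by
 * SEW - 1 (with rounding) to return to the source format.  For example,
 * with SEW = 8 the operands encode value / 128:
 *   a = b = 64 (0.5):     64 * 64 = 4096,   4096 >> 7 = 32   (0.25)
 *   a = b = -128 (-1.0):  -128 * -128 = 16384, 16384 >> 7 = 128, which
 *                         does not fit in int8_t, so the result is
 *                         saturated to 127 and vxsat is set (the
 *                         classic -1.0 * -1.0 overflow case).
 */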
2756 
2757 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2758 {
2759     uint8_t round;
2760     int32_t res;
2761 
2762     res = (int32_t)a * (int32_t)b;
2763     round = get_round(vxrm, res, 15);
2764     res = (res >> 15) + round;
2765 
2766     if (res > INT16_MAX) {
2767         env->vxsat = 0x1;
2768         return INT16_MAX;
2769     } else if (res < INT16_MIN) {
2770         env->vxsat = 0x1;
2771         return INT16_MIN;
2772     } else {
2773         return res;
2774     }
2775 }
2776 
2777 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2778 {
2779     uint8_t round;
2780     int64_t res;
2781 
2782     res = (int64_t)a * (int64_t)b;
2783     round = get_round(vxrm, res, 31);
2784     res = (res >> 31) + round;
2785 
2786     if (res > INT32_MAX) {
2787         env->vxsat = 0x1;
2788         return INT32_MAX;
2789     } else if (res < INT32_MIN) {
2790         env->vxsat = 0x1;
2791         return INT32_MIN;
2792     } else {
2793         return res;
2794     }
2795 }
2796 
2797 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2798 {
2799     uint8_t round;
2800     uint64_t hi_64, lo_64;
2801     int64_t res;
2802 
2803     if (a == INT64_MIN && b == INT64_MIN) {
2804         env->vxsat = 1;
2805         return INT64_MAX;
2806     }
2807 
2808     muls64(&lo_64, &hi_64, a, b);
2809     round = get_round(vxrm, lo_64, 63);
2810     /*
2811      * Cannot overflow: apart from INT64_MIN * INT64_MIN (handled above),
2812      * the 128-bit product always has at least 2 sign bits.
2813      */
2814     res = (hi_64 << 1) | (lo_64 >> 63);
2815     if (round) {
2816         if (res == INT64_MAX) {
2817             env->vxsat = 1;
2818         } else {
2819             res += 1;
2820         }
2821     }
2822     return res;
2823 }
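/*
 * In vsmul64() the 128-bit product is only available as a hi/lo pair,
 * so (hi_64 << 1) | (lo_64 >> 63) reconstructs product bits [126:63],
 * i.e. the product shifted right by 63, while the discarded low 63
 * bits feed get_round() through lo_64.  The rounding increment is
 * applied separately so that a result already at INT64_MAX saturates
 * instead of wrapping around.
 */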
2824 
2825 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2826 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2827 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2828 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2829 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2830 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2831 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2832 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2833 
2834 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2835 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2836 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2837 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2838 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2839 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2840 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2841 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2842 
2843 /* Vector Single-Width Scaling Shift Instructions */
2844 static inline uint8_t
2845 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2846 {
2847     uint8_t round, shift = b & 0x7;
2848     uint8_t res;
2849 
2850     round = get_round(vxrm, a, shift);
2851     res = (a >> shift) + round;
2852     return res;
2853 }
2854 static inline uint16_t
2855 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2856 {
2857     uint8_t round, shift = b & 0xf;
2858 
2859     round = get_round(vxrm, a, shift);
2860     return (a >> shift) + round;
2861 }
2862 static inline uint32_t
2863 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2864 {
2865     uint8_t round, shift = b & 0x1f;
2866 
2867     round = get_round(vxrm, a, shift);
2868     return (a >> shift) + round;
2869 }
2870 static inline uint64_t
2871 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2872 {
2873     uint8_t round, shift = b & 0x3f;
2874 
2875     round = get_round(vxrm, a, shift);
2876     return (a >> shift) + round;
2877 }
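/*
 * Example (illustration only): vssrl with a = 30 (0b11110) and
 * shift = 3 discards 0b110, leaving 3; under rnu (vxrm = 0) the top
 * discarded bit is set, so get_round() returns 1 and the result is 4,
 * i.e. 30 / 8 = 3.75 rounded to nearest.  The shift amount is masked
 * to log2(SEW) bits (b & 0x3f for the 64-bit variant), so it never
 * exceeds SEW - 1 and get_round() never sees an out-of-range shift.
 */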
2878 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2879 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2880 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2881 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2882 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2883 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2884 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2885 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2886 
2887 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2888 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2889 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2890 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2891 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2892 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2893 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2894 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2895 
2896 static inline int8_t
2897 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2898 {
2899     uint8_t round, shift = b & 0x7;
2900 
2901     round = get_round(vxrm, a, shift);
2902     return (a >> shift) + round;
2903 }
2904 static inline int16_t
2905 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2906 {
2907     uint8_t round, shift = b & 0xf;
2908 
2909     round = get_round(vxrm, a, shift);
2910     return (a >> shift) + round;
2911 }
2912 static inline int32_t
2913 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2914 {
2915     uint8_t round, shift = b & 0x1f;
2916 
2917     round = get_round(vxrm, a, shift);
2918     return (a >> shift) + round;
2919 }
2920 static inline int64_t
2921 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2922 {
2923     uint8_t round, shift = b & 0x3f;
2924 
2925     round = get_round(vxrm, a, shift);
2926     return (a >> shift) + round;
2927 }
2928 
2929 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2930 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2931 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2932 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2933 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2934 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2935 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2936 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2937 
2938 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2939 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2940 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2941 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2942 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2943 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2944 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2945 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2946 
2947 /* Vector Narrowing Fixed-Point Clip Instructions */
2948 static inline int8_t
2949 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2950 {
2951     uint8_t round, shift = b & 0xf;
2952     int16_t res;
2953 
2954     round = get_round(vxrm, a, shift);
2955     res = (a >> shift) + round;
2956     if (res > INT8_MAX) {
2957         env->vxsat = 0x1;
2958         return INT8_MAX;
2959     } else if (res < INT8_MIN) {
2960         env->vxsat = 0x1;
2961         return INT8_MIN;
2962     } else {
2963         return res;
2964     }
2965 }
2966 
2967 static inline int16_t
2968 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2969 {
2970     uint8_t round, shift = b & 0x1f;
2971     int32_t res;
2972 
2973     round = get_round(vxrm, a, shift);
2974     res = (a >> shift) + round;
2975     if (res > INT16_MAX) {
2976         env->vxsat = 0x1;
2977         return INT16_MAX;
2978     } else if (res < INT16_MIN) {
2979         env->vxsat = 0x1;
2980         return INT16_MIN;
2981     } else {
2982         return res;
2983     }
2984 }
2985 
2986 static inline int32_t
2987 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2988 {
2989     uint8_t round, shift = b & 0x3f;
2990     int64_t res;
2991 
2992     round = get_round(vxrm, a, shift);
2993     res = (a >> shift) + round;
2994     if (res > INT32_MAX) {
2995         env->vxsat = 0x1;
2996         return INT32_MAX;
2997     } else if (res < INT32_MIN) {
2998         env->vxsat = 0x1;
2999         return INT32_MIN;
3000     } else {
3001         return res;
3002     }
3003 }
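/*
 * Example (illustration only) for vnclip8: narrowing a = 0x1234 (4660)
 * with shift = 4 under rnu gives 4660 >> 4 = 291 and a clear top
 * discarded bit, so no rounding increment; 291 does not fit in int8_t,
 * so the result is clipped to INT8_MAX (127) and vxsat is set.  The
 * shift amount comes from the narrow operand but is masked to cover
 * the full 2 * SEW source width (b & 0xf here, 0x1f and 0x3f for the
 * wider variants).
 */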
3004 
3005 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3006 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3007 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3008 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3009 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3010 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3011 
3012 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3013 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3014 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3015 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3016 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3017 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3018 
3019 static inline uint8_t
3020 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3021 {
3022     uint8_t round, shift = b & 0xf;
3023     uint16_t res;
3024 
3025     round = get_round(vxrm, a, shift);
3026     res = (a >> shift) + round;
3027     if (res > UINT8_MAX) {
3028         env->vxsat = 0x1;
3029         return UINT8_MAX;
3030     } else {
3031         return res;
3032     }
3033 }
3034 
3035 static inline uint16_t
3036 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3037 {
3038     uint8_t round, shift = b & 0x1f;
3039     uint32_t res;
3040 
3041     round = get_round(vxrm, a, shift);
3042     res = (a >> shift) + round;
3043     if (res > UINT16_MAX) {
3044         env->vxsat = 0x1;
3045         return UINT16_MAX;
3046     } else {
3047         return res;
3048     }
3049 }
3050 
3051 static inline uint32_t
3052 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3053 {
3054     uint8_t round, shift = b & 0x3f;
3055     uint64_t res;
3056 
3057     round = get_round(vxrm, a, shift);
3058     res = (a >> shift) + round;
3059     if (res > UINT32_MAX) {
3060         env->vxsat = 0x1;
3061         return UINT32_MAX;
3062     } else {
3063         return res;
3064     }
3065 }
3066 
3067 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3068 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3069 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3070 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3071 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3072 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3073 
3074 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3075 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3076 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3077 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3078 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3079 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3080 
3081 /*
3082  * Vector Floating-Point Arithmetic Instructions
3083  */
3084 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3085 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3086 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3087                       CPURISCVState *env)                      \
3088 {                                                              \
3089     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3090     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3091     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3092 }
3093 
3094 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3095 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3096                   void *vs2, CPURISCVState *env,          \
3097                   uint32_t desc)                          \
3098 {                                                         \
3099     uint32_t vm = vext_vm(desc);                          \
3100     uint32_t vl = env->vl;                                \
3101     uint32_t total_elems =                                \
3102         vext_get_total_elems(env, desc, ESZ);             \
3103     uint32_t vta = vext_vta(desc);                        \
3104     uint32_t vma = vext_vma(desc);                        \
3105     uint32_t i;                                           \
3106                                                           \
3107     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3108                                                           \
3109     for (i = env->vstart; i < vl; i++) {                  \
3110         if (!vm && !vext_elem_mask(v0, i)) {              \
3111             /* set masked-off elements to 1s */           \
3112             vext_set_elems_1s(vd, vma, i * ESZ,           \
3113                               (i + 1) * ESZ);             \
3114             continue;                                     \
3115         }                                                 \
3116         do_##NAME(vd, vs1, vs2, i, env);                  \
3117     }                                                     \
3118     env->vstart = 0;                                      \
3119     /* set tail elements to 1s */                         \
3120     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3121                       total_elems * ESZ);                 \
3122 }
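/*
 * For reference, a rough sketch of what the first instantiation below
 * expands to (illustration only, eliding the exact macro plumbing):
 * RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add) defines
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * and GEN_VEXT_VV_ENV(vfadd_vv_h, 2) wraps it in helper_vfadd_vv_h(),
 * which loops over the active elements, writes 1s to masked-off
 * elements when the mask policy is agnostic, and fills the tail.  Note
 * the operand order OP(s2, s1, ...): for non-commutative operations
 * this yields vs2 OP vs1, matching e.g. vd[i] = vs2[i] - vs1[i] for
 * vfsub.vv.
 */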
3123 
3124 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3125 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3126 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3127 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3128 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3129 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3130 
3131 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3132 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3133                       CPURISCVState *env)                      \
3134 {                                                              \
3135     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3136     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3137 }
3138 
3139 #define GEN_VEXT_VF(NAME, ESZ)                            \
3140 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3141                   void *vs2, CPURISCVState *env,          \
3142                   uint32_t desc)                          \
3143 {                                                         \
3144     uint32_t vm = vext_vm(desc);                          \
3145     uint32_t vl = env->vl;                                \
3146     uint32_t total_elems =                                \
3147         vext_get_total_elems(env, desc, ESZ);             \
3148     uint32_t vta = vext_vta(desc);                        \
3149     uint32_t vma = vext_vma(desc);                        \
3150     uint32_t i;                                           \
3151                                                           \
3152     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3153                                                           \
3154     for (i = env->vstart; i < vl; i++) {                  \
3155         if (!vm && !vext_elem_mask(v0, i)) {              \
3156             /* set masked-off elements to 1s */           \
3157             vext_set_elems_1s(vd, vma, i * ESZ,           \
3158                               (i + 1) * ESZ);             \
3159             continue;                                     \
3160         }                                                 \
3161         do_##NAME(vd, s1, vs2, i, env);                   \
3162     }                                                     \
3163     env->vstart = 0;                                      \
3164     /* set tail elements to 1s */                         \
3165     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3166                       total_elems * ESZ);                 \
3167 }
3168 
3169 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3170 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3171 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3172 GEN_VEXT_VF(vfadd_vf_h, 2)
3173 GEN_VEXT_VF(vfadd_vf_w, 4)
3174 GEN_VEXT_VF(vfadd_vf_d, 8)
3175 
3176 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3177 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3178 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3179 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3180 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3181 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3182 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3183 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3184 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3185 GEN_VEXT_VF(vfsub_vf_h, 2)
3186 GEN_VEXT_VF(vfsub_vf_w, 4)
3187 GEN_VEXT_VF(vfsub_vf_d, 8)
3188 
3189 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3190 {
3191     return float16_sub(b, a, s);
3192 }
3193 
3194 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3195 {
3196     return float32_sub(b, a, s);
3197 }
3198 
3199 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3200 {
3201     return float64_sub(b, a, s);
3202 }
3203 
3204 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3205 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3206 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3207 GEN_VEXT_VF(vfrsub_vf_h, 2)
3208 GEN_VEXT_VF(vfrsub_vf_w, 4)
3209 GEN_VEXT_VF(vfrsub_vf_d, 8)
3210 
3211 /* Vector Widening Floating-Point Add/Subtract Instructions */
3212 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3213 {
3214     return float32_add(float16_to_float32(a, true, s),
3215                        float16_to_float32(b, true, s), s);
3216 }
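/*
 * The widening forms promote each source element first, so only the
 * final float32/float64 operation rounds.  The 'true' flag passed to
 * float16_to_float32() selects IEEE half-precision input (rather than
 * the alternative half-precision format), and the promotion itself is
 * exact since every binary16 value is representable in binary32.
 */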
3217 
3218 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3219 {
3220     return float64_add(float32_to_float64(a, s),
3221                        float32_to_float64(b, s), s);
3222 
3223 }
3224 
3225 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3226 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3227 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3228 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3229 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3230 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3231 GEN_VEXT_VF(vfwadd_vf_h, 4)
3232 GEN_VEXT_VF(vfwadd_vf_w, 8)
3233 
3234 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3235 {
3236     return float32_sub(float16_to_float32(a, true, s),
3237                        float16_to_float32(b, true, s), s);
3238 }
3239 
3240 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3241 {
3242     return float64_sub(float32_to_float64(a, s),
3243                        float32_to_float64(b, s), s);
3244 
3245 }
3246 
3247 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3248 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3249 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3250 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3251 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3252 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3253 GEN_VEXT_VF(vfwsub_vf_h, 4)
3254 GEN_VEXT_VF(vfwsub_vf_w, 8)
3255 
3256 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3257 {
3258     return float32_add(a, float16_to_float32(b, true, s), s);
3259 }
3260 
3261 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3262 {
3263     return float64_add(a, float32_to_float64(b, s), s);
3264 }
3265 
3266 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3267 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3268 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3269 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3270 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3271 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3272 GEN_VEXT_VF(vfwadd_wf_h, 4)
3273 GEN_VEXT_VF(vfwadd_wf_w, 8)
3274 
3275 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3276 {
3277     return float32_sub(a, float16_to_float32(b, true, s), s);
3278 }
3279 
3280 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3281 {
3282     return float64_sub(a, float32_to_float64(b, s), s);
3283 }
3284 
3285 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3286 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3287 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3288 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3289 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3290 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3291 GEN_VEXT_VF(vfwsub_wf_h, 4)
3292 GEN_VEXT_VF(vfwsub_wf_w, 8)
3293 
3294 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3295 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3296 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3297 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3298 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3299 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3300 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3301 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3302 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3303 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3304 GEN_VEXT_VF(vfmul_vf_h, 2)
3305 GEN_VEXT_VF(vfmul_vf_w, 4)
3306 GEN_VEXT_VF(vfmul_vf_d, 8)
3307 
3308 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3309 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3310 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3311 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3312 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3313 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3314 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3315 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3316 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3317 GEN_VEXT_VF(vfdiv_vf_h, 2)
3318 GEN_VEXT_VF(vfdiv_vf_w, 4)
3319 GEN_VEXT_VF(vfdiv_vf_d, 8)
3320 
3321 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3322 {
3323     return float16_div(b, a, s);
3324 }
3325 
3326 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3327 {
3328     return float32_div(b, a, s);
3329 }
3330 
3331 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3332 {
3333     return float64_div(b, a, s);
3334 }
3335 
3336 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3337 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3338 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3339 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3340 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3341 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3342 
3343 /* Vector Widening Floating-Point Multiply */
3344 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3345 {
3346     return float32_mul(float16_to_float32(a, true, s),
3347                        float16_to_float32(b, true, s), s);
3348 }
3349 
3350 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3351 {
3352     return float64_mul(float32_to_float64(a, s),
3353                        float32_to_float64(b, s), s);
3354 
3355 }
3356 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3357 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3358 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3359 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3360 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3361 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3362 GEN_VEXT_VF(vfwmul_vf_h, 4)
3363 GEN_VEXT_VF(vfwmul_vf_w, 8)
3364 
3365 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3366 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3367 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3368                       CPURISCVState *env)                          \
3369 {                                                                  \
3370     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3371     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3372     TD d = *((TD *)vd + HD(i));                                    \
3373     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3374 }
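/*
 * In OPFVV3/OPFVF3 the callback receives a = vs2[i], b = vs1[i] (or the
 * scalar, narrowed to the element type by the (TX1)(T1)s1 cast in the
 * _vf forms), and d = vd[i].  The per-size helpers below arrange these
 * to match the operand roles in the spec, roughly:
 *   fmacc16(a, b, d) = a * b + d  ->  vd[i] = vs2[i] * vs1[i] + vd[i]
 *   fmadd16(a, b, d) = d * b + a  ->  vd[i] = vd[i] * vs1[i] + vs2[i]
 * with the fnmacc/fmsac/fnmsac (and fnmadd/fmsub/fnmsub) variants
 * negating the product and/or the addend via the float_muladd_negate_* flags.
 */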
3375 
3376 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3377 {
3378     return float16_muladd(a, b, d, 0, s);
3379 }
3380 
3381 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3382 {
3383     return float32_muladd(a, b, d, 0, s);
3384 }
3385 
3386 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3387 {
3388     return float64_muladd(a, b, d, 0, s);
3389 }
3390 
3391 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3392 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3393 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3394 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3395 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3396 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3397 
3398 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3399 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3400                       CPURISCVState *env)                         \
3401 {                                                                 \
3402     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3403     TD d = *((TD *)vd + HD(i));                                   \
3404     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3405 }
3406 
3407 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3408 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3409 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3410 GEN_VEXT_VF(vfmacc_vf_h, 2)
3411 GEN_VEXT_VF(vfmacc_vf_w, 4)
3412 GEN_VEXT_VF(vfmacc_vf_d, 8)
3413 
3414 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3415 {
3416     return float16_muladd(a, b, d, float_muladd_negate_c |
3417                                    float_muladd_negate_product, s);
3418 }
3419 
3420 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3421 {
3422     return float32_muladd(a, b, d, float_muladd_negate_c |
3423                                    float_muladd_negate_product, s);
3424 }
3425 
3426 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3427 {
3428     return float64_muladd(a, b, d, float_muladd_negate_c |
3429                                    float_muladd_negate_product, s);
3430 }
3431 
3432 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3433 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3434 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3435 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3436 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3437 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3438 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3439 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3440 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3441 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3442 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3443 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3444 
3445 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3446 {
3447     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3448 }
3449 
3450 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3451 {
3452     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3453 }
3454 
3455 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3456 {
3457     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3458 }
3459 
3460 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3461 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3462 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3463 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3464 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3465 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3466 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3467 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3468 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3469 GEN_VEXT_VF(vfmsac_vf_h, 2)
3470 GEN_VEXT_VF(vfmsac_vf_w, 4)
3471 GEN_VEXT_VF(vfmsac_vf_d, 8)
3472 
3473 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3474 {
3475     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3476 }
3477 
3478 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3479 {
3480     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3481 }
3482 
3483 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3484 {
3485     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3486 }
3487 
3488 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3489 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3490 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3491 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3492 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3493 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3494 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3495 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3496 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3497 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3498 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3499 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3500 
3501 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3502 {
3503     return float16_muladd(d, b, a, 0, s);
3504 }
3505 
3506 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3507 {
3508     return float32_muladd(d, b, a, 0, s);
3509 }
3510 
3511 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3512 {
3513     return float64_muladd(d, b, a, 0, s);
3514 }
3515 
3516 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3517 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3518 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3519 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3520 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3521 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3522 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3523 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3524 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3525 GEN_VEXT_VF(vfmadd_vf_h, 2)
3526 GEN_VEXT_VF(vfmadd_vf_w, 4)
3527 GEN_VEXT_VF(vfmadd_vf_d, 8)
3528 
3529 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3530 {
3531     return float16_muladd(d, b, a, float_muladd_negate_c |
3532                                    float_muladd_negate_product, s);
3533 }
3534 
3535 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3536 {
3537     return float32_muladd(d, b, a, float_muladd_negate_c |
3538                                    float_muladd_negate_product, s);
3539 }
3540 
3541 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3542 {
3543     return float64_muladd(d, b, a, float_muladd_negate_c |
3544                                    float_muladd_negate_product, s);
3545 }
3546 
3547 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3548 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3549 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3550 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3551 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3552 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3553 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3554 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3555 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3556 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3557 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3558 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3559 
3560 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3561 {
3562     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3563 }
3564 
3565 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3566 {
3567     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3568 }
3569 
3570 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3571 {
3572     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3573 }
3574 
3575 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3576 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3577 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3578 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3579 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3580 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3581 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3582 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3583 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3584 GEN_VEXT_VF(vfmsub_vf_h, 2)
3585 GEN_VEXT_VF(vfmsub_vf_w, 4)
3586 GEN_VEXT_VF(vfmsub_vf_d, 8)
3587 
3588 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3589 {
3590     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3591 }
3592 
3593 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3594 {
3595     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3596 }
3597 
3598 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3599 {
3600     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3601 }
3602 
3603 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3604 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3605 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3606 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3607 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3608 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3609 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3610 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3611 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3612 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3613 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3614 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3615 
3616 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3617 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3618 {
3619     return float32_muladd(float16_to_float32(a, true, s),
3620                           float16_to_float32(b, true, s), d, 0, s);
3621 }
3622 
3623 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3624 {
3625     return float64_muladd(float32_to_float64(a, s),
3626                           float32_to_float64(b, s), d, 0, s);
3627 }
3628 
3629 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3630 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3631 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3632 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3633 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3634 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3635 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3636 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3637 
3638 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3639 {
3640     return float32_muladd(bfloat16_to_float32(a, s),
3641                           bfloat16_to_float32(b, s), d, 0, s);
3642 }
3643 
3644 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3645 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3646 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3647 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3648 
3649 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3650 {
3651     return float32_muladd(float16_to_float32(a, true, s),
3652                           float16_to_float32(b, true, s), d,
3653                           float_muladd_negate_c | float_muladd_negate_product,
3654                           s);
3655 }
3656 
3657 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3658 {
3659     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3660                           d, float_muladd_negate_c |
3661                              float_muladd_negate_product, s);
3662 }
3663 
3664 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3665 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3666 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3667 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3668 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3669 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3670 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3671 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3672 
3673 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3674 {
3675     return float32_muladd(float16_to_float32(a, true, s),
3676                           float16_to_float32(b, true, s), d,
3677                           float_muladd_negate_c, s);
3678 }
3679 
3680 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3681 {
3682     return float64_muladd(float32_to_float64(a, s),
3683                           float32_to_float64(b, s), d,
3684                           float_muladd_negate_c, s);
3685 }
3686 
3687 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3688 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3689 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3690 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3691 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3692 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3693 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3694 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3695 
3696 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3697 {
3698     return float32_muladd(float16_to_float32(a, true, s),
3699                           float16_to_float32(b, true, s), d,
3700                           float_muladd_negate_product, s);
3701 }
3702 
3703 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3704 {
3705     return float64_muladd(float32_to_float64(a, s),
3706                           float32_to_float64(b, s), d,
3707                           float_muladd_negate_product, s);
3708 }
3709 
3710 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3711 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3712 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3713 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3714 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3715 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3716 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3717 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3718 
3719 /* Vector Floating-Point Square-Root Instruction */
3720 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3721 static void do_##NAME(void *vd, void *vs2, int i,      \
3722                       CPURISCVState *env)              \
3723 {                                                      \
3724     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3725     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3726 }
3727 
3728 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3729 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3730                   CPURISCVState *env, uint32_t desc)   \
3731 {                                                      \
3732     uint32_t vm = vext_vm(desc);                       \
3733     uint32_t vl = env->vl;                             \
3734     uint32_t total_elems =                             \
3735         vext_get_total_elems(env, desc, ESZ);          \
3736     uint32_t vta = vext_vta(desc);                     \
3737     uint32_t vma = vext_vma(desc);                     \
3738     uint32_t i;                                        \
3739                                                        \
3740     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3741                                                        \
3742     if (vl == 0) {                                     \
3743         return;                                        \
3744     }                                                  \
3745     for (i = env->vstart; i < vl; i++) {               \
3746         if (!vm && !vext_elem_mask(v0, i)) {           \
3747             /* set masked-off elements to 1s */        \
3748             vext_set_elems_1s(vd, vma, i * ESZ,        \
3749                               (i + 1) * ESZ);          \
3750             continue;                                  \
3751         }                                              \
3752         do_##NAME(vd, vs2, i, env);                    \
3753     }                                                  \
3754     env->vstart = 0;                                   \
3755     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3756                       total_elems * ESZ);              \
3757 }
3758 
3759 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3760 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3761 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3762 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3763 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3764 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3765 
3766 /*
3767  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3768  *
3769  * Adapted from riscv-v-spec recip.c:
3770  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3771  */
3772 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3773 {
3774     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3775     uint64_t exp = extract64(f, frac_size, exp_size);
3776     uint64_t frac = extract64(f, 0, frac_size);
3777 
3778     const uint8_t lookup_table[] = {
3779         52, 51, 50, 48, 47, 46, 44, 43,
3780         42, 41, 40, 39, 38, 36, 35, 34,
3781         33, 32, 31, 30, 30, 29, 28, 27,
3782         26, 25, 24, 23, 23, 22, 21, 20,
3783         19, 19, 18, 17, 16, 16, 15, 14,
3784         14, 13, 12, 12, 11, 10, 10, 9,
3785         9, 8, 7, 7, 6, 6, 5, 4,
3786         4, 3, 3, 2, 2, 1, 1, 0,
3787         127, 125, 123, 121, 119, 118, 116, 114,
3788         113, 111, 109, 108, 106, 105, 103, 102,
3789         100, 99, 97, 96, 95, 93, 92, 91,
3790         90, 88, 87, 86, 85, 84, 83, 82,
3791         80, 79, 78, 77, 76, 75, 74, 73,
3792         72, 71, 70, 70, 69, 68, 67, 66,
3793         65, 64, 63, 63, 62, 61, 60, 59,
3794         59, 58, 57, 56, 56, 55, 54, 53
3795     };
3796     const int precision = 7;
3797 
3798     if (exp == 0 && frac != 0) { /* subnormal */
3799         /* Normalize the subnormal. */
3800         while (extract64(frac, frac_size - 1, 1) == 0) {
3801             exp--;
3802             frac <<= 1;
3803         }
3804 
3805         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3806     }
3807 
3808     int idx = ((exp & 1) << (precision - 1)) |
3809               (frac >> (frac_size - precision + 1));
3810     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3811                         (frac_size - precision);
3812     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3813 
3814     uint64_t val = 0;
3815     val = deposit64(val, 0, frac_size, out_frac);
3816     val = deposit64(val, frac_size, exp_size, out_exp);
3817     val = deposit64(val, frac_size + exp_size, 1, sign);
3818     return val;
3819 }
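/*
 * Worked example (illustration only), single precision f = 4.0
 * (sign = 0, exp = 129, frac = 0):
 *   idx      = ((exp & 1) << 6) | (frac >> 17) = 0x40, table[idx] = 127
 *   out_frac = 127 << 16
 *   out_exp  = (3 * 127 + ~exp) / 2 = (381 - 130) / 2 = 125
 * (the unsigned wrap-around makes ~exp act as -exp - 1), giving
 * 2^(125-127) * (1 + 127/128) ~= 0.498, a 7-bit estimate of
 * 1/sqrt(4) = 0.5.  The low exponent bit selects the upper or lower
 * half of the table because the square root halves the exponent, so
 * odd and even input exponents need different significand estimates.
 */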
3820 
3821 static float16 frsqrt7_h(float16 f, float_status *s)
3822 {
3823     int exp_size = 5, frac_size = 10;
3824     bool sign = float16_is_neg(f);
3825 
3826     /*
3827      * frsqrt7(sNaN) = canonical NaN
3828      * frsqrt7(-inf) = canonical NaN
3829      * frsqrt7(-normal) = canonical NaN
3830      * frsqrt7(-subnormal) = canonical NaN
3831      */
3832     if (float16_is_signaling_nan(f, s) ||
3833         (float16_is_infinity(f) && sign) ||
3834         (float16_is_normal(f) && sign) ||
3835         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3836         s->float_exception_flags |= float_flag_invalid;
3837         return float16_default_nan(s);
3838     }
3839 
3840     /* frsqrt7(qNaN) = canonical NaN */
3841     if (float16_is_quiet_nan(f, s)) {
3842         return float16_default_nan(s);
3843     }
3844 
3845     /* frsqrt7(+-0) = +-inf */
3846     if (float16_is_zero(f)) {
3847         s->float_exception_flags |= float_flag_divbyzero;
3848         return float16_set_sign(float16_infinity, sign);
3849     }
3850 
3851     /* frsqrt7(+inf) = +0 */
3852     if (float16_is_infinity(f) && !sign) {
3853         return float16_set_sign(float16_zero, sign);
3854     }
3855 
3856     /* +normal, +subnormal */
3857     uint64_t val = frsqrt7(f, exp_size, frac_size);
3858     return make_float16(val);
3859 }
3860 
3861 static float32 frsqrt7_s(float32 f, float_status *s)
3862 {
3863     int exp_size = 8, frac_size = 23;
3864     bool sign = float32_is_neg(f);
3865 
3866     /*
3867      * frsqrt7(sNaN) = canonical NaN
3868      * frsqrt7(-inf) = canonical NaN
3869      * frsqrt7(-normal) = canonical NaN
3870      * frsqrt7(-subnormal) = canonical NaN
3871      */
3872     if (float32_is_signaling_nan(f, s) ||
3873         (float32_is_infinity(f) && sign) ||
3874         (float32_is_normal(f) && sign) ||
3875         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3876         s->float_exception_flags |= float_flag_invalid;
3877         return float32_default_nan(s);
3878     }
3879 
3880     /* frsqrt7(qNaN) = canonical NaN */
3881     if (float32_is_quiet_nan(f, s)) {
3882         return float32_default_nan(s);
3883     }
3884 
3885     /* frsqrt7(+-0) = +-inf */
3886     if (float32_is_zero(f)) {
3887         s->float_exception_flags |= float_flag_divbyzero;
3888         return float32_set_sign(float32_infinity, sign);
3889     }
3890 
3891     /* frsqrt7(+inf) = +0 */
3892     if (float32_is_infinity(f) && !sign) {
3893         return float32_set_sign(float32_zero, sign);
3894     }
3895 
3896     /* +normal, +subnormal */
3897     uint64_t val = frsqrt7(f, exp_size, frac_size);
3898     return make_float32(val);
3899 }
3900 
3901 static float64 frsqrt7_d(float64 f, float_status *s)
3902 {
3903     int exp_size = 11, frac_size = 52;
3904     bool sign = float64_is_neg(f);
3905 
3906     /*
3907      * frsqrt7(sNaN) = canonical NaN
3908      * frsqrt7(-inf) = canonical NaN
3909      * frsqrt7(-normal) = canonical NaN
3910      * frsqrt7(-subnormal) = canonical NaN
3911      */
3912     if (float64_is_signaling_nan(f, s) ||
3913         (float64_is_infinity(f) && sign) ||
3914         (float64_is_normal(f) && sign) ||
3915         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3916         s->float_exception_flags |= float_flag_invalid;
3917         return float64_default_nan(s);
3918     }
3919 
3920     /* frsqrt7(qNaN) = canonical NaN */
3921     if (float64_is_quiet_nan(f, s)) {
3922         return float64_default_nan(s);
3923     }
3924 
3925     /* frsqrt7(+-0) = +-inf */
3926     if (float64_is_zero(f)) {
3927         s->float_exception_flags |= float_flag_divbyzero;
3928         return float64_set_sign(float64_infinity, sign);
3929     }
3930 
3931     /* frsqrt7(+inf) = +0 */
3932     if (float64_is_infinity(f) && !sign) {
3933         return float64_set_sign(float64_zero, sign);
3934     }
3935 
3936     /* +normal, +subnormal */
3937     uint64_t val = frsqrt7(f, exp_size, frac_size);
3938     return make_float64(val);
3939 }
3940 
3941 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3942 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3943 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3944 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3945 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3946 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3947 
3948 /*
3949  * Vector Floating-Point Reciprocal Estimate Instruction
3950  *
3951  * Adapted from riscv-v-spec recip.c:
3952  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3953  */
3954 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3955                       float_status *s)
3956 {
3957     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3958     uint64_t exp = extract64(f, frac_size, exp_size);
3959     uint64_t frac = extract64(f, 0, frac_size);
3960 
3961     const uint8_t lookup_table[] = {
3962         127, 125, 123, 121, 119, 117, 116, 114,
3963         112, 110, 109, 107, 105, 104, 102, 100,
3964         99, 97, 96, 94, 93, 91, 90, 88,
3965         87, 85, 84, 83, 81, 80, 79, 77,
3966         76, 75, 74, 72, 71, 70, 69, 68,
3967         66, 65, 64, 63, 62, 61, 60, 59,
3968         58, 57, 56, 55, 54, 53, 52, 51,
3969         50, 49, 48, 47, 46, 45, 44, 43,
3970         42, 41, 40, 40, 39, 38, 37, 36,
3971         35, 35, 34, 33, 32, 31, 31, 30,
3972         29, 28, 28, 27, 26, 25, 25, 24,
3973         23, 23, 22, 21, 21, 20, 19, 19,
3974         18, 17, 17, 16, 15, 15, 14, 14,
3975         13, 12, 12, 11, 11, 10, 9, 9,
3976         8, 8, 7, 7, 6, 5, 5, 4,
3977         4, 3, 3, 2, 2, 1, 1, 0
3978     };
3979     const int precision = 7;
3980 
3981     if (exp == 0 && frac != 0) { /* subnormal */
3982         /* Normalize the subnormal. */
3983         while (extract64(frac, frac_size - 1, 1) == 0) {
3984             exp--;
3985             frac <<= 1;
3986         }
3987 
3988         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3989 
3990         if (exp != 0 && exp != UINT64_MAX) {
3991             /*
3992              * Overflow to inf or max value of same sign,
3993              * depending on sign and rounding mode.
3994              */
3995             s->float_exception_flags |= (float_flag_inexact |
3996                                          float_flag_overflow);
3997 
3998             if ((s->float_rounding_mode == float_round_to_zero) ||
3999                 ((s->float_rounding_mode == float_round_down) && !sign) ||
4000                 ((s->float_rounding_mode == float_round_up) && sign)) {
4001                 /* Return greatest/negative finite value. */
4002                 /* Return the greatest-magnitude finite value of the same sign. */
4003                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4004             } else {
4005                 /* Return +-inf. */
4006                 return (sign << (exp_size + frac_size)) |
4007                        MAKE_64BIT_MASK(frac_size, exp_size);
4008             }
4009         }
4010     }
4011 
4012     int idx = frac >> (frac_size - precision);
4013     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4014                         (frac_size - precision);
4015     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4016 
4017     if (out_exp == 0 || out_exp == UINT64_MAX) {
4018         /*
4019          * The result is subnormal, but don't raise the underflow exception,
4020          * because there's no additional loss of precision.
4021          */
4022         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4023         if (out_exp == UINT64_MAX) {
4024             out_frac >>= 1;
4025             out_exp = 0;
4026         }
4027     }
4028 
4029     uint64_t val = 0;
4030     val = deposit64(val, 0, frac_size, out_frac);
4031     val = deposit64(val, frac_size, exp_size, out_exp);
4032     val = deposit64(val, frac_size + exp_size, 1, sign);
4033     return val;
4034 }
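/*
 * Worked example for float32 input 1.0 (0x3f800000): sign = 0,
 * exp = 127, frac = 0, so idx = 0 and lookup_table[0] = 127.  Then
 * out_frac = 127 << 16 and out_exp = 2 * 127 + ~127 = 253 - 127 = 126
 * (modulo 2^64), giving 0x3f7f0000, i.e. 2^-1 * (1 + 127/128)
 * = 0.99609375, the 7-bit reciprocal estimate of 1.0.
 */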
4035 
4036 static float16 frec7_h(float16 f, float_status *s)
4037 {
4038     int exp_size = 5, frac_size = 10;
4039     bool sign = float16_is_neg(f);
4040 
4041     /* frec7(+-inf) = +-0 */
4042     if (float16_is_infinity(f)) {
4043         return float16_set_sign(float16_zero, sign);
4044     }
4045 
4046     /* frec7(+-0) = +-inf */
4047     if (float16_is_zero(f)) {
4048         s->float_exception_flags |= float_flag_divbyzero;
4049         return float16_set_sign(float16_infinity, sign);
4050     }
4051 
4052     /* frec7(sNaN) = canonical NaN */
4053     if (float16_is_signaling_nan(f, s)) {
4054         s->float_exception_flags |= float_flag_invalid;
4055         return float16_default_nan(s);
4056     }
4057 
4058     /* frec7(qNaN) = canonical NaN */
4059     if (float16_is_quiet_nan(f, s)) {
4060         return float16_default_nan(s);
4061     }
4062 
4063     /* +-normal, +-subnormal */
4064     uint64_t val = frec7(f, exp_size, frac_size, s);
4065     return make_float16(val);
4066 }
4067 
4068 static float32 frec7_s(float32 f, float_status *s)
4069 {
4070     int exp_size = 8, frac_size = 23;
4071     bool sign = float32_is_neg(f);
4072 
4073     /* frec7(+-inf) = +-0 */
4074     if (float32_is_infinity(f)) {
4075         return float32_set_sign(float32_zero, sign);
4076     }
4077 
4078     /* frec7(+-0) = +-inf */
4079     if (float32_is_zero(f)) {
4080         s->float_exception_flags |= float_flag_divbyzero;
4081         return float32_set_sign(float32_infinity, sign);
4082     }
4083 
4084     /* frec7(sNaN) = canonical NaN */
4085     if (float32_is_signaling_nan(f, s)) {
4086         s->float_exception_flags |= float_flag_invalid;
4087         return float32_default_nan(s);
4088     }
4089 
4090     /* frec7(qNaN) = canonical NaN */
4091     if (float32_is_quiet_nan(f, s)) {
4092         return float32_default_nan(s);
4093     }
4094 
4095     /* +-normal, +-subnormal */
4096     uint64_t val = frec7(f, exp_size, frac_size, s);
4097     return make_float32(val);
4098 }
4099 
4100 static float64 frec7_d(float64 f, float_status *s)
4101 {
4102     int exp_size = 11, frac_size = 52;
4103     bool sign = float64_is_neg(f);
4104 
4105     /* frec7(+-inf) = +-0 */
4106     if (float64_is_infinity(f)) {
4107         return float64_set_sign(float64_zero, sign);
4108     }
4109 
4110     /* frec7(+-0) = +-inf */
4111     if (float64_is_zero(f)) {
4112         s->float_exception_flags |= float_flag_divbyzero;
4113         return float64_set_sign(float64_infinity, sign);
4114     }
4115 
4116     /* frec7(sNaN) = canonical NaN */
4117     if (float64_is_signaling_nan(f, s)) {
4118         s->float_exception_flags |= float_flag_invalid;
4119         return float64_default_nan(s);
4120     }
4121 
4122     /* frec7(qNaN) = canonical NaN */
4123     if (float64_is_quiet_nan(f, s)) {
4124         return float64_default_nan(s);
4125     }
4126 
4127     /* +-normal, +-subnormal */
4128     uint64_t val = frec7(f, exp_size, frac_size, s);
4129     return make_float64(val);
4130 }
4131 
4132 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4133 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4134 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4135 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4136 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4137 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4138 
4139 /* Vector Floating-Point MIN/MAX Instructions */
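/*
 * float*_minimum_number/maximum_number implement the IEEE 754-2019
 * minimumNumber/maximumNumber operations required by vfmin/vfmax: if
 * exactly one operand is a NaN, the other (numeric) operand is
 * returned; a signaling NaN still raises the invalid flag.
 */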
4140 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4141 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4142 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4143 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4144 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4145 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4146 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4147 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4148 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4149 GEN_VEXT_VF(vfmin_vf_h, 2)
4150 GEN_VEXT_VF(vfmin_vf_w, 4)
4151 GEN_VEXT_VF(vfmin_vf_d, 8)
4152 
4153 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4154 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4155 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4156 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4157 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4158 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4159 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4160 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4161 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4162 GEN_VEXT_VF(vfmax_vf_h, 2)
4163 GEN_VEXT_VF(vfmax_vf_w, 4)
4164 GEN_VEXT_VF(vfmax_vf_d, 8)
4165 
4166 /* Vector Floating-Point Sign-Injection Instructions */
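/*
 * In the helpers below, the exponent and significand bits come from a
 * while the sign bit is derived from b: copied for fsgnj, inverted for
 * fsgnjn, and XORed with a's own sign for fsgnjx.  Through the
 * OPFVV2/OPFVF2 wrappers this gives vd[i] the magnitude of vs2[i] and
 * a sign taken from vs1[i] (or from the scalar for the .vf forms).
 */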
4167 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4168 {
4169     return deposit64(b, 0, 15, a);
4170 }
4171 
4172 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4173 {
4174     return deposit64(b, 0, 31, a);
4175 }
4176 
4177 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4178 {
4179     return deposit64(b, 0, 63, a);
4180 }
4181 
4182 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4183 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4184 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4185 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4186 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4187 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4188 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4189 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4190 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4191 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4192 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4193 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4194 
4195 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4196 {
4197     return deposit64(~b, 0, 15, a);
4198 }
4199 
4200 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4201 {
4202     return deposit64(~b, 0, 31, a);
4203 }
4204 
4205 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4206 {
4207     return deposit64(~b, 0, 63, a);
4208 }
4209 
4210 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4211 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4212 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4213 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4214 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4215 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4216 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4217 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4218 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4219 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4220 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4221 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4222 
4223 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4224 {
4225     return deposit64(b ^ a, 0, 15, a);
4226 }
4227 
4228 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4229 {
4230     return deposit64(b ^ a, 0, 31, a);
4231 }
4232 
4233 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4234 {
4235     return deposit64(b ^ a, 0, 63, a);
4236 }
4237 
4238 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4239 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4240 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4241 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4242 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4243 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4244 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4245 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4246 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4247 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4248 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4249 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4250 
4251 /* Vector Floating-Point Compare Instructions */
4252 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4253 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4254                   CPURISCVState *env, uint32_t desc)          \
4255 {                                                             \
4256     uint32_t vm = vext_vm(desc);                              \
4257     uint32_t vl = env->vl;                                    \
4258     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4259     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4260     uint32_t vma = vext_vma(desc);                            \
4261     uint32_t i;                                               \
4262                                                               \
4263     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4264                                                               \
4265     for (i = env->vstart; i < vl; i++) {                      \
4266         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4267         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4268         if (!vm && !vext_elem_mask(v0, i)) {                  \
4269             /* set masked-off elements to 1s */               \
4270             if (vma) {                                        \
4271                 vext_set_elem_mask(vd, i, 1);                 \
4272             }                                                 \
4273             continue;                                         \
4274         }                                                     \
4275         vext_set_elem_mask(vd, i,                             \
4276                            DO_OP(s2, s1, &env->fp_status));   \
4277     }                                                         \
4278     env->vstart = 0;                                          \
4279     /*
4280      * mask destination registers are always tail-agnostic
4281      * set tail elements to 1s
4282      */                                                       \
4283     if (vta_all_1s) {                                         \
4284         for (; i < total_elems; i++) {                        \
4285             vext_set_elem_mask(vd, i, 1);                     \
4286         }                                                     \
4287     }                                                         \
4288 }
4289 
4290 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4291 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4292 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4293 
4294 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4295 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4296                   CPURISCVState *env, uint32_t desc)                \
4297 {                                                                   \
4298     uint32_t vm = vext_vm(desc);                                    \
4299     uint32_t vl = env->vl;                                          \
4300     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4301     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4302     uint32_t vma = vext_vma(desc);                                  \
4303     uint32_t i;                                                     \
4304                                                                     \
4305     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4306                                                                     \
4307     for (i = env->vstart; i < vl; i++) {                            \
4308         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4309         if (!vm && !vext_elem_mask(v0, i)) {                        \
4310             /* set masked-off elements to 1s */                     \
4311             if (vma) {                                              \
4312                 vext_set_elem_mask(vd, i, 1);                       \
4313             }                                                       \
4314             continue;                                               \
4315         }                                                           \
4316         vext_set_elem_mask(vd, i,                                   \
4317                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4318     }                                                               \
4319     env->vstart = 0;                                                \
4320     /*
4321      * mask destination registers are always tail-agnostic
4322      * set tail elements to 1s
4323      */                                                             \
4324     if (vta_all_1s) {                                               \
4325         for (; i < total_elems; i++) {                              \
4326             vext_set_elem_mask(vd, i, 1);                           \
4327         }                                                           \
4328     }                                                               \
4329 }
4330 
4331 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4332 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4333 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4334 
4335 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4336 {
4337     FloatRelation compare = float16_compare_quiet(a, b, s);
4338     return compare != float_relation_equal;
4339 }
4340 
4341 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4342 {
4343     FloatRelation compare = float32_compare_quiet(a, b, s);
4344     return compare != float_relation_equal;
4345 }
4346 
4347 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4348 {
4349     FloatRelation compare = float64_compare_quiet(a, b, s);
4350     return compare != float_relation_equal;
4351 }
4352 
4353 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4354 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4355 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4356 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4357 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4358 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4359 
4360 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4361 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4362 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4363 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4364 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4365 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4366 
4367 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4368 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4369 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4370 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4371 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4372 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4373 
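/*
 * vmfeq/vmfne above use the quiet comparison predicates, so only a
 * signaling NaN operand raises the invalid flag.  vmflt/vmfle and the
 * vmfgt/vmfge helpers below use the signaling float*_lt/float*_le and
 * float*_compare, so any NaN operand raises invalid, as required for
 * the ordered comparisons.
 */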
4374 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4375 {
4376     FloatRelation compare = float16_compare(a, b, s);
4377     return compare == float_relation_greater;
4378 }
4379 
4380 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4381 {
4382     FloatRelation compare = float32_compare(a, b, s);
4383     return compare == float_relation_greater;
4384 }
4385 
4386 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4387 {
4388     FloatRelation compare = float64_compare(a, b, s);
4389     return compare == float_relation_greater;
4390 }
4391 
4392 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4393 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4394 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4395 
4396 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4397 {
4398     FloatRelation compare = float16_compare(a, b, s);
4399     return compare == float_relation_greater ||
4400            compare == float_relation_equal;
4401 }
4402 
4403 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4404 {
4405     FloatRelation compare = float32_compare(a, b, s);
4406     return compare == float_relation_greater ||
4407            compare == float_relation_equal;
4408 }
4409 
4410 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4411 {
4412     FloatRelation compare = float64_compare(a, b, s);
4413     return compare == float_relation_greater ||
4414            compare == float_relation_equal;
4415 }
4416 
4417 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4418 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4419 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4420 
4421 /* Vector Floating-Point Classify Instruction */
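/*
 * The result uses the standard RISC-V FCLASS encoding, one bit per
 * class:
 *   bit 0: -inf         bit 5: +subnormal
 *   bit 1: -normal      bit 6: +normal
 *   bit 2: -subnormal   bit 7: +inf
 *   bit 3: -0           bit 8: signaling NaN
 *   bit 4: +0           bit 9: quiet NaN
 */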
4422 target_ulong fclass_h(uint64_t frs1)
4423 {
4424     float16 f = frs1;
4425     bool sign = float16_is_neg(f);
4426 
4427     if (float16_is_infinity(f)) {
4428         return sign ? 1 << 0 : 1 << 7;
4429     } else if (float16_is_zero(f)) {
4430         return sign ? 1 << 3 : 1 << 4;
4431     } else if (float16_is_zero_or_denormal(f)) {
4432         return sign ? 1 << 2 : 1 << 5;
4433     } else if (float16_is_any_nan(f)) {
4434         float_status s = { }; /* for snan_bit_is_one */
4435         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4436     } else {
4437         return sign ? 1 << 1 : 1 << 6;
4438     }
4439 }
4440 
4441 target_ulong fclass_s(uint64_t frs1)
4442 {
4443     float32 f = frs1;
4444     bool sign = float32_is_neg(f);
4445 
4446     if (float32_is_infinity(f)) {
4447         return sign ? 1 << 0 : 1 << 7;
4448     } else if (float32_is_zero(f)) {
4449         return sign ? 1 << 3 : 1 << 4;
4450     } else if (float32_is_zero_or_denormal(f)) {
4451         return sign ? 1 << 2 : 1 << 5;
4452     } else if (float32_is_any_nan(f)) {
4453         float_status s = { }; /* for snan_bit_is_one */
4454         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4455     } else {
4456         return sign ? 1 << 1 : 1 << 6;
4457     }
4458 }
4459 
4460 target_ulong fclass_d(uint64_t frs1)
4461 {
4462     float64 f = frs1;
4463     bool sign = float64_is_neg(f);
4464 
4465     if (float64_is_infinity(f)) {
4466         return sign ? 1 << 0 : 1 << 7;
4467     } else if (float64_is_zero(f)) {
4468         return sign ? 1 << 3 : 1 << 4;
4469     } else if (float64_is_zero_or_denormal(f)) {
4470         return sign ? 1 << 2 : 1 << 5;
4471     } else if (float64_is_any_nan(f)) {
4472         float_status s = { }; /* for snan_bit_is_one */
4473         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4474     } else {
4475         return sign ? 1 << 1 : 1 << 6;
4476     }
4477 }
4478 
4479 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4480 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4481 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4482 GEN_VEXT_V(vfclass_v_h, 2)
4483 GEN_VEXT_V(vfclass_v_w, 4)
4484 GEN_VEXT_V(vfclass_v_d, 8)
4485 
4486 /* Vector Floating-Point Merge Instruction */
4487 
4488 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4489 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4490                   CPURISCVState *env, uint32_t desc)          \
4491 {                                                             \
4492     uint32_t vm = vext_vm(desc);                              \
4493     uint32_t vl = env->vl;                                    \
4494     uint32_t esz = sizeof(ETYPE);                             \
4495     uint32_t total_elems =                                    \
4496         vext_get_total_elems(env, desc, esz);                 \
4497     uint32_t vta = vext_vta(desc);                            \
4498     uint32_t i;                                               \
4499                                                               \
4500     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4501                                                               \
4502     for (i = env->vstart; i < vl; i++) {                      \
4503         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4504         *((ETYPE *)vd + H(i)) =                               \
4505             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4506     }                                                         \
4507     env->vstart = 0;                                          \
4508     /* set tail elements to 1s */                             \
4509     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4510 }
4511 
4512 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4513 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4514 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4515 
4516 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4517 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4518 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4519 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4520 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4521 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4522 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4523 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4524 
4525 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4526 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4527 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4528 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4529 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4530 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4531 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4532 
4533 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4534 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4535 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4536 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4537 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4538 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4539 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4540 
4541 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4542 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4543 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4544 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4545 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4546 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4547 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4548 
4549 /* Widening Floating-Point/Integer Type-Convert Instructions */
4550 /* (TD, T2, TX2) */
4551 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4552 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4553 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4554 /*
4555  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4556  */
4557 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4558 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4559 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4560 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4561 
4562 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4563 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4564 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4565 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4566 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4567 
4568 /*
4569  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4570  */
4571 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4572 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4573 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4574 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4575 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4576 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4577 
4578 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4579 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4580 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4581 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4582 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4583 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4584 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4585 
4586 /*
4587  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4588  */
4589 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4590 {
4591     return float16_to_float32(a, true, s);
4592 }
4593 
4594 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4595 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4596 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4597 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4598 
4599 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4600 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4601 
4602 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4603 /* (TD, T2, TX2) */
4604 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4605 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4606 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4607 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4608 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4609 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4610 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4611 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4612 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4613 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4614 
4615 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4616 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4617 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4618 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4619 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4620 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4621 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4622 
4623 /*
4624  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4625  */
4626 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4627 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4628 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4629 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4630 
4631 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4632 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4633 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4634 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4635 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4636 
4637 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4638 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4639 {
4640     return float32_to_float16(a, true, s);
4641 }
4642 
4643 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4644 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4645 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4646 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4647 
4648 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4649 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4650 
4651 /*
4652  * Vector Reduction Operations
4653  */
4654 /* Vector Single-Width Integer Reduction Instructions */
4655 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4656 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4657                   void *vs2, CPURISCVState *env,          \
4658                   uint32_t desc)                          \
4659 {                                                         \
4660     uint32_t vm = vext_vm(desc);                          \
4661     uint32_t vl = env->vl;                                \
4662     uint32_t esz = sizeof(TD);                            \
4663     uint32_t vlenb = simd_maxsz(desc);                    \
4664     uint32_t vta = vext_vta(desc);                        \
4665     uint32_t i;                                           \
4666     TD s1 =  *((TD *)vs1 + HD(0));                        \
4667                                                           \
4668     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4669                                                           \
4670     for (i = env->vstart; i < vl; i++) {                  \
4671         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4672         if (!vm && !vext_elem_mask(v0, i)) {              \
4673             continue;                                     \
4674         }                                                 \
4675         s1 = OP(s1, (TD)s2);                              \
4676     }                                                     \
4677     if (vl > 0) {                                         \
4678         *((TD *)vd + HD(0)) = s1;                         \
4679     }                                                     \
4680     env->vstart = 0;                                      \
4681     /* set tail elements to 1s */                         \
4682     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4683 }
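/*
 * For example, an unmasked vredsum.vs with vl = 4, vs1[0] = 10 and
 * vs2 = {1, 2, 3, 4} leaves vd[0] = 10 + 1 + 2 + 3 + 4 = 20.  Inactive
 * (masked-off) elements are skipped, and vd is left untouched when
 * vl == 0.
 */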
4684 
4685 /* vd[0] = sum(vs1[0], vs2[*]) */
4686 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4687 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4688 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4689 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4690 
4691 /* vd[0] = maxu(vs1[0], vs2[*]) */
4692 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4693 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4694 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4695 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4696 
4697 /* vd[0] = max(vs1[0], vs2[*]) */
4698 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4699 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4700 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4701 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4702 
4703 /* vd[0] = minu(vs1[0], vs2[*]) */
4704 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4705 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4706 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4707 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4708 
4709 /* vd[0] = min(vs1[0], vs2[*]) */
4710 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4711 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4712 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4713 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4714 
4715 /* vd[0] = and(vs1[0], vs2[*]) */
4716 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4717 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4718 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4719 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4720 
4721 /* vd[0] = or(vs1[0], vs2[*]) */
4722 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4723 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4724 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4725 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4726 
4727 /* vd[0] = xor(vs1[0], vs2[*]) */
4728 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4729 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4730 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4731 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4732 
4733 /* Vector Widening Integer Reduction Instructions */
4734 /* Signed sum reduction into double-width accumulator */
4735 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4736 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4737 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4738 
4739 /* Unsigned sum reduction into double-width accumulator */
4740 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4741 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4742 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4743 
4744 /* Vector Single-Width Floating-Point Reduction Instructions */
4745 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4746 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4747                   void *vs2, CPURISCVState *env,           \
4748                   uint32_t desc)                           \
4749 {                                                          \
4750     uint32_t vm = vext_vm(desc);                           \
4751     uint32_t vl = env->vl;                                 \
4752     uint32_t esz = sizeof(TD);                             \
4753     uint32_t vlenb = simd_maxsz(desc);                     \
4754     uint32_t vta = vext_vta(desc);                         \
4755     uint32_t i;                                            \
4756     TD s1 =  *((TD *)vs1 + HD(0));                         \
4757                                                            \
4758     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4759                                                            \
4760     for (i = env->vstart; i < vl; i++) {                   \
4761         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4762         if (!vm && !vext_elem_mask(v0, i)) {               \
4763             continue;                                      \
4764         }                                                  \
4765         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4766     }                                                      \
4767     if (vl > 0) {                                          \
4768         *((TD *)vd + HD(0)) = s1;                          \
4769     }                                                      \
4770     env->vstart = 0;                                       \
4771     /* set tail elements to 1s */                          \
4772     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4773 }
4774 
4775 /* Unordered sum */
4776 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4777 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4778 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4779 
4780 /* Ordered sum */
4781 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4782 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4783 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
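/*
 * vfredusum and vfredosum share the same helper, so the "unordered"
 * sum is also evaluated in strict element order here; this is one of
 * the reduction orders permitted for the unordered form.
 */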
4784 
4785 /* Maximum value */
4786 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4787               float16_maximum_number)
4788 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4789               float32_maximum_number)
4790 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4791               float64_maximum_number)
4792 
4793 /* Minimum value */
4794 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4795               float16_minimum_number)
4796 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4797               float32_minimum_number)
4798 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4799               float64_minimum_number)
4800 
4801 /* Vector Widening Floating-Point Add Instructions */
4802 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4803 {
4804     return float32_add(a, float16_to_float32(b, true, s), s);
4805 }
4806 
4807 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4808 {
4809     return float64_add(a, float32_to_float64(b, s), s);
4810 }
4811 
4812 /* Vector Widening Floating-Point Reduction Instructions */
4813 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4814 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4815 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4816 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4817 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4818 
4819 /*
4820  * Vector Mask Operations
4821  */
4822 /* Vector Mask-Register Logical Instructions */
4823 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4824 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4825                   void *vs2, CPURISCVState *env,          \
4826                   uint32_t desc)                          \
4827 {                                                         \
4828     uint32_t vl = env->vl;                                \
4829     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4830     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4831     uint32_t i;                                           \
4832     int a, b;                                             \
4833                                                           \
4834     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4835                                                           \
4836     for (i = env->vstart; i < vl; i++) {                  \
4837         a = vext_elem_mask(vs1, i);                       \
4838         b = vext_elem_mask(vs2, i);                       \
4839         vext_set_elem_mask(vd, i, OP(b, a));              \
4840     }                                                     \
4841     env->vstart = 0;                                      \
4842     /*
4843      * mask destination registers are always tail-agnostic
4844      * set tail elements to 1s
4845      */                                                   \
4846     if (vta_all_1s) {                                     \
4847         for (; i < total_elems; i++) {                    \
4848             vext_set_elem_mask(vd, i, 1);                 \
4849         }                                                 \
4850     }                                                     \
4851 }
4852 
4853 #define DO_NAND(N, M)  (!(N & M))
4854 #define DO_ANDNOT(N, M)  (N & !M)
4855 #define DO_NOR(N, M)  (!(N | M))
4856 #define DO_ORNOT(N, M)  (N | !M)
4857 #define DO_XNOR(N, M)  (!(N ^ M))
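/*
 * The operands are single mask bits (0 or 1) obtained from
 * vext_elem_mask(), so the logical negation in DO_ANDNOT/DO_ORNOT is
 * equivalent to a bitwise complement of that one bit.
 */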
4858 
4859 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4860 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4861 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4862 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4863 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4864 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4865 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4866 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4867 
4868 /* Vector count population in mask vcpop */
4869 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4870                              uint32_t desc)
4871 {
4872     target_ulong cnt = 0;
4873     uint32_t vm = vext_vm(desc);
4874     uint32_t vl = env->vl;
4875     int i;
4876 
4877     for (i = env->vstart; i < vl; i++) {
4878         if (vm || vext_elem_mask(v0, i)) {
4879             if (vext_elem_mask(vs2, i)) {
4880                 cnt++;
4881             }
4882         }
4883     }
4884     env->vstart = 0;
4885     return cnt;
4886 }
4887 
4888 /* vfirst find-first-set mask bit */
4889 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4890                               uint32_t desc)
4891 {
4892     uint32_t vm = vext_vm(desc);
4893     uint32_t vl = env->vl;
4894     int i;
4895 
4896     for (i = env->vstart; i < vl; i++) {
4897         if (vm || vext_elem_mask(v0, i)) {
4898             if (vext_elem_mask(vs2, i)) {
4899                 return i;
4900             }
4901         }
4902     }
4903     env->vstart = 0;
4904     return -1LL;
4905 }
4906 
4907 enum set_mask_type {
4908     ONLY_FIRST = 1,
4909     INCLUDE_FIRST,
4910     BEFORE_FIRST,
4911 };
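/*
 * BEFORE_FIRST  -> vmsbf.m: active elements before the first set bit
 *                  in vs2 become 1, all others 0.
 * INCLUDE_FIRST -> vmsif.m: as above, but the element holding the
 *                  first set bit is also 1.
 * ONLY_FIRST    -> vmsof.m: only the element holding the first set
 *                  bit is 1.
 */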
4912 
4913 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4914                    uint32_t desc, enum set_mask_type type)
4915 {
4916     uint32_t vm = vext_vm(desc);
4917     uint32_t vl = env->vl;
4918     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4919     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4920     uint32_t vma = vext_vma(desc);
4921     int i;
4922     bool first_mask_bit = false;
4923 
4924     VSTART_CHECK_EARLY_EXIT(env, vl);
4925 
4926     for (i = env->vstart; i < vl; i++) {
4927         if (!vm && !vext_elem_mask(v0, i)) {
4928             /* set masked-off elements to 1s */
4929             if (vma) {
4930                 vext_set_elem_mask(vd, i, 1);
4931             }
4932             continue;
4933         }
4934         /* write a zero to all following active elements */
4935         if (first_mask_bit) {
4936             vext_set_elem_mask(vd, i, 0);
4937             continue;
4938         }
4939         if (vext_elem_mask(vs2, i)) {
4940             first_mask_bit = true;
4941             if (type == BEFORE_FIRST) {
4942                 vext_set_elem_mask(vd, i, 0);
4943             } else {
4944                 vext_set_elem_mask(vd, i, 1);
4945             }
4946         } else {
4947             if (type == ONLY_FIRST) {
4948                 vext_set_elem_mask(vd, i, 0);
4949             } else {
4950                 vext_set_elem_mask(vd, i, 1);
4951             }
4952         }
4953     }
4954     env->vstart = 0;
4955     /*
4956      * mask destination registers are always tail-agnostic
4957      * set tail elements to 1s
4958      */
4959     if (vta_all_1s) {
4960         for (; i < total_elems; i++) {
4961             vext_set_elem_mask(vd, i, 1);
4962         }
4963     }
4964 }
4965 
4966 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4967                      uint32_t desc)
4968 {
4969     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4970 }
4971 
4972 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4973                      uint32_t desc)
4974 {
4975     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4976 }
4977 
4978 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4979                      uint32_t desc)
4980 {
4981     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4982 }
4983 
4984 /* Vector Iota Instruction */
4985 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4986 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4987                   uint32_t desc)                                          \
4988 {                                                                         \
4989     uint32_t vm = vext_vm(desc);                                          \
4990     uint32_t vl = env->vl;                                                \
4991     uint32_t esz = sizeof(ETYPE);                                         \
4992     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4993     uint32_t vta = vext_vta(desc);                                        \
4994     uint32_t vma = vext_vma(desc);                                        \
4995     uint32_t sum = 0;                                                     \
4996     int i;                                                                \
4997                                                                           \
4998     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
4999                                                                           \
5000     for (i = env->vstart; i < vl; i++) {                                  \
5001         if (!vm && !vext_elem_mask(v0, i)) {                              \
5002             /* set masked-off elements to 1s */                           \
5003             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5004             continue;                                                     \
5005         }                                                                 \
5006         *((ETYPE *)vd + H(i)) = sum;                                      \
5007         if (vext_elem_mask(vs2, i)) {                                     \
5008             sum++;                                                        \
5009         }                                                                 \
5010     }                                                                     \
5011     env->vstart = 0;                                                      \
5012     /* set tail elements to 1s */                                         \
5013     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5014 }
5015 
5016 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5017 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5018 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5019 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
5020 
5021 /* Vector Element Index Instruction */
5022 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5023 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5024 {                                                                         \
5025     uint32_t vm = vext_vm(desc);                                          \
5026     uint32_t vl = env->vl;                                                \
5027     uint32_t esz = sizeof(ETYPE);                                         \
5028     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5029     uint32_t vta = vext_vta(desc);                                        \
5030     uint32_t vma = vext_vma(desc);                                        \
5031     int i;                                                                \
5032                                                                           \
5033     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5034                                                                           \
5035     for (i = env->vstart; i < vl; i++) {                                  \
5036         if (!vm && !vext_elem_mask(v0, i)) {                              \
5037             /* set masked-off elements to 1s */                           \
5038             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5039             continue;                                                     \
5040         }                                                                 \
5041         *((ETYPE *)vd + H(i)) = i;                                        \
5042     }                                                                     \
5043     env->vstart = 0;                                                      \
5044     /* set tail elements to 1s */                                         \
5045     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5046 }
5047 
5048 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5049 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5050 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5051 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5052 
5053 /*
5054  * Vector Permutation Instructions
5055  */
5056 
5057 /* Vector Slide Instructions */
5058 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5059 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5060                   CPURISCVState *env, uint32_t desc)                      \
5061 {                                                                         \
5062     uint32_t vm = vext_vm(desc);                                          \
5063     uint32_t vl = env->vl;                                                \
5064     uint32_t esz = sizeof(ETYPE);                                         \
5065     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5066     uint32_t vta = vext_vta(desc);                                        \
5067     uint32_t vma = vext_vma(desc);                                        \
5068     target_ulong offset = s1, i_min, i;                                   \
5069                                                                           \
5070     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5071                                                                           \
5072     i_min = MAX(env->vstart, offset);                                     \
5073     for (i = i_min; i < vl; i++) {                                        \
5074         if (!vm && !vext_elem_mask(v0, i)) {                              \
5075             /* set masked-off elements to 1s */                           \
5076             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5077             continue;                                                     \
5078         }                                                                 \
5079         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5080     }                                                                     \
5081     env->vstart = 0;                                                      \
5082     /* set tail elements to 1s */                                         \
5083     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5084 }
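/*
 * For example, an unmasked vslideup.vx with rs1 = 2 and vl = 5 writes
 * vd[2..4] = vs2[0..2]; destination elements below the offset (vd[0]
 * and vd[1]) are left unchanged, since the loop starts at
 * MAX(vstart, offset).
 */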
5085 
5086 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5087 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5088 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5089 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5090 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5091 
5092 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5093 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5094                   CPURISCVState *env, uint32_t desc)                      \
5095 {                                                                         \
5096     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5097     uint32_t vm = vext_vm(desc);                                          \
5098     uint32_t vl = env->vl;                                                \
5099     uint32_t esz = sizeof(ETYPE);                                         \
5100     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5101     uint32_t vta = vext_vta(desc);                                        \
5102     uint32_t vma = vext_vma(desc);                                        \
5103     target_ulong i_max, i_min, i;                                         \
5104                                                                           \
5105     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5106                                                                           \
5107     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5108     i_max = MAX(i_min, env->vstart);                                      \
5109     for (i = env->vstart; i < i_max; ++i) {                               \
5110         if (!vm && !vext_elem_mask(v0, i)) {                              \
5111             /* set masked-off elements to 1s */                           \
5112             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5113             continue;                                                     \
5114         }                                                                 \
5115         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5116     }                                                                     \
5117                                                                           \
5118     for (i = i_max; i < vl; ++i) {                                        \
5119         if (vm || vext_elem_mask(v0, i)) {                                \
5120             *((ETYPE *)vd + H(i)) = 0;                                    \
5121         }                                                                 \
5122     }                                                                     \
5123                                                                           \
5124     env->vstart = 0;                                                      \
5125     /* set tail elements to 1s */                                         \
5126     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5127 }
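/*
 * The first loop copies vs2[i + s1] for destination indices whose
 * source index i + s1 is still below vlmax; the second loop zeroes
 * the remaining active elements, whose source index would be
 * >= vlmax.
 */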
5128 
5129 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5130 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5131 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5132 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5133 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5134 
5135 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5136 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5137                                  void *vs2, CPURISCVState *env,             \
5138                                  uint32_t desc)                             \
5139 {                                                                           \
5140     typedef uint##BITWIDTH##_t ETYPE;                                       \
5141     uint32_t vm = vext_vm(desc);                                            \
5142     uint32_t vl = env->vl;                                                  \
5143     uint32_t esz = sizeof(ETYPE);                                           \
5144     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5145     uint32_t vta = vext_vta(desc);                                          \
5146     uint32_t vma = vext_vma(desc);                                          \
5147     uint32_t i;                                                             \
5148                                                                             \
5149     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5150                                                                             \
5151     for (i = env->vstart; i < vl; i++) {                                    \
5152         if (!vm && !vext_elem_mask(v0, i)) {                                \
5153             /* set masked-off elements to 1s */                             \
5154             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5155             continue;                                                       \
5156         }                                                                   \
5157         if (i == 0) {                                                       \
5158             *((ETYPE *)vd + H(i)) = s1;                                     \
5159         } else {                                                            \
5160             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5161         }                                                                   \
5162     }                                                                       \
5163     env->vstart = 0;                                                        \
5164     /* set tail elements to 1s */                                           \
5165     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5166 }
5167 
5168 GEN_VEXT_VSLIDE1UP(8,  H1)
5169 GEN_VEXT_VSLIDE1UP(16, H2)
5170 GEN_VEXT_VSLIDE1UP(32, H4)
5171 GEN_VEXT_VSLIDE1UP(64, H8)
5172 
5173 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5174 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5175                   CPURISCVState *env, uint32_t desc)              \
5176 {                                                                 \
5177     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5178 }
5179 
5180 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
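/*
 * For example, with vl = 4 the active elements of the result are
 * vd = { x[rs1], vs2[0], vs2[1], vs2[2] }.
 */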
5181 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5182 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5183 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5184 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5185 
5186 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5187 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5188                                    void *vs2, CPURISCVState *env,             \
5189                                    uint32_t desc)                             \
5190 {                                                                             \
5191     typedef uint##BITWIDTH##_t ETYPE;                                         \
5192     uint32_t vm = vext_vm(desc);                                              \
5193     uint32_t vl = env->vl;                                                    \
5194     uint32_t esz = sizeof(ETYPE);                                             \
5195     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5196     uint32_t vta = vext_vta(desc);                                            \
5197     uint32_t vma = vext_vma(desc);                                            \
5198     uint32_t i;                                                               \
5199                                                                               \
5200     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5201                                                                               \
5202     for (i = env->vstart; i < vl; i++) {                                      \
5203         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5204             /* set masked-off elements to 1s */                               \
5205             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5206             continue;                                                         \
5207         }                                                                     \
5208         if (i == vl - 1) {                                                    \
5209             *((ETYPE *)vd + H(i)) = s1;                                       \
5210         } else {                                                              \
5211             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5212         }                                                                     \
5213     }                                                                         \
5214     env->vstart = 0;                                                          \
5215     /* set tail elements to 1s */                                             \
5216     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5217 }
5218 
5219 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5220 GEN_VEXT_VSLIDE1DOWN(16, H2)
5221 GEN_VEXT_VSLIDE1DOWN(32, H4)
5222 GEN_VEXT_VSLIDE1DOWN(64, H8)
5223 
5224 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5225 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5226                   CPURISCVState *env, uint32_t desc)              \
5227 {                                                                 \
5228     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5229 }
5230 
5231 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
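/*
 * For example, with vl = 4 the active elements of the result are
 * vd = { vs2[1], vs2[2], vs2[3], x[rs1] }; only element vl - 1 receives
 * the scalar.
 */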
5232 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5233 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5234 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5235 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5236 
5237 /* Vector Floating-Point Slide Instructions */
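/*
 * The floating-point slide1 variants only move bit patterns, so they
 * simply forward the raw 64-bit encoding of the scalar operand to the
 * integer vslide1up/vslide1down helpers; no FP arithmetic or rounding
 * takes place.
 */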
5238 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5239 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5240                   CPURISCVState *env, uint32_t desc)          \
5241 {                                                             \
5242     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5243 }
5244 
5245 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5246 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5247 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5248 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5249 
5250 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5251 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5252                   CPURISCVState *env, uint32_t desc)          \
5253 {                                                             \
5254     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5255 }
5256 
5257 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5258 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5259 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5260 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5261 
5262 /* Vector Register Gather Instructions */
5263 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5264 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5265                   CPURISCVState *env, uint32_t desc)                      \
5266 {                                                                         \
5267     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5268     uint32_t vm = vext_vm(desc);                                          \
5269     uint32_t vl = env->vl;                                                \
5270     uint32_t esz = sizeof(TS2);                                           \
5271     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5272     uint32_t vta = vext_vta(desc);                                        \
5273     uint32_t vma = vext_vma(desc);                                        \
5274     uint64_t index;                                                       \
5275     uint32_t i;                                                           \
5276                                                                           \
5277     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5278                                                                           \
5279     for (i = env->vstart; i < vl; i++) {                                  \
5280         if (!vm && !vext_elem_mask(v0, i)) {                              \
5281             /* set masked-off elements to 1s */                           \
5282             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5283             continue;                                                     \
5284         }                                                                 \
5285         index = *((TS1 *)vs1 + HS1(i));                                   \
5286         if (index >= vlmax) {                                             \
5287             *((TS2 *)vd + HS2(i)) = 0;                                    \
5288         } else {                                                          \
5289             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5290         }                                                                 \
5291     }                                                                     \
5292     env->vstart = 0;                                                      \
5293     /* set tail elements to 1s */                                         \
5294     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5295 }
5296 
5297 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5298 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5299 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5300 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5301 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5302 
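/*
 * vrgatherei16.vv always uses 16-bit index elements (EEW=16 for vs1)
 * regardless of SEW, which is why TS1/HS1 stay uint16_t/H2 below while
 * the data type TS2 tracks SEW.  The out-of-range check still uses the
 * VLMAX computed from the data element size.
 */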
5303 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5304 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5305 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5306 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5307 
5308 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5309 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5310                   CPURISCVState *env, uint32_t desc)                      \
5311 {                                                                         \
5312     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5313     uint32_t vm = vext_vm(desc);                                          \
5314     uint32_t vl = env->vl;                                                \
5315     uint32_t esz = sizeof(ETYPE);                                         \
5316     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5317     uint32_t vta = vext_vta(desc);                                        \
5318     uint32_t vma = vext_vma(desc);                                        \
5319     uint64_t index = s1;                                                  \
5320     uint32_t i;                                                           \
5321                                                                           \
5322     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5323                                                                           \
5324     for (i = env->vstart; i < vl; i++) {                                  \
5325         if (!vm && !vext_elem_mask(v0, i)) {                              \
5326             /* set masked-off elements to 1s */                           \
5327             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5328             continue;                                                     \
5329         }                                                                 \
5330         if (index >= vlmax) {                                             \
5331             *((ETYPE *)vd + H(i)) = 0;                                    \
5332         } else {                                                          \
5333             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5334         }                                                                 \
5335     }                                                                     \
5336     env->vstart = 0;                                                      \
5337     /* set tail elements to 1s */                                         \
5338     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5339 }
5340 
5341 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
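/*
 * The index x[rs1] is the same for every element, so vrgather.vx
 * effectively broadcasts vs2[x[rs1]] (or 0 when the index is >= VLMAX)
 * to all active elements of vd.
 */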
5342 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5343 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5344 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5345 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5346 
5347 /* Vector Compress Instruction */
5348 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5349 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5350                   CPURISCVState *env, uint32_t desc)                      \
5351 {                                                                         \
5352     uint32_t vl = env->vl;                                                \
5353     uint32_t esz = sizeof(ETYPE);                                         \
5354     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5355     uint32_t vta = vext_vta(desc);                                        \
5356     uint32_t num = 0, i;                                                  \
5357                                                                           \
5358     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5359                                                                           \
5360     for (i = env->vstart; i < vl; i++) {                                  \
5361         if (!vext_elem_mask(vs1, i)) {                                    \
5362             continue;                                                     \
5363         }                                                                 \
5364         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5365         num++;                                                            \
5366     }                                                                     \
5367     env->vstart = 0;                                                      \
5368     /* set tail elements to 1s */                                         \
5369     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5370 }
5371 
5372 /* Compress the elements of vs2 selected by the mask in vs1 into vd */
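/*
 * For example, with vl = 8 and mask bits set for elements 1, 4 and 5,
 * the result is vd[0] = vs2[1], vd[1] = vs2[4], vd[2] = vs2[5].  Only
 * num elements are produced, so the tail (and thus the vta treatment)
 * starts at element num rather than at vl.
 */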
5373 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5374 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5375 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5376 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5377 
5378 /* Vector Whole Register Move */
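/*
 * vmv<nr>r.v copies a whole register group as raw bytes: maxsz gives the
 * number of bytes in the group and vstart is counted in SEW-sized
 * elements, hence the conversion to a byte offset via sewb.
 */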
5379 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5380 {
5381     /* EEW = SEW */
5382     uint32_t maxsz = simd_maxsz(desc);
5383     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5384     uint32_t startb = env->vstart * sewb;
5385     uint32_t i = startb;
5386 
5387     if (startb >= maxsz) {
5388         env->vstart = 0;
5389         return;
5390     }
5391 
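    /*
     * Vector data is stored in host-endian 64-bit chunks, so on a
     * big-endian host a copy that starts in the middle of a chunk must
     * go through the H1() byte swizzle for the unaligned head.  Once the
     * offset is 64-bit aligned, the remaining bytes occupy the same host
     * offsets on either endianness and can be copied directly.
     */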
5392     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5393         uint32_t j = ROUND_UP(i, 8);
5394         memcpy((uint8_t *)vd + H1(j - 1),
5395                (uint8_t *)vs2 + H1(j - 1),
5396                j - i);
5397         i = j;
5398     }
5399 
5400     memcpy((uint8_t *)vd + i,
5401            (uint8_t *)vs2 + i,
5402            maxsz - i);
5403 
5404     env->vstart = 0;
5405 }
5406 
5407 /* Vector Integer Extension */
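/*
 * vzext.vf2/4/8 and vsext.vf2/4/8: each destination element of type
 * ETYPE is the zero- or sign-extended value of the corresponding source
 * element of the narrower type DTYPE, whose EEW is SEW/2, SEW/4 or
 * SEW/8 respectively.
 */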
5408 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5409 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5410                   CPURISCVState *env, uint32_t desc)             \
5411 {                                                                \
5412     uint32_t vl = env->vl;                                       \
5413     uint32_t vm = vext_vm(desc);                                 \
5414     uint32_t esz = sizeof(ETYPE);                                \
5415     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5416     uint32_t vta = vext_vta(desc);                               \
5417     uint32_t vma = vext_vma(desc);                               \
5418     uint32_t i;                                                  \
5419                                                                  \
5420     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5421                                                                  \
5422     for (i = env->vstart; i < vl; i++) {                         \
5423         if (!vm && !vext_elem_mask(v0, i)) {                     \
5424             /* set masked-off elements to 1s */                  \
5425             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5426             continue;                                            \
5427         }                                                        \
5428         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5429     }                                                            \
5430     env->vstart = 0;                                             \
5431     /* set tail elements to 1s */                                \
5432     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5433 }
5434 
5435 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5436 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5437 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5438 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5439 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5440 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5441 
5442 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5443 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5444 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5445 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5446 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5447 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5448