xref: /qemu/target/riscv/vector_helper.c (revision 897c68fb795cf03b89b6688a6f945d68a765c3e4)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg-gvec-desc.h"
30 #include "internals.h"
31 #include "vector_internals.h"
32 #include <math.h>
33 
34 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                             target_ulong s2)
36 {
37     int vlmax, vl;
38     RISCVCPU *cpu = env_archcpu(env);
39     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41     uint16_t sew = 8 << vsew;
42     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43     int xlen = riscv_cpu_xlen(env);
44     bool vill = (s2 >> (xlen - 1)) & 0x1;
45     target_ulong reserved = s2 &
46                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48     uint16_t vlen = cpu->cfg.vlenb << 3;
49     int8_t lmul;
50 
51     if (vlmul & 4) {
52         /*
53          * Fractional LMUL, check:
54          *
55          * VLEN * LMUL >= SEW
56          * VLEN >> (8 - lmul) >= sew
57          * (vlenb << 3) >> (8 - lmul) >= sew
58          */
59         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* only set vill bit. */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
79         vl = (s1 + 1) >> 1;
80     } else {
81         vl = vlmax;
82     }
83     env->vl = vl;
84     env->vtype = s2;
85     env->vstart = 0;
86     env->vill = 0;
87     return vl;
88 }
89 
90 /*
91  * Get the maximum number of elements can be operated.
92  *
93  * log2_esz: log2 of element size in bytes.
94  */
95 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
96 {
97     /*
98      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
99      * so vlen in bytes (vlenb) is encoded as maxsz.
100      */
101     uint32_t vlenb = simd_maxsz(desc);
102 
103     /* Return VLMAX */
104     int scale = vext_lmul(desc) - log2_esz;
105     return scale < 0 ? vlenb >> -scale : vlenb << scale;
106 }
107 
108 /*
109  * This function checks watchpoint before real load operation.
110  *
111  * In system mode, the TLB API probe_access is enough for watchpoint check.
112  * In user mode, there is no watchpoint support now.
113  *
114  * It will trigger an exception if there is no mapping in TLB
115  * and page table walk can't fill the TLB entry. Then the guest
116  * software can return here after process the exception or never return.
117  */
118 static void probe_pages(CPURISCVState *env, target_ulong addr,
119                         target_ulong len, uintptr_t ra,
120                         MMUAccessType access_type)
121 {
122     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
123     target_ulong curlen = MIN(pagelen, len);
124     int mmu_index = riscv_env_mmu_index(env, false);
125 
126     probe_access(env, adjust_addr(env, addr), curlen, access_type,
127                  mmu_index, ra);
128     if (len > curlen) {
129         addr += curlen;
130         curlen = len - curlen;
131         probe_access(env, adjust_addr(env, addr), curlen, access_type,
132                      mmu_index, ra);
133     }
134 }
135 
136 static inline void vext_set_elem_mask(void *v0, int index,
137                                       uint8_t value)
138 {
139     int idx = index / 64;
140     int pos = index % 64;
141     uint64_t old = ((uint64_t *)v0)[idx];
142     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
143 }
144 
145 /* elements operations for load and store */
146 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
147                                    uint32_t idx, void *vd, uintptr_t retaddr);
148 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
149 
150 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
151 static inline QEMU_ALWAYS_INLINE                            \
152 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
153                 uint32_t idx, void *vd, uintptr_t retaddr)  \
154 {                                                           \
155     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
156     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
157 }                                                           \
158                                                             \
159 static inline QEMU_ALWAYS_INLINE                            \
160 void NAME##_host(void *vd, uint32_t idx, void *host)        \
161 {                                                           \
162     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
163     *cur = (ETYPE)LDSUF##_p(host);                          \
164 }
165 
166 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
167 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
168 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
169 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
170 
171 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
172 static inline QEMU_ALWAYS_INLINE                            \
173 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
174                 uint32_t idx, void *vd, uintptr_t retaddr)  \
175 {                                                           \
176     ETYPE data = *((ETYPE *)vd + H(idx));                   \
177     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
178 }                                                           \
179                                                             \
180 static inline QEMU_ALWAYS_INLINE                            \
181 void NAME##_host(void *vd, uint32_t idx, void *host)        \
182 {                                                           \
183     ETYPE data = *((ETYPE *)vd + H(idx));                   \
184     STSUF##_p(host, data);                                  \
185 }
186 
187 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
188 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
189 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
190 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
191 
192 static inline QEMU_ALWAYS_INLINE void
193 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
194                        void *vd, uint32_t evl, target_ulong addr,
195                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
196                        bool is_load)
197 {
198     uint32_t i;
199     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
200         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
201     }
202 }
203 
204 static inline QEMU_ALWAYS_INLINE void
205 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
206                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
207                         uint32_t esz, bool is_load)
208 {
209 #if HOST_BIG_ENDIAN
210     for (; reg_start < evl; reg_start++, host += esz) {
211         ldst_host(vd, reg_start, host);
212     }
213 #else
214     if (esz == 1) {
215         uint32_t byte_offset = reg_start * esz;
216         uint32_t size = (evl - reg_start) * esz;
217 
218         if (is_load) {
219             memcpy(vd + byte_offset, host, size);
220         } else {
221             memcpy(host, vd + byte_offset, size);
222         }
223     } else {
224         for (; reg_start < evl; reg_start++, host += esz) {
225             ldst_host(vd, reg_start, host);
226         }
227     }
228 #endif
229 }
230 
231 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
232                                    uint32_t desc, uint32_t nf,
233                                    uint32_t esz, uint32_t max_elems)
234 {
235     uint32_t vta = vext_vta(desc);
236     int k;
237 
238     if (vta == 0) {
239         return;
240     }
241 
242     for (k = 0; k < nf; ++k) {
243         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
244                           (k * max_elems + max_elems) * esz);
245     }
246 }
247 
248 /*
249  * stride: access vector element from strided memory
250  */
251 static void
252 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
253                  CPURISCVState *env, uint32_t desc, uint32_t vm,
254                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
255                  uintptr_t ra)
256 {
257     uint32_t i, k;
258     uint32_t nf = vext_nf(desc);
259     uint32_t max_elems = vext_max_elems(desc, log2_esz);
260     uint32_t esz = 1 << log2_esz;
261     uint32_t vma = vext_vma(desc);
262 
263     VSTART_CHECK_EARLY_EXIT(env, env->vl);
264 
265     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
266         k = 0;
267         while (k < nf) {
268             if (!vm && !vext_elem_mask(v0, i)) {
269                 /* set masked-off elements to 1s */
270                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
271                                   (i + k * max_elems + 1) * esz);
272                 k++;
273                 continue;
274             }
275             target_ulong addr = base + stride * i + (k << log2_esz);
276             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
277             k++;
278         }
279     }
280     env->vstart = 0;
281 
282     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
283 }
284 
285 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
286 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
287                   target_ulong stride, CPURISCVState *env,              \
288                   uint32_t desc)                                        \
289 {                                                                       \
290     uint32_t vm = vext_vm(desc);                                        \
291     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
292                      ctzl(sizeof(ETYPE)), GETPC());                     \
293 }
294 
295 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
296 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
297 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
298 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
299 
300 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
301 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
302                   target_ulong stride, CPURISCVState *env,              \
303                   uint32_t desc)                                        \
304 {                                                                       \
305     uint32_t vm = vext_vm(desc);                                        \
306     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
307                      ctzl(sizeof(ETYPE)), GETPC());                     \
308 }
309 
310 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
311 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
312 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
313 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
314 
315 /*
316  * unit-stride: access elements stored contiguously in memory
317  */
318 
319 /* unmasked unit-stride load and store operation */
320 static inline QEMU_ALWAYS_INLINE void
321 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
322                   uint32_t elems, uint32_t nf, uint32_t max_elems,
323                   uint32_t log2_esz, bool is_load, int mmu_index,
324                   vext_ldst_elem_fn_tlb *ldst_tlb,
325                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
326 {
327     void *host;
328     int i, k, flags;
329     uint32_t esz = 1 << log2_esz;
330     uint32_t size = (elems * nf) << log2_esz;
331     uint32_t evl = env->vstart + elems;
332     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
333 
334     /* Check page permission/pmp/watchpoint/etc. */
335     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
336                                mmu_index, true, &host, ra);
337 
338     if (flags == 0) {
339         if (nf == 1) {
340             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
341                                       host, esz, is_load);
342         } else {
343             for (i = env->vstart; i < evl; ++i) {
344                 k = 0;
345                 while (k < nf) {
346                     ldst_host(vd, i + k * max_elems, host);
347                     host += esz;
348                     k++;
349                 }
350             }
351         }
352         env->vstart += elems;
353     } else {
354         if (nf == 1) {
355             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
356                                    ra, esz, is_load);
357         } else {
358             /* load bytes from guest memory */
359             for (i = env->vstart; i < evl; env->vstart = ++i) {
360                 k = 0;
361                 while (k < nf) {
362                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
363                              vd, ra);
364                     addr += esz;
365                     k++;
366                 }
367             }
368         }
369     }
370 }
371 
372 static inline QEMU_ALWAYS_INLINE void
373 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
374              vext_ldst_elem_fn_tlb *ldst_tlb,
375              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
376              uint32_t evl, uintptr_t ra, bool is_load)
377 {
378     uint32_t k;
379     target_ulong page_split, elems, addr;
380     uint32_t nf = vext_nf(desc);
381     uint32_t max_elems = vext_max_elems(desc, log2_esz);
382     uint32_t esz = 1 << log2_esz;
383     uint32_t msize = nf * esz;
384     int mmu_index = riscv_env_mmu_index(env, false);
385 
386     VSTART_CHECK_EARLY_EXIT(env, evl);
387 
388 #if defined(CONFIG_USER_ONLY)
389     /*
390      * For data sizes <= 6 bytes we get better performance by simply calling
391      * vext_continuous_ldst_tlb
392      */
393     if (nf == 1 && (evl << log2_esz) <= 6) {
394         addr = base + (env->vstart << log2_esz);
395         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
396                                  esz, is_load);
397 
398         env->vstart = 0;
399         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
400         return;
401     }
402 #endif
403 
404     /* Calculate the page range of first page */
405     addr = base + ((env->vstart * nf) << log2_esz);
406     page_split = -(addr | TARGET_PAGE_MASK);
407     /* Get number of elements */
408     elems = page_split / msize;
409     if (unlikely(env->vstart + elems >= evl)) {
410         elems = evl - env->vstart;
411     }
412 
413     /* Load/store elements in the first page */
414     if (likely(elems)) {
415         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
416                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
417     }
418 
419     /* Load/store elements in the second page */
420     if (unlikely(env->vstart < evl)) {
421         /* Cross page element */
422         if (unlikely(page_split % msize)) {
423             for (k = 0; k < nf; k++) {
424                 addr = base + ((env->vstart * nf + k) << log2_esz);
425                 ldst_tlb(env, adjust_addr(env, addr),
426                         env->vstart + k * max_elems, vd, ra);
427             }
428             env->vstart++;
429         }
430 
431         addr = base + ((env->vstart * nf) << log2_esz);
432         /* Get number of elements of second page */
433         elems = evl - env->vstart;
434 
435         /* Load/store elements in the second page */
436         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
437                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
438     }
439 
440     env->vstart = 0;
441     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
442 }
443 
444 /*
445  * masked unit-stride load and store operation will be a special case of
446  * stride, stride = NF * sizeof (ETYPE)
447  */
448 
449 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
450 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
451                          CPURISCVState *env, uint32_t desc)         \
452 {                                                                   \
453     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
454     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
455                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
456 }                                                                   \
457                                                                     \
458 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
459                   CPURISCVState *env, uint32_t desc)                \
460 {                                                                   \
461     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
462                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
463 }
464 
465 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
466 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
467 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
468 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
469 
470 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
471 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
472                          CPURISCVState *env, uint32_t desc)              \
473 {                                                                        \
474     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
475     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
476                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
477 }                                                                        \
478                                                                          \
479 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
480                   CPURISCVState *env, uint32_t desc)                     \
481 {                                                                        \
482     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
483                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
484 }
485 
486 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
487 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
488 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
489 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
490 
491 /*
492  * unit stride mask load and store, EEW = 1
493  */
494 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
495                     CPURISCVState *env, uint32_t desc)
496 {
497     /* evl = ceil(vl/8) */
498     uint8_t evl = (env->vl + 7) >> 3;
499     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
500                  0, evl, GETPC(), true);
501 }
502 
503 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
504                     CPURISCVState *env, uint32_t desc)
505 {
506     /* evl = ceil(vl/8) */
507     uint8_t evl = (env->vl + 7) >> 3;
508     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
509                  0, evl, GETPC(), false);
510 }
511 
512 /*
513  * index: access vector element from indexed memory
514  */
515 typedef target_ulong vext_get_index_addr(target_ulong base,
516         uint32_t idx, void *vs2);
517 
518 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
519 static target_ulong NAME(target_ulong base,            \
520                          uint32_t idx, void *vs2)      \
521 {                                                      \
522     return (base + *((ETYPE *)vs2 + H(idx)));          \
523 }
524 
525 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
526 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
527 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
528 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
529 
530 static inline void
531 vext_ldst_index(void *vd, void *v0, target_ulong base,
532                 void *vs2, CPURISCVState *env, uint32_t desc,
533                 vext_get_index_addr get_index_addr,
534                 vext_ldst_elem_fn_tlb *ldst_elem,
535                 uint32_t log2_esz, uintptr_t ra)
536 {
537     uint32_t i, k;
538     uint32_t nf = vext_nf(desc);
539     uint32_t vm = vext_vm(desc);
540     uint32_t max_elems = vext_max_elems(desc, log2_esz);
541     uint32_t esz = 1 << log2_esz;
542     uint32_t vma = vext_vma(desc);
543 
544     VSTART_CHECK_EARLY_EXIT(env, env->vl);
545 
546     /* load bytes from guest memory */
547     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
548         k = 0;
549         while (k < nf) {
550             if (!vm && !vext_elem_mask(v0, i)) {
551                 /* set masked-off elements to 1s */
552                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
553                                   (i + k * max_elems + 1) * esz);
554                 k++;
555                 continue;
556             }
557             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
558             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
559             k++;
560         }
561     }
562     env->vstart = 0;
563 
564     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
565 }
566 
567 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
568 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
569                   void *vs2, CPURISCVState *env, uint32_t desc)            \
570 {                                                                          \
571     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
572                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
573 }
574 
575 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
576 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
577 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
578 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
579 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
580 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
581 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
582 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
583 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
584 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
585 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
586 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
587 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
588 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
589 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
590 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
591 
592 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
593 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
594                   void *vs2, CPURISCVState *env, uint32_t desc)  \
595 {                                                                \
596     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
597                     STORE_FN, ctzl(sizeof(ETYPE)),               \
598                     GETPC());                                    \
599 }
600 
601 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
602 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
603 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
604 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
605 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
606 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
607 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
608 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
609 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
610 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
611 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
612 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
613 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
614 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
615 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
616 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
617 
618 /*
619  * unit-stride fault-only-fisrt load instructions
620  */
621 static inline void
622 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
623           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
624           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
625 {
626     uint32_t i, k, vl = 0;
627     uint32_t nf = vext_nf(desc);
628     uint32_t vm = vext_vm(desc);
629     uint32_t max_elems = vext_max_elems(desc, log2_esz);
630     uint32_t esz = 1 << log2_esz;
631     uint32_t msize = nf * esz;
632     uint32_t vma = vext_vma(desc);
633     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
634     int mmu_index = riscv_env_mmu_index(env, false);
635     int flags;
636     void *host;
637 
638     VSTART_CHECK_EARLY_EXIT(env, env->vl);
639 
640     addr = base + ((env->vstart * nf) << log2_esz);
641     page_split = -(addr | TARGET_PAGE_MASK);
642     /* Get number of elements */
643     elems = page_split / msize;
644     if (unlikely(env->vstart + elems >= env->vl)) {
645         elems = env->vl - env->vstart;
646     }
647 
648     /* Check page permission/pmp/watchpoint/etc. */
649     flags = probe_access_flags(env, adjust_addr(env, addr), elems * msize,
650                                MMU_DATA_LOAD, mmu_index, true, &host, ra);
651 
652     /* If we are crossing a page check also the second page. */
653     if (env->vl > elems) {
654         addr_probe = addr + (elems << log2_esz);
655         flags |= probe_access_flags(env, adjust_addr(env, addr_probe),
656                                     elems * msize, MMU_DATA_LOAD, mmu_index,
657                                     true, &host, ra);
658     }
659 
660     if (flags & ~TLB_WATCHPOINT) {
661         /* probe every access */
662         for (i = env->vstart; i < env->vl; i++) {
663             if (!vm && !vext_elem_mask(v0, i)) {
664                 continue;
665             }
666             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
667             if (i == 0) {
668                 /* Allow fault on first element. */
669                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD);
670             } else {
671                 remain = nf << log2_esz;
672                 while (remain > 0) {
673                     offset = -(addr_i | TARGET_PAGE_MASK);
674 
675                     /* Probe nonfault on subsequent elements. */
676                     flags = probe_access_flags(env, addr_i, offset,
677                                                MMU_DATA_LOAD, mmu_index, true,
678                                                &host, 0);
679 
680                     /*
681                      * Stop if invalid (unmapped) or mmio (transaction may
682                      * fail). Do not stop if watchpoint, as the spec says that
683                      * first-fault should continue to access the same
684                      * elements regardless of any watchpoint.
685                      */
686                     if (flags & ~TLB_WATCHPOINT) {
687                         vl = i;
688                         goto ProbeSuccess;
689                     }
690                     if (remain <= offset) {
691                         break;
692                     }
693                     remain -= offset;
694                     addr_i = adjust_addr(env, addr_i + offset);
695                 }
696             }
697         }
698     }
699 ProbeSuccess:
700     /* load bytes from guest memory */
701     if (vl != 0) {
702         env->vl = vl;
703     }
704 
705     if (env->vstart < env->vl) {
706         if (vm) {
707             /* Load/store elements in the first page */
708             if (likely(elems)) {
709                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
710                                   log2_esz, true, mmu_index, ldst_tlb,
711                                   ldst_host, ra);
712             }
713 
714             /* Load/store elements in the second page */
715             if (unlikely(env->vstart < env->vl)) {
716                 /* Cross page element */
717                 if (unlikely(page_split % msize)) {
718                     for (k = 0; k < nf; k++) {
719                         addr = base + ((env->vstart * nf + k) << log2_esz);
720                         ldst_tlb(env, adjust_addr(env, addr),
721                                  env->vstart + k * max_elems, vd, ra);
722                     }
723                     env->vstart++;
724                 }
725 
726                 addr = base + ((env->vstart * nf) << log2_esz);
727                 /* Get number of elements of second page */
728                 elems = env->vl - env->vstart;
729 
730                 /* Load/store elements in the second page */
731                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
732                                   log2_esz, true, mmu_index, ldst_tlb,
733                                   ldst_host, ra);
734             }
735         } else {
736             for (i = env->vstart; i < env->vl; i++) {
737                 k = 0;
738                 while (k < nf) {
739                     if (!vext_elem_mask(v0, i)) {
740                         /* set masked-off elements to 1s */
741                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
742                                           (i + k * max_elems + 1) * esz);
743                         k++;
744                         continue;
745                     }
746                     addr = base + ((i * nf + k) << log2_esz);
747                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
748                              vd, ra);
749                     k++;
750                 }
751             }
752         }
753     }
754     env->vstart = 0;
755 
756     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
757 }
758 
759 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
760 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
761                   CPURISCVState *env, uint32_t desc)            \
762 {                                                               \
763     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
764               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
765 }
766 
767 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
768 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
769 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
770 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
771 
772 #define DO_SWAP(N, M) (M)
773 #define DO_AND(N, M)  (N & M)
774 #define DO_XOR(N, M)  (N ^ M)
775 #define DO_OR(N, M)   (N | M)
776 #define DO_ADD(N, M)  (N + M)
777 
778 /* Signed min/max */
779 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
780 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
781 
782 /*
783  * load and store whole register instructions
784  */
785 static inline QEMU_ALWAYS_INLINE void
786 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
787                 vext_ldst_elem_fn_tlb *ldst_tlb,
788                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
789                 uintptr_t ra, bool is_load)
790 {
791     target_ulong page_split, elems, addr;
792     uint32_t nf = vext_nf(desc);
793     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
794     uint32_t max_elems = vlenb >> log2_esz;
795     uint32_t evl = nf * max_elems;
796     uint32_t esz = 1 << log2_esz;
797     int mmu_index = riscv_env_mmu_index(env, false);
798 
799     /* Calculate the page range of first page */
800     addr = base + (env->vstart << log2_esz);
801     page_split = -(addr | TARGET_PAGE_MASK);
802     /* Get number of elements */
803     elems = page_split / esz;
804     if (unlikely(env->vstart + elems >= evl)) {
805         elems = evl - env->vstart;
806     }
807 
808     /* Load/store elements in the first page */
809     if (likely(elems)) {
810         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
811                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
812     }
813 
814     /* Load/store elements in the second page */
815     if (unlikely(env->vstart < evl)) {
816         /* Cross page element */
817         if (unlikely(page_split % esz)) {
818             addr = base + (env->vstart << log2_esz);
819             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
820             env->vstart++;
821         }
822 
823         addr = base + (env->vstart << log2_esz);
824         /* Get number of elements of second page */
825         elems = evl - env->vstart;
826 
827         /* Load/store elements in the second page */
828         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
829                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
830     }
831 
832     env->vstart = 0;
833 }
834 
835 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
836 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
837                   uint32_t desc)                                    \
838 {                                                                   \
839     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
840                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
841 }
842 
843 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
844 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
845 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
846 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
847 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
848 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
849 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
850 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
851 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
852 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
853 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
854 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
855 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
856 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
857 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
858 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
859 
860 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
861 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
862                   uint32_t desc)                                        \
863 {                                                                       \
864     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
865                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
866 }
867 
868 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
869 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
870 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
871 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
872 
873 /*
874  * Vector Integer Arithmetic Instructions
875  */
876 
877 /* (TD, T1, T2, TX1, TX2) */
878 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
879 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
880 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
881 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
882 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
883 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
884 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
885 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
886 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
887 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
888 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
889 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
890 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
891 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
892 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
893 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
894 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
895 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
896 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
897 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
898 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
899 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
900 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
901 
902 #define DO_SUB(N, M) (N - M)
903 #define DO_RSUB(N, M) (M - N)
904 
905 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
906 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
907 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
908 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
909 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
910 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
911 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
912 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
913 
914 GEN_VEXT_VV(vadd_vv_b, 1)
915 GEN_VEXT_VV(vadd_vv_h, 2)
916 GEN_VEXT_VV(vadd_vv_w, 4)
917 GEN_VEXT_VV(vadd_vv_d, 8)
918 GEN_VEXT_VV(vsub_vv_b, 1)
919 GEN_VEXT_VV(vsub_vv_h, 2)
920 GEN_VEXT_VV(vsub_vv_w, 4)
921 GEN_VEXT_VV(vsub_vv_d, 8)
922 
923 
924 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
925 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
926 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
927 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
928 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
929 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
930 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
931 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
932 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
933 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
934 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
935 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
936 
937 GEN_VEXT_VX(vadd_vx_b, 1)
938 GEN_VEXT_VX(vadd_vx_h, 2)
939 GEN_VEXT_VX(vadd_vx_w, 4)
940 GEN_VEXT_VX(vadd_vx_d, 8)
941 GEN_VEXT_VX(vsub_vx_b, 1)
942 GEN_VEXT_VX(vsub_vx_h, 2)
943 GEN_VEXT_VX(vsub_vx_w, 4)
944 GEN_VEXT_VX(vsub_vx_d, 8)
945 GEN_VEXT_VX(vrsub_vx_b, 1)
946 GEN_VEXT_VX(vrsub_vx_h, 2)
947 GEN_VEXT_VX(vrsub_vx_w, 4)
948 GEN_VEXT_VX(vrsub_vx_d, 8)
949 
950 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
951 {
952     intptr_t oprsz = simd_oprsz(desc);
953     intptr_t i;
954 
955     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
956         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
957     }
958 }
959 
960 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
961 {
962     intptr_t oprsz = simd_oprsz(desc);
963     intptr_t i;
964 
965     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
966         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
967     }
968 }
969 
970 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
971 {
972     intptr_t oprsz = simd_oprsz(desc);
973     intptr_t i;
974 
975     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
976         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
977     }
978 }
979 
980 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
981 {
982     intptr_t oprsz = simd_oprsz(desc);
983     intptr_t i;
984 
985     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
986         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
987     }
988 }
989 
990 /* Vector Widening Integer Add/Subtract */
991 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
992 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
993 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
994 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
995 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
996 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
997 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
998 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
999 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1000 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1001 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1002 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1003 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1004 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1005 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1006 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1007 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1008 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1009 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1010 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1011 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1012 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1013 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1014 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1015 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1016 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1017 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1018 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1019 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1020 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1021 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1022 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1023 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1024 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1025 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1026 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1027 GEN_VEXT_VV(vwaddu_vv_b, 2)
1028 GEN_VEXT_VV(vwaddu_vv_h, 4)
1029 GEN_VEXT_VV(vwaddu_vv_w, 8)
1030 GEN_VEXT_VV(vwsubu_vv_b, 2)
1031 GEN_VEXT_VV(vwsubu_vv_h, 4)
1032 GEN_VEXT_VV(vwsubu_vv_w, 8)
1033 GEN_VEXT_VV(vwadd_vv_b, 2)
1034 GEN_VEXT_VV(vwadd_vv_h, 4)
1035 GEN_VEXT_VV(vwadd_vv_w, 8)
1036 GEN_VEXT_VV(vwsub_vv_b, 2)
1037 GEN_VEXT_VV(vwsub_vv_h, 4)
1038 GEN_VEXT_VV(vwsub_vv_w, 8)
1039 GEN_VEXT_VV(vwaddu_wv_b, 2)
1040 GEN_VEXT_VV(vwaddu_wv_h, 4)
1041 GEN_VEXT_VV(vwaddu_wv_w, 8)
1042 GEN_VEXT_VV(vwsubu_wv_b, 2)
1043 GEN_VEXT_VV(vwsubu_wv_h, 4)
1044 GEN_VEXT_VV(vwsubu_wv_w, 8)
1045 GEN_VEXT_VV(vwadd_wv_b, 2)
1046 GEN_VEXT_VV(vwadd_wv_h, 4)
1047 GEN_VEXT_VV(vwadd_wv_w, 8)
1048 GEN_VEXT_VV(vwsub_wv_b, 2)
1049 GEN_VEXT_VV(vwsub_wv_h, 4)
1050 GEN_VEXT_VV(vwsub_wv_w, 8)
1051 
1052 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1053 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1054 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1055 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1056 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1057 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1058 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1059 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1060 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1061 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1062 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1063 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1064 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1065 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1066 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1067 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1068 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1069 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1070 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1071 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1072 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1073 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1074 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1075 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1076 GEN_VEXT_VX(vwaddu_vx_b, 2)
1077 GEN_VEXT_VX(vwaddu_vx_h, 4)
1078 GEN_VEXT_VX(vwaddu_vx_w, 8)
1079 GEN_VEXT_VX(vwsubu_vx_b, 2)
1080 GEN_VEXT_VX(vwsubu_vx_h, 4)
1081 GEN_VEXT_VX(vwsubu_vx_w, 8)
1082 GEN_VEXT_VX(vwadd_vx_b, 2)
1083 GEN_VEXT_VX(vwadd_vx_h, 4)
1084 GEN_VEXT_VX(vwadd_vx_w, 8)
1085 GEN_VEXT_VX(vwsub_vx_b, 2)
1086 GEN_VEXT_VX(vwsub_vx_h, 4)
1087 GEN_VEXT_VX(vwsub_vx_w, 8)
1088 GEN_VEXT_VX(vwaddu_wx_b, 2)
1089 GEN_VEXT_VX(vwaddu_wx_h, 4)
1090 GEN_VEXT_VX(vwaddu_wx_w, 8)
1091 GEN_VEXT_VX(vwsubu_wx_b, 2)
1092 GEN_VEXT_VX(vwsubu_wx_h, 4)
1093 GEN_VEXT_VX(vwsubu_wx_w, 8)
1094 GEN_VEXT_VX(vwadd_wx_b, 2)
1095 GEN_VEXT_VX(vwadd_wx_h, 4)
1096 GEN_VEXT_VX(vwadd_wx_w, 8)
1097 GEN_VEXT_VX(vwsub_wx_b, 2)
1098 GEN_VEXT_VX(vwsub_wx_h, 4)
1099 GEN_VEXT_VX(vwsub_wx_w, 8)
1100 
1101 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1102 #define DO_VADC(N, M, C) (N + M + C)
1103 #define DO_VSBC(N, M, C) (N - M - C)
1104 
1105 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1106 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1107                   CPURISCVState *env, uint32_t desc)          \
1108 {                                                             \
1109     uint32_t vl = env->vl;                                    \
1110     uint32_t esz = sizeof(ETYPE);                             \
1111     uint32_t total_elems =                                    \
1112         vext_get_total_elems(env, desc, esz);                 \
1113     uint32_t vta = vext_vta(desc);                            \
1114     uint32_t i;                                               \
1115                                                               \
1116     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1117                                                               \
1118     for (i = env->vstart; i < vl; i++) {                      \
1119         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1120         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1121         ETYPE carry = vext_elem_mask(v0, i);                  \
1122                                                               \
1123         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1124     }                                                         \
1125     env->vstart = 0;                                          \
1126     /* set tail elements to 1s */                             \
1127     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1128 }
1129 
1130 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1131 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1132 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1133 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1134 
1135 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1136 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1137 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1138 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1139 
1140 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1141 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1142                   CPURISCVState *env, uint32_t desc)                     \
1143 {                                                                        \
1144     uint32_t vl = env->vl;                                               \
1145     uint32_t esz = sizeof(ETYPE);                                        \
1146     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1147     uint32_t vta = vext_vta(desc);                                       \
1148     uint32_t i;                                                          \
1149                                                                          \
1150     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1151                                                                          \
1152     for (i = env->vstart; i < vl; i++) {                                 \
1153         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1154         ETYPE carry = vext_elem_mask(v0, i);                             \
1155                                                                          \
1156         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1157     }                                                                    \
1158     env->vstart = 0;                                                     \
1159     /* set tail elements to 1s */                                        \
1160     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1161 }
1162 
1163 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1164 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1165 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1166 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1167 
1168 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1169 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1170 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1171 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1172 
1173 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1174                           (__typeof(N))(N + M) < N)
1175 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
1176 
1177 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1178 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1179                   CPURISCVState *env, uint32_t desc)          \
1180 {                                                             \
1181     uint32_t vl = env->vl;                                    \
1182     uint32_t vm = vext_vm(desc);                              \
1183     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1184     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1185     uint32_t i;                                               \
1186                                                               \
1187     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1188                                                               \
1189     for (i = env->vstart; i < vl; i++) {                      \
1190         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1191         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1192         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1193         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1194     }                                                         \
1195     env->vstart = 0;                                          \
1196     /*
1197      * mask destination register are always tail-agnostic
1198      * set tail elements to 1s
1199      */                                                       \
1200     if (vta_all_1s) {                                         \
1201         for (; i < total_elems; i++) {                        \
1202             vext_set_elem_mask(vd, i, 1);                     \
1203         }                                                     \
1204     }                                                         \
1205 }
1206 
1207 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1208 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1209 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1210 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1211 
1212 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1213 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1214 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1215 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1216 
1217 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1218 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1219                   void *vs2, CPURISCVState *env, uint32_t desc) \
1220 {                                                               \
1221     uint32_t vl = env->vl;                                      \
1222     uint32_t vm = vext_vm(desc);                                \
1223     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1224     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1225     uint32_t i;                                                 \
1226                                                                 \
1227     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1228                                                                 \
1229     for (i = env->vstart; i < vl; i++) {                        \
1230         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1231         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1232         vext_set_elem_mask(vd, i,                               \
1233                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1234     }                                                           \
1235     env->vstart = 0;                                            \
1236     /*
1237      * mask destination register are always tail-agnostic
1238      * set tail elements to 1s
1239      */                                                         \
1240     if (vta_all_1s) {                                           \
1241         for (; i < total_elems; i++) {                          \
1242             vext_set_elem_mask(vd, i, 1);                       \
1243         }                                                       \
1244     }                                                           \
1245 }
1246 
1247 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1248 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1249 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1250 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1251 
1252 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1253 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1254 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1255 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1256 
1257 /* Vector Bitwise Logical Instructions */
1258 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1259 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1260 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1261 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1262 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1263 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1264 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1265 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1266 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1267 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1268 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1269 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1270 GEN_VEXT_VV(vand_vv_b, 1)
1271 GEN_VEXT_VV(vand_vv_h, 2)
1272 GEN_VEXT_VV(vand_vv_w, 4)
1273 GEN_VEXT_VV(vand_vv_d, 8)
1274 GEN_VEXT_VV(vor_vv_b, 1)
1275 GEN_VEXT_VV(vor_vv_h, 2)
1276 GEN_VEXT_VV(vor_vv_w, 4)
1277 GEN_VEXT_VV(vor_vv_d, 8)
1278 GEN_VEXT_VV(vxor_vv_b, 1)
1279 GEN_VEXT_VV(vxor_vv_h, 2)
1280 GEN_VEXT_VV(vxor_vv_w, 4)
1281 GEN_VEXT_VV(vxor_vv_d, 8)
1282 
1283 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1284 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1285 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1286 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1287 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1288 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1289 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1290 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1291 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1292 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1293 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1294 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1295 GEN_VEXT_VX(vand_vx_b, 1)
1296 GEN_VEXT_VX(vand_vx_h, 2)
1297 GEN_VEXT_VX(vand_vx_w, 4)
1298 GEN_VEXT_VX(vand_vx_d, 8)
1299 GEN_VEXT_VX(vor_vx_b, 1)
1300 GEN_VEXT_VX(vor_vx_h, 2)
1301 GEN_VEXT_VX(vor_vx_w, 4)
1302 GEN_VEXT_VX(vor_vx_d, 8)
1303 GEN_VEXT_VX(vxor_vx_b, 1)
1304 GEN_VEXT_VX(vxor_vx_h, 2)
1305 GEN_VEXT_VX(vxor_vx_w, 4)
1306 GEN_VEXT_VX(vxor_vx_d, 8)
1307 
1308 /* Vector Single-Width Bit Shift Instructions */
1309 #define DO_SLL(N, M)  (N << (M))
1310 #define DO_SRL(N, M)  (N >> (M))
1311 
1312 /* generate the helpers for shift instructions with two vector operands */
1313 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1314 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1315                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1316 {                                                                         \
1317     uint32_t vm = vext_vm(desc);                                          \
1318     uint32_t vl = env->vl;                                                \
1319     uint32_t esz = sizeof(TS1);                                           \
1320     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1321     uint32_t vta = vext_vta(desc);                                        \
1322     uint32_t vma = vext_vma(desc);                                        \
1323     uint32_t i;                                                           \
1324                                                                           \
1325     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1326                                                                           \
1327     for (i = env->vstart; i < vl; i++) {                                  \
1328         if (!vm && !vext_elem_mask(v0, i)) {                              \
1329             /* set masked-off elements to 1s */                           \
1330             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1331             continue;                                                     \
1332         }                                                                 \
1333         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1334         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1335         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1336     }                                                                     \
1337     env->vstart = 0;                                                      \
1338     /* set tail elements to 1s */                                         \
1339     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1340 }
1341 
1342 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1343 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1344 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1345 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1346 
1347 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1348 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1351 
1352 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1353 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1354 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1355 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
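
/*
 * Illustrative sketch (not part of the generated helpers): the MASK argument
 * truncates the shift amount to log2(SEW) bits, matching the RVV rule that
 * only the low lg2(SEW) bits of the shift operand are used (the narrowing
 * variants further below use log2(2*SEW) bits).  For vsra the source type is
 * signed, so ">>" is an arithmetic shift on the compilers QEMU supports.
 */
static inline uint8_t example_vsll_lane_u8(uint8_t s2, uint8_t s1)
{
    return s2 << (s1 & 0x7);    /* a shift amount of 9 behaves like 1 */
}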
1356 
1357 /*
1358  * generate the helpers for shift instructions with one vector and one scalar
1359  */
1360 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1361 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1362                   void *vs2, CPURISCVState *env,            \
1363                   uint32_t desc)                            \
1364 {                                                           \
1365     uint32_t vm = vext_vm(desc);                            \
1366     uint32_t vl = env->vl;                                  \
1367     uint32_t esz = sizeof(TD);                              \
1368     uint32_t total_elems =                                  \
1369         vext_get_total_elems(env, desc, esz);               \
1370     uint32_t vta = vext_vta(desc);                          \
1371     uint32_t vma = vext_vma(desc);                          \
1372     uint32_t i;                                             \
1373                                                             \
1374     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1375                                                             \
1376     for (i = env->vstart; i < vl; i++) {                    \
1377         if (!vm && !vext_elem_mask(v0, i)) {                \
1378             /* set masked-off elements to 1s */             \
1379             vext_set_elems_1s(vd, vma, i * esz,             \
1380                               (i + 1) * esz);               \
1381             continue;                                       \
1382         }                                                   \
1383         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1384         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1385     }                                                       \
1386     env->vstart = 0;                                        \
1387     /* set tail elements to 1s */                           \
1388     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1389 }
1390 
1391 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1392 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1393 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1394 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1395 
1396 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1397 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1398 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1399 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1400 
1401 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1402 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1403 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1404 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1405 
1406 /* Vector Narrowing Integer Right Shift Instructions */
1407 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1408 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1409 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1410 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1411 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1412 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1413 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1414 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1415 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1416 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1417 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1418 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1419 
1420 /* Vector Integer Comparison Instructions */
1421 #define DO_MSEQ(N, M) (N == M)
1422 #define DO_MSNE(N, M) (N != M)
1423 #define DO_MSLT(N, M) (N < M)
1424 #define DO_MSLE(N, M) (N <= M)
1425 #define DO_MSGT(N, M) (N > M)
1426 
1427 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1428 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1429                   CPURISCVState *env, uint32_t desc)          \
1430 {                                                             \
1431     uint32_t vm = vext_vm(desc);                              \
1432     uint32_t vl = env->vl;                                    \
1433     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1434     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1435     uint32_t vma = vext_vma(desc);                            \
1436     uint32_t i;                                               \
1437                                                               \
1438     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1439                                                               \
1440     for (i = env->vstart; i < vl; i++) {                      \
1441         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1442         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1443         if (!vm && !vext_elem_mask(v0, i)) {                  \
1444             /* set masked-off elements to 1s */               \
1445             if (vma) {                                        \
1446                 vext_set_elem_mask(vd, i, 1);                 \
1447             }                                                 \
1448             continue;                                         \
1449         }                                                     \
1450         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1451     }                                                         \
1452     env->vstart = 0;                                          \
1453     /*
1454      * mask destination registers are always tail-agnostic
1455      * set tail elements to 1s
1456      */                                                       \
1457     if (vta_all_1s) {                                         \
1458         for (; i < total_elems; i++) {                        \
1459             vext_set_elem_mask(vd, i, 1);                     \
1460         }                                                     \
1461     }                                                         \
1462 }
1463 
1464 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1465 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1466 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1467 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1468 
1469 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1470 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1471 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1472 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1473 
1474 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1475 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1476 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1477 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1478 
1479 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1480 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1481 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1482 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1483 
1484 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1485 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1486 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1487 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1488 
1489 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1490 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1491 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1492 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1493 
1494 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1495 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1496                   CPURISCVState *env, uint32_t desc)                \
1497 {                                                                   \
1498     uint32_t vm = vext_vm(desc);                                    \
1499     uint32_t vl = env->vl;                                          \
1500     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1501     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1502     uint32_t vma = vext_vma(desc);                                  \
1503     uint32_t i;                                                     \
1504                                                                     \
1505     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1506                                                                     \
1507     for (i = env->vstart; i < vl; i++) {                            \
1508         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1509         if (!vm && !vext_elem_mask(v0, i)) {                        \
1510             /* set masked-off elements to 1s */                     \
1511             if (vma) {                                              \
1512                 vext_set_elem_mask(vd, i, 1);                       \
1513             }                                                       \
1514             continue;                                               \
1515         }                                                           \
1516         vext_set_elem_mask(vd, i,                                   \
1517                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1518     }                                                               \
1519     env->vstart = 0;                                                \
1520     /*
1521      * mask destination registers are always tail-agnostic
1522      * set tail elements to 1s
1523      */                                                             \
1524     if (vta_all_1s) {                                               \
1525         for (; i < total_elems; i++) {                              \
1526             vext_set_elem_mask(vd, i, 1);                           \
1527         }                                                           \
1528     }                                                               \
1529 }
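
/*
 * Illustrative note (hypothetical helper, not used by the code): the scalar
 * operand is narrowed to ETYPE before the comparison, so only its low SEW
 * bits participate.  With byte elements, a scalar of 0x1ff compares like
 * 0xff for the unsigned forms and like -1 for the signed forms.
 */
static inline bool example_vmseq_vx_b_lane(uint8_t s2, target_ulong s1)
{
    return s2 == (uint8_t)(target_long)s1;  /* s1 = 0x1ff matches s2 = 0xff */
}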
1530 
1531 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1532 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1533 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1534 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1535 
1536 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1537 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1538 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1539 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1540 
1541 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1542 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1543 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1544 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1545 
1546 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1547 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1548 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1549 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1550 
1551 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1552 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1553 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1554 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1555 
1556 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1557 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1558 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1559 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1560 
1561 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1562 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1563 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1564 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1565 
1566 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1567 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1568 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1569 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1570 
1571 /* Vector Integer Min/Max Instructions */
1572 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1573 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1574 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1575 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1576 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1577 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1578 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1579 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1580 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1581 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1582 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1583 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1584 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1585 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1586 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1587 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1588 GEN_VEXT_VV(vminu_vv_b, 1)
1589 GEN_VEXT_VV(vminu_vv_h, 2)
1590 GEN_VEXT_VV(vminu_vv_w, 4)
1591 GEN_VEXT_VV(vminu_vv_d, 8)
1592 GEN_VEXT_VV(vmin_vv_b, 1)
1593 GEN_VEXT_VV(vmin_vv_h, 2)
1594 GEN_VEXT_VV(vmin_vv_w, 4)
1595 GEN_VEXT_VV(vmin_vv_d, 8)
1596 GEN_VEXT_VV(vmaxu_vv_b, 1)
1597 GEN_VEXT_VV(vmaxu_vv_h, 2)
1598 GEN_VEXT_VV(vmaxu_vv_w, 4)
1599 GEN_VEXT_VV(vmaxu_vv_d, 8)
1600 GEN_VEXT_VV(vmax_vv_b, 1)
1601 GEN_VEXT_VV(vmax_vv_h, 2)
1602 GEN_VEXT_VV(vmax_vv_w, 4)
1603 GEN_VEXT_VV(vmax_vv_d, 8)
1604 
1605 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1606 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1607 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1608 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1609 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1610 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1611 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1612 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1613 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1614 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1615 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1616 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1617 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1618 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1619 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1620 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1621 GEN_VEXT_VX(vminu_vx_b, 1)
1622 GEN_VEXT_VX(vminu_vx_h, 2)
1623 GEN_VEXT_VX(vminu_vx_w, 4)
1624 GEN_VEXT_VX(vminu_vx_d, 8)
1625 GEN_VEXT_VX(vmin_vx_b, 1)
1626 GEN_VEXT_VX(vmin_vx_h, 2)
1627 GEN_VEXT_VX(vmin_vx_w, 4)
1628 GEN_VEXT_VX(vmin_vx_d, 8)
1629 GEN_VEXT_VX(vmaxu_vx_b, 1)
1630 GEN_VEXT_VX(vmaxu_vx_h, 2)
1631 GEN_VEXT_VX(vmaxu_vx_w, 4)
1632 GEN_VEXT_VX(vmaxu_vx_d, 8)
1633 GEN_VEXT_VX(vmax_vx_b, 1)
1634 GEN_VEXT_VX(vmax_vx_h, 2)
1635 GEN_VEXT_VX(vmax_vx_w, 4)
1636 GEN_VEXT_VX(vmax_vx_d, 8)
1637 
1638 /* Vector Single-Width Integer Multiply Instructions */
1639 #define DO_MUL(N, M) (N * M)
1640 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1641 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1642 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1643 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1644 GEN_VEXT_VV(vmul_vv_b, 1)
1645 GEN_VEXT_VV(vmul_vv_h, 2)
1646 GEN_VEXT_VV(vmul_vv_w, 4)
1647 GEN_VEXT_VV(vmul_vv_d, 8)
1648 
1649 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1650 {
1651     return (int16_t)s2 * (int16_t)s1 >> 8;
1652 }
1653 
1654 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1655 {
1656     return (int32_t)s2 * (int32_t)s1 >> 16;
1657 }
1658 
1659 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1660 {
1661     return (int64_t)s2 * (int64_t)s1 >> 32;
1662 }
1663 
1664 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1665 {
1666     uint64_t hi_64, lo_64;
1667 
1668     muls64(&lo_64, &hi_64, s1, s2);
1669     return hi_64;
1670 }
1671 
1672 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1673 {
1674     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1675 }
1676 
1677 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1678 {
1679     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1680 }
1681 
1682 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1683 {
1684     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1685 }
1686 
1687 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1688 {
1689     uint64_t hi_64, lo_64;
1690 
1691     mulu64(&lo_64, &hi_64, s2, s1);
1692     return hi_64;
1693 }
1694 
1695 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1696 {
1697     return (int16_t)s2 * (uint16_t)s1 >> 8;
1698 }
1699 
1700 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1701 {
1702     return (int32_t)s2 * (uint32_t)s1 >> 16;
1703 }
1704 
1705 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1706 {
1707     return (int64_t)s2 * (uint64_t)s1 >> 32;
1708 }
1709 
1710 /*
1711  * Signed-by-unsigned high-half multiply, computed with mulu64:
1712  *
1713  * Let  A = signed operand,
1714  *      B = unsigned operand,
1715  *      P = mulu64(A, B), the unsigned product of A's bit pattern and B,
1716  *      SP = A * B, the signed product.
1717  *
1718  * IF A >= 0
1719  *      the bit pattern of A is A itself, so SP = P
1720  * ELSE
1721  *      the bit pattern of A is A + 2 ** 64, so
1722  *      P  = (A + 2 ** 64) * B = A * B + 2 ** 64 * B
1723  *      SP = A * B = P - 2 ** 64 * B
1724  *      i.e. the high 64 bits of SP are HI_P - B
1725  *
1726  * HENCE:  HI_P -= (A < 0 ? B : 0)
1727  */
1728 
1729 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1730 {
1731     uint64_t hi_64, lo_64;
1732 
1733     mulu64(&lo_64, &hi_64, s2, s1);
1734 
1735     hi_64 -= s2 < 0 ? s1 : 0;
1736     return hi_64;
1737 }
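
/*
 * Worked check of the derivation above (illustrative only, not used by the
 * helpers): for A = -2 and B = 3, mulu64 sees the bit pattern 2^64 - 2, so
 * hi(P) = 2; since A < 0 we subtract B, giving 2 - 3 = -1, the correct high
 * half of the 128-bit signed product -6.
 */
static inline bool example_mulhsu_d_check(void)
{
    return do_mulhsu_d(-2, 3) == -1;
}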
1738 
1739 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1740 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1741 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1742 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1743 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1744 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1745 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1746 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1747 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1748 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1749 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1750 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1751 GEN_VEXT_VV(vmulh_vv_b, 1)
1752 GEN_VEXT_VV(vmulh_vv_h, 2)
1753 GEN_VEXT_VV(vmulh_vv_w, 4)
1754 GEN_VEXT_VV(vmulh_vv_d, 8)
1755 GEN_VEXT_VV(vmulhu_vv_b, 1)
1756 GEN_VEXT_VV(vmulhu_vv_h, 2)
1757 GEN_VEXT_VV(vmulhu_vv_w, 4)
1758 GEN_VEXT_VV(vmulhu_vv_d, 8)
1759 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1760 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1761 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1762 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1763 
1764 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1765 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1766 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1767 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1768 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1769 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1770 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1771 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1772 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1773 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1774 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1775 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1776 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1777 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1778 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1779 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1780 GEN_VEXT_VX(vmul_vx_b, 1)
1781 GEN_VEXT_VX(vmul_vx_h, 2)
1782 GEN_VEXT_VX(vmul_vx_w, 4)
1783 GEN_VEXT_VX(vmul_vx_d, 8)
1784 GEN_VEXT_VX(vmulh_vx_b, 1)
1785 GEN_VEXT_VX(vmulh_vx_h, 2)
1786 GEN_VEXT_VX(vmulh_vx_w, 4)
1787 GEN_VEXT_VX(vmulh_vx_d, 8)
1788 GEN_VEXT_VX(vmulhu_vx_b, 1)
1789 GEN_VEXT_VX(vmulhu_vx_h, 2)
1790 GEN_VEXT_VX(vmulhu_vx_w, 4)
1791 GEN_VEXT_VX(vmulhu_vx_d, 8)
1792 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1793 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1794 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1795 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1796 
1797 /* Vector Integer Divide Instructions */
1798 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1799 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1800 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1801         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1802 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1803         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
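
/*
 * Illustrative sketch (not part of the generated helpers): the macros above
 * encode the special cases of the base ISA division rules.  N == -N holds
 * only for 0 and the type's minimum value, so combined with M == -1 it
 * detects the signed-overflow case.  For int8_t the behaviour is:
 */
static inline int8_t example_div_b(int8_t n, int8_t m)
{
    if (m == 0) {
        return -1;                  /* divide by zero: all ones */
    }
    if (n == INT8_MIN && m == -1) {
        return INT8_MIN;            /* overflow: result is the dividend */
    }
    return n / m;
}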
1804 
1805 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1806 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1807 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1808 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1809 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1810 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1811 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1812 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1813 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1814 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1815 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1816 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1817 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1818 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1819 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1820 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1821 GEN_VEXT_VV(vdivu_vv_b, 1)
1822 GEN_VEXT_VV(vdivu_vv_h, 2)
1823 GEN_VEXT_VV(vdivu_vv_w, 4)
1824 GEN_VEXT_VV(vdivu_vv_d, 8)
1825 GEN_VEXT_VV(vdiv_vv_b, 1)
1826 GEN_VEXT_VV(vdiv_vv_h, 2)
1827 GEN_VEXT_VV(vdiv_vv_w, 4)
1828 GEN_VEXT_VV(vdiv_vv_d, 8)
1829 GEN_VEXT_VV(vremu_vv_b, 1)
1830 GEN_VEXT_VV(vremu_vv_h, 2)
1831 GEN_VEXT_VV(vremu_vv_w, 4)
1832 GEN_VEXT_VV(vremu_vv_d, 8)
1833 GEN_VEXT_VV(vrem_vv_b, 1)
1834 GEN_VEXT_VV(vrem_vv_h, 2)
1835 GEN_VEXT_VV(vrem_vv_w, 4)
1836 GEN_VEXT_VV(vrem_vv_d, 8)
1837 
1838 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1839 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1840 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1841 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1842 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1843 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1844 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1845 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1846 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1847 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1848 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1849 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1850 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1851 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1852 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1853 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1854 GEN_VEXT_VX(vdivu_vx_b, 1)
1855 GEN_VEXT_VX(vdivu_vx_h, 2)
1856 GEN_VEXT_VX(vdivu_vx_w, 4)
1857 GEN_VEXT_VX(vdivu_vx_d, 8)
1858 GEN_VEXT_VX(vdiv_vx_b, 1)
1859 GEN_VEXT_VX(vdiv_vx_h, 2)
1860 GEN_VEXT_VX(vdiv_vx_w, 4)
1861 GEN_VEXT_VX(vdiv_vx_d, 8)
1862 GEN_VEXT_VX(vremu_vx_b, 1)
1863 GEN_VEXT_VX(vremu_vx_h, 2)
1864 GEN_VEXT_VX(vremu_vx_w, 4)
1865 GEN_VEXT_VX(vremu_vx_d, 8)
1866 GEN_VEXT_VX(vrem_vx_b, 1)
1867 GEN_VEXT_VX(vrem_vx_h, 2)
1868 GEN_VEXT_VX(vrem_vx_w, 4)
1869 GEN_VEXT_VX(vrem_vx_d, 8)
1870 
1871 /* Vector Widening Integer Multiply Instructions */
1872 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1873 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1874 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1875 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1876 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1877 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1878 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1879 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1880 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1881 GEN_VEXT_VV(vwmul_vv_b, 2)
1882 GEN_VEXT_VV(vwmul_vv_h, 4)
1883 GEN_VEXT_VV(vwmul_vv_w, 8)
1884 GEN_VEXT_VV(vwmulu_vv_b, 2)
1885 GEN_VEXT_VV(vwmulu_vv_h, 4)
1886 GEN_VEXT_VV(vwmulu_vv_w, 8)
1887 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1888 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1889 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1890 
1891 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1892 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1893 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1894 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1895 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1896 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1897 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1898 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1899 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1900 GEN_VEXT_VX(vwmul_vx_b, 2)
1901 GEN_VEXT_VX(vwmul_vx_h, 4)
1902 GEN_VEXT_VX(vwmul_vx_w, 8)
1903 GEN_VEXT_VX(vwmulu_vx_b, 2)
1904 GEN_VEXT_VX(vwmulu_vx_h, 4)
1905 GEN_VEXT_VX(vwmulu_vx_w, 8)
1906 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1907 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1908 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1909 
1910 /* Vector Single-Width Integer Multiply-Add Instructions */
1911 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1912 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1913 {                                                                  \
1914     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1915     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1916     TD d = *((TD *)vd + HD(i));                                    \
1917     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1918 }
1919 
1920 #define DO_MACC(N, M, D) (M * N + D)
1921 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1922 #define DO_MADD(N, M, D) (M * D + N)
1923 #define DO_NMSUB(N, M, D) (-(M * D) + N)
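
/*
 * Illustrative note: with the operand order OP(s2, s1, d) used by OPIVV3,
 * the macros above correspond to
 *   vmacc:  vd = vs1 * vs2 + vd      vnmsac: vd = -(vs1 * vs2) + vd
 *   vmadd:  vd = vs1 * vd  + vs2     vnmsub: vd = -(vs1 * vd)  + vs2
 * i.e. vd is the addend for vmacc/vnmsac and a multiplicand for
 * vmadd/vnmsub.  A hypothetical single-lane equivalent of vmacc.vv (SEW=32):
 */
static inline int32_t example_vmacc_lane_w(int32_t vs1, int32_t vs2, int32_t vd)
{
    return vs1 * vs2 + vd;          /* DO_MACC(s2, s1, d) with s2 = vs2 */
}
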
1924 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1925 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1926 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1927 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1928 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1929 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1930 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1931 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1932 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1933 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1934 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1935 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1936 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1937 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1938 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1939 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1940 GEN_VEXT_VV(vmacc_vv_b, 1)
1941 GEN_VEXT_VV(vmacc_vv_h, 2)
1942 GEN_VEXT_VV(vmacc_vv_w, 4)
1943 GEN_VEXT_VV(vmacc_vv_d, 8)
1944 GEN_VEXT_VV(vnmsac_vv_b, 1)
1945 GEN_VEXT_VV(vnmsac_vv_h, 2)
1946 GEN_VEXT_VV(vnmsac_vv_w, 4)
1947 GEN_VEXT_VV(vnmsac_vv_d, 8)
1948 GEN_VEXT_VV(vmadd_vv_b, 1)
1949 GEN_VEXT_VV(vmadd_vv_h, 2)
1950 GEN_VEXT_VV(vmadd_vv_w, 4)
1951 GEN_VEXT_VV(vmadd_vv_d, 8)
1952 GEN_VEXT_VV(vnmsub_vv_b, 1)
1953 GEN_VEXT_VV(vnmsub_vv_h, 2)
1954 GEN_VEXT_VV(vnmsub_vv_w, 4)
1955 GEN_VEXT_VV(vnmsub_vv_d, 8)
1956 
1957 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1958 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1959 {                                                                   \
1960     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1961     TD d = *((TD *)vd + HD(i));                                     \
1962     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1963 }
1964 
1965 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1968 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1969 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1970 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1971 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1972 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1973 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1974 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1975 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1976 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1977 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1978 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1979 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1980 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1981 GEN_VEXT_VX(vmacc_vx_b, 1)
1982 GEN_VEXT_VX(vmacc_vx_h, 2)
1983 GEN_VEXT_VX(vmacc_vx_w, 4)
1984 GEN_VEXT_VX(vmacc_vx_d, 8)
1985 GEN_VEXT_VX(vnmsac_vx_b, 1)
1986 GEN_VEXT_VX(vnmsac_vx_h, 2)
1987 GEN_VEXT_VX(vnmsac_vx_w, 4)
1988 GEN_VEXT_VX(vnmsac_vx_d, 8)
1989 GEN_VEXT_VX(vmadd_vx_b, 1)
1990 GEN_VEXT_VX(vmadd_vx_h, 2)
1991 GEN_VEXT_VX(vmadd_vx_w, 4)
1992 GEN_VEXT_VX(vmadd_vx_d, 8)
1993 GEN_VEXT_VX(vnmsub_vx_b, 1)
1994 GEN_VEXT_VX(vnmsub_vx_h, 2)
1995 GEN_VEXT_VX(vnmsub_vx_w, 4)
1996 GEN_VEXT_VX(vnmsub_vx_d, 8)
1997 
1998 /* Vector Widening Integer Multiply-Add Instructions */
1999 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2000 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2001 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2002 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2003 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2004 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2005 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2006 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2007 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2008 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2009 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2010 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2011 GEN_VEXT_VV(vwmacc_vv_b, 2)
2012 GEN_VEXT_VV(vwmacc_vv_h, 4)
2013 GEN_VEXT_VV(vwmacc_vv_w, 8)
2014 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2015 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2016 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2017 
2018 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2019 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2020 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2021 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2022 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2023 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2024 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2025 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2026 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2027 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2028 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2029 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2030 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2031 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2032 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2033 GEN_VEXT_VX(vwmacc_vx_b, 2)
2034 GEN_VEXT_VX(vwmacc_vx_h, 4)
2035 GEN_VEXT_VX(vwmacc_vx_w, 8)
2036 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2037 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2038 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2039 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2040 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2041 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2042 
2043 /* Vector Integer Merge and Move Instructions */
2044 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2045 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2046                   uint32_t desc)                                     \
2047 {                                                                    \
2048     uint32_t vl = env->vl;                                           \
2049     uint32_t esz = sizeof(ETYPE);                                    \
2050     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2051     uint32_t vta = vext_vta(desc);                                   \
2052     uint32_t i;                                                      \
2053                                                                      \
2054     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2055                                                                      \
2056     for (i = env->vstart; i < vl; i++) {                             \
2057         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2058         *((ETYPE *)vd + H(i)) = s1;                                  \
2059     }                                                                \
2060     env->vstart = 0;                                                 \
2061     /* set tail elements to 1s */                                    \
2062     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2063 }
2064 
2065 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2066 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2067 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2068 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2069 
2070 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2071 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2072                   uint32_t desc)                                     \
2073 {                                                                    \
2074     uint32_t vl = env->vl;                                           \
2075     uint32_t esz = sizeof(ETYPE);                                    \
2076     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2077     uint32_t vta = vext_vta(desc);                                   \
2078     uint32_t i;                                                      \
2079                                                                      \
2080     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2081                                                                      \
2082     for (i = env->vstart; i < vl; i++) {                             \
2083         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2084     }                                                                \
2085     env->vstart = 0;                                                 \
2086     /* set tail elements to 1s */                                    \
2087     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2088 }
2089 
2090 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2091 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2092 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2093 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2094 
2095 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2096 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2097                   CPURISCVState *env, uint32_t desc)                 \
2098 {                                                                    \
2099     uint32_t vl = env->vl;                                           \
2100     uint32_t esz = sizeof(ETYPE);                                    \
2101     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2102     uint32_t vta = vext_vta(desc);                                   \
2103     uint32_t i;                                                      \
2104                                                                      \
2105     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2106                                                                      \
2107     for (i = env->vstart; i < vl; i++) {                             \
2108         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2109         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2110     }                                                                \
2111     env->vstart = 0;                                                 \
2112     /* set tail elements to 1s */                                    \
2113     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2114 }
2115 
2116 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2117 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2118 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2119 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2120 
2121 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2122 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2123                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2124 {                                                                    \
2125     uint32_t vl = env->vl;                                           \
2126     uint32_t esz = sizeof(ETYPE);                                    \
2127     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2128     uint32_t vta = vext_vta(desc);                                   \
2129     uint32_t i;                                                      \
2130                                                                      \
2131     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2132                                                                      \
2133     for (i = env->vstart; i < vl; i++) {                             \
2134         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2135         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2136                    (ETYPE)(target_long)s1);                          \
2137         *((ETYPE *)vd + H(i)) = d;                                   \
2138     }                                                                \
2139     env->vstart = 0;                                                 \
2140     /* set tail elements to 1s */                                    \
2141     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2142 }
2143 
2144 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2145 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2146 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2147 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2148 
2149 /*
2150  * Vector Fixed-Point Arithmetic Instructions
2151  */
2152 
2153 /* Vector Single-Width Saturating Add and Subtract */
2154 
2155 /*
2156  * Fixed-point instructions need a rounding mode and may saturate, so
2157  * define the common macros for fixed point here.
2158  */
2159 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2160                           CPURISCVState *env, int vxrm);
2161 
2162 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2163 static inline void                                                  \
2164 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2165           CPURISCVState *env, int vxrm)                             \
2166 {                                                                   \
2167     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2168     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2169     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2170 }
2171 
2172 static inline void
2173 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2174              CPURISCVState *env,
2175              uint32_t vl, uint32_t vm, int vxrm,
2176              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2177 {
2178     for (uint32_t i = env->vstart; i < vl; i++) {
2179         if (!vm && !vext_elem_mask(v0, i)) {
2180             /* set masked-off elements to 1s */
2181             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2182             continue;
2183         }
2184         fn(vd, vs1, vs2, i, env, vxrm);
2185     }
2186     env->vstart = 0;
2187 }
2188 
2189 static inline void
2190 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2191              CPURISCVState *env,
2192              uint32_t desc,
2193              opivv2_rm_fn *fn, uint32_t esz)
2194 {
2195     uint32_t vm = vext_vm(desc);
2196     uint32_t vl = env->vl;
2197     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2198     uint32_t vta = vext_vta(desc);
2199     uint32_t vma = vext_vma(desc);
2200 
2201     VSTART_CHECK_EARLY_EXIT(env, vl);
2202 
2203     switch (env->vxrm) {
2204     case 0: /* rnu */
2205         vext_vv_rm_1(vd, v0, vs1, vs2,
2206                      env, vl, vm, 0, fn, vma, esz);
2207         break;
2208     case 1: /* rne */
2209         vext_vv_rm_1(vd, v0, vs1, vs2,
2210                      env, vl, vm, 1, fn, vma, esz);
2211         break;
2212     case 2: /* rdn */
2213         vext_vv_rm_1(vd, v0, vs1, vs2,
2214                      env, vl, vm, 2, fn, vma, esz);
2215         break;
2216     default: /* rod */
2217         vext_vv_rm_1(vd, v0, vs1, vs2,
2218                      env, vl, vm, 3, fn, vma, esz);
2219         break;
2220     }
2221     /* set tail elements to 1s */
2222     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2223 }
2224 
2225 /* generate helpers for fixed point instructions with OPIVV format */
2226 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2227 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2228                   CPURISCVState *env, uint32_t desc)            \
2229 {                                                               \
2230     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2231                  do_##NAME, ESZ);                               \
2232 }
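
/*
 * Illustrative expansion (approximate, for readability only): pairing
 *   RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
 * with
 *   GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
 * below yields roughly
 *   void HELPER(vsaddu_vv_b)(vd, v0, vs1, vs2, env, desc)
 *   {
 *       vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b, 1);
 *   }
 * where do_vsaddu_vv_b applies saddu8() to one pair of uint8_t elements
 * under the current rounding mode.
 */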
2233 
2234 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2235                              uint8_t b)
2236 {
2237     uint8_t res = a + b;
2238     if (res < a) {
2239         res = UINT8_MAX;
2240         env->vxsat = 0x1;
2241     }
2242     return res;
2243 }
2244 
2245 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2246                                uint16_t b)
2247 {
2248     uint16_t res = a + b;
2249     if (res < a) {
2250         res = UINT16_MAX;
2251         env->vxsat = 0x1;
2252     }
2253     return res;
2254 }
2255 
2256 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2257                                uint32_t b)
2258 {
2259     uint32_t res = a + b;
2260     if (res < a) {
2261         res = UINT32_MAX;
2262         env->vxsat = 0x1;
2263     }
2264     return res;
2265 }
2266 
2267 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2268                                uint64_t b)
2269 {
2270     uint64_t res = a + b;
2271     if (res < a) {
2272         res = UINT64_MAX;
2273         env->vxsat = 0x1;
2274     }
2275     return res;
2276 }
2277 
2278 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2279 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2280 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2281 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2282 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2283 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2284 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2285 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2286 
2287 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2288                           CPURISCVState *env, int vxrm);
2289 
2290 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2291 static inline void                                                  \
2292 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2293           CPURISCVState *env, int vxrm)                             \
2294 {                                                                   \
2295     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2296     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2297 }
2298 
2299 static inline void
2300 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2301              CPURISCVState *env,
2302              uint32_t vl, uint32_t vm, int vxrm,
2303              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2304 {
2305     for (uint32_t i = env->vstart; i < vl; i++) {
2306         if (!vm && !vext_elem_mask(v0, i)) {
2307             /* set masked-off elements to 1s */
2308             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2309             continue;
2310         }
2311         fn(vd, s1, vs2, i, env, vxrm);
2312     }
2313     env->vstart = 0;
2314 }
2315 
2316 static inline void
2317 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2318              CPURISCVState *env,
2319              uint32_t desc,
2320              opivx2_rm_fn *fn, uint32_t esz)
2321 {
2322     uint32_t vm = vext_vm(desc);
2323     uint32_t vl = env->vl;
2324     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2325     uint32_t vta = vext_vta(desc);
2326     uint32_t vma = vext_vma(desc);
2327 
2328     VSTART_CHECK_EARLY_EXIT(env, vl);
2329 
2330     switch (env->vxrm) {
2331     case 0: /* rnu */
2332         vext_vx_rm_1(vd, v0, s1, vs2,
2333                      env, vl, vm, 0, fn, vma, esz);
2334         break;
2335     case 1: /* rne */
2336         vext_vx_rm_1(vd, v0, s1, vs2,
2337                      env, vl, vm, 1, fn, vma, esz);
2338         break;
2339     case 2: /* rdn */
2340         vext_vx_rm_1(vd, v0, s1, vs2,
2341                      env, vl, vm, 2, fn, vma, esz);
2342         break;
2343     default: /* rod */
2344         vext_vx_rm_1(vd, v0, s1, vs2,
2345                      env, vl, vm, 3, fn, vma, esz);
2346         break;
2347     }
2348     /* set tail elements to 1s */
2349     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2350 }
2351 
2352 /* generate helpers for fixed point instructions with OPIVX format */
2353 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2354 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2355                   void *vs2, CPURISCVState *env,          \
2356                   uint32_t desc)                          \
2357 {                                                         \
2358     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2359                  do_##NAME, ESZ);                         \
2360 }
2361 
2362 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2363 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2364 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2365 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2366 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2367 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2368 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2369 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2370 
2371 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2372 {
2373     int8_t res = a + b;
2374     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2375         res = a > 0 ? INT8_MAX : INT8_MIN;
2376         env->vxsat = 0x1;
2377     }
2378     return res;
2379 }
2380 
2381 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2382                              int16_t b)
2383 {
2384     int16_t res = a + b;
2385     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2386         res = a > 0 ? INT16_MAX : INT16_MIN;
2387         env->vxsat = 0x1;
2388     }
2389     return res;
2390 }
2391 
2392 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2393                              int32_t b)
2394 {
2395     int32_t res = a + b;
2396     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2397         res = a > 0 ? INT32_MAX : INT32_MIN;
2398         env->vxsat = 0x1;
2399     }
2400     return res;
2401 }
2402 
2403 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2404                              int64_t b)
2405 {
2406     int64_t res = a + b;
2407     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2408         res = a > 0 ? INT64_MAX : INT64_MIN;
2409         env->vxsat = 0x1;
2410     }
2411     return res;
2412 }
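
/*
 * Illustrative sketch (not part of the helpers): signed addition overflows
 * exactly when both operands have the same sign and the truncated result has
 * the opposite sign, which is what (res ^ a) & (res ^ b) & INT*_MIN tests.
 * An equivalent, widened form for int8_t:
 */
static inline bool example_sadd8_overflows(int8_t a, int8_t b)
{
    int16_t wide = (int16_t)a + b;
    return wide > INT8_MAX || wide < INT8_MIN;
}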
2413 
2414 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2415 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2416 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2417 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2418 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2419 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2420 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2421 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2422 
2423 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2424 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2425 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2426 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2427 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2428 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2429 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2430 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2431 
2432 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2433                              uint8_t b)
2434 {
2435     uint8_t res = a - b;
2436     if (res > a) {
2437         res = 0;
2438         env->vxsat = 0x1;
2439     }
2440     return res;
2441 }
2442 
2443 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2444                                uint16_t b)
2445 {
2446     uint16_t res = a - b;
2447     if (res > a) {
2448         res = 0;
2449         env->vxsat = 0x1;
2450     }
2451     return res;
2452 }
2453 
2454 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2455                                uint32_t b)
2456 {
2457     uint32_t res = a - b;
2458     if (res > a) {
2459         res = 0;
2460         env->vxsat = 0x1;
2461     }
2462     return res;
2463 }
2464 
2465 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2466                                uint64_t b)
2467 {
2468     uint64_t res = a - b;
2469     if (res > a) {
2470         res = 0;
2471         env->vxsat = 0x1;
2472     }
2473     return res;
2474 }
2475 
2476 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2477 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2478 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2479 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2480 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2481 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2482 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2483 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2484 
2485 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2486 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2487 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2488 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2489 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2490 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2491 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2492 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2493 
2494 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2495 {
2496     int8_t res = a - b;
2497     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2498         res = a >= 0 ? INT8_MAX : INT8_MIN;
2499         env->vxsat = 0x1;
2500     }
2501     return res;
2502 }
2503 
2504 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2505                              int16_t b)
2506 {
2507     int16_t res = a - b;
2508     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2509         res = a >= 0 ? INT16_MAX : INT16_MIN;
2510         env->vxsat = 0x1;
2511     }
2512     return res;
2513 }
2514 
2515 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2516                              int32_t b)
2517 {
2518     int32_t res = a - b;
2519     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2520         res = a >= 0 ? INT32_MAX : INT32_MIN;
2521         env->vxsat = 0x1;
2522     }
2523     return res;
2524 }
2525 
2526 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2527                              int64_t b)
2528 {
2529     int64_t res = a - b;
2530     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2531         res = a >= 0 ? INT64_MAX : INT64_MIN;
2532         env->vxsat = 0x1;
2533     }
2534     return res;
2535 }
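
/*
 * Unlike the saturating adds above, signed subtraction can only overflow
 * when the operands have opposite signs, hence the (a ^ b) term in the
 * test, e.g. ssub8(env, vxrm, -100, 100): res wraps to 56, overflow is
 * detected, and a < 0 saturates the element to INT8_MIN (-128).
 */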
2536 
2537 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2538 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2539 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2540 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2541 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2542 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2543 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2544 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2545 
2546 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2547 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2548 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2549 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2550 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2551 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2552 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2553 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2554 
2555 /* Vector Single-Width Averaging Add and Subtract */
2556 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2557 {
2558     uint8_t d = extract64(v, shift, 1);
2559     uint8_t d1;
2560     uint64_t D1, D2;
2561 
2562     if (shift == 0 || shift > 64) {
2563         return 0;
2564     }
2565 
2566     d1 = extract64(v, shift - 1, 1);
2567     D1 = extract64(v, 0, shift);
2568     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2569         return d1;
2570     } else if (vxrm == 1) { /* round-to-nearest-even */
2571         if (shift > 1) {
2572             D2 = extract64(v, 0, shift - 1);
2573             return d1 & ((D2 != 0) | d);
2574         } else {
2575             return d1 & d;
2576         }
2577     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2578         return !d & (D1 != 0);
2579     }
2580     return 0; /* round-down (truncate) */
2581 }
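
/*
 * Worked example of the vxrm rounding increment: for v = 0b10 and
 * shift = 2 (i.e. 2 / 4 = 0.5), d = 0, d1 = 1, D1 = 0b10, D2 = 0, so
 * get_round() returns 1 for round-to-nearest-up and round-to-odd, and
 * 0 for round-to-nearest-even (ties to the even value 0) and round-down.
 */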
2582 
2583 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2584                              int32_t b)
2585 {
2586     int64_t res = (int64_t)a + b;
2587     uint8_t round = get_round(vxrm, res, 1);
2588 
2589     return (res >> 1) + round;
2590 }
2591 
2592 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2593                              int64_t b)
2594 {
2595     int64_t res = a + b;
2596     uint8_t round = get_round(vxrm, res, 1);
2597     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2598 
2599     /* With signed overflow, bit 64 is inverse of bit 63. */
2600     return ((res >> 1) ^ over) + round;
2601 }
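
/*
 * For the 64-bit averaging add the sum itself may wrap, but on signed
 * overflow the true bit 64 of the sum is the complement of bit 63 of the
 * wrapped value, so XOR-ing (res >> 1) with `over` restores the exact
 * average, e.g. a = b = INT64_MAX: res wraps to -2, over = INT64_MIN,
 * and ((-2 >> 1) ^ INT64_MIN) = INT64_MAX, which is (a + b) / 2.
 */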
2602 
2603 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2604 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2605 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2606 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2607 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2608 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2609 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2610 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2611 
2612 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2613 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2614 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2615 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2616 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2617 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2618 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2619 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2620 
2621 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2622                                uint32_t a, uint32_t b)
2623 {
2624     uint64_t res = (uint64_t)a + b;
2625     uint8_t round = get_round(vxrm, res, 1);
2626 
2627     return (res >> 1) + round;
2628 }
2629 
2630 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2631                                uint64_t a, uint64_t b)
2632 {
2633     uint64_t res = a + b;
2634     uint8_t round = get_round(vxrm, res, 1);
2635     uint64_t over = (uint64_t)(res < a) << 63;
2636 
2637     return ((res >> 1) | over) + round;
2638 }
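
/*
 * The unsigned variant recovers the lost carry instead: (res < a) is the
 * carry-out of the 64-bit add, which becomes bit 63 of the halved result,
 * e.g. a = b = UINT64_MAX averages to UINT64_MAX rather than wrapping.
 */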
2639 
2640 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2641 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2642 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2643 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2644 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2645 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2646 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2647 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2648 
2649 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2650 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2651 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2652 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2653 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2654 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2655 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2656 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2657 
2658 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2659                              int32_t b)
2660 {
2661     int64_t res = (int64_t)a - b;
2662     uint8_t round = get_round(vxrm, res, 1);
2663 
2664     return (res >> 1) + round;
2665 }
2666 
2667 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2668                              int64_t b)
2669 {
2670     int64_t res = (int64_t)a - b;
2671     uint8_t round = get_round(vxrm, res, 1);
2672     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2673 
2674     /* With signed overflow, bit 64 is inverse of bit 63. */
2675     return ((res >> 1) ^ over) + round;
2676 }
2677 
2678 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2679 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2680 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2681 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2682 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2683 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2684 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2685 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2686 
2687 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2688 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2689 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2690 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2691 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2692 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2693 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2694 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2695 
2696 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2697                                uint32_t a, uint32_t b)
2698 {
2699     int64_t res = (int64_t)a - b;
2700     uint8_t round = get_round(vxrm, res, 1);
2701 
2702     return (res >> 1) + round;
2703 }
2704 
2705 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2706                                uint64_t a, uint64_t b)
2707 {
2708     uint64_t res = (uint64_t)a - b;
2709     uint8_t round = get_round(vxrm, res, 1);
2710     uint64_t over = (uint64_t)(res > a) << 63;
2711 
2712     return ((res >> 1) | over) + round;
2713 }
2714 
2715 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2716 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2717 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2718 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2719 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2720 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2721 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2722 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2723 
2724 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2725 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2726 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2727 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2728 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2729 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2730 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2731 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2732 
2733 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2734 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2735 {
2736     uint8_t round;
2737     int16_t res;
2738 
2739     res = (int16_t)a * (int16_t)b;
2740     round = get_round(vxrm, res, 7);
2741     res = (res >> 7) + round;
2742 
2743     if (res > INT8_MAX) {
2744         env->vxsat = 0x1;
2745         return INT8_MAX;
2746     } else if (res < INT8_MIN) {
2747         env->vxsat = 0x1;
2748         return INT8_MIN;
2749     } else {
2750         return res;
2751     }
2752 }
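
/*
 * vsmul treats the operands as signed fixed-point values with SEW - 1
 * fraction bits, so the double-width product is shifted right by SEW - 1
 * with vxrm rounding and then saturated.  The only product that cannot
 * be represented after the shift is (-1.0) * (-1.0), e.g. for SEW = 8,
 * a = b = -128: 16384 >> 7 = 128 > INT8_MAX, so the result saturates to
 * 127 and vxsat is set.
 */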
2753 
2754 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2755 {
2756     uint8_t round;
2757     int32_t res;
2758 
2759     res = (int32_t)a * (int32_t)b;
2760     round = get_round(vxrm, res, 15);
2761     res = (res >> 15) + round;
2762 
2763     if (res > INT16_MAX) {
2764         env->vxsat = 0x1;
2765         return INT16_MAX;
2766     } else if (res < INT16_MIN) {
2767         env->vxsat = 0x1;
2768         return INT16_MIN;
2769     } else {
2770         return res;
2771     }
2772 }
2773 
2774 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2775 {
2776     uint8_t round;
2777     int64_t res;
2778 
2779     res = (int64_t)a * (int64_t)b;
2780     round = get_round(vxrm, res, 31);
2781     res = (res >> 31) + round;
2782 
2783     if (res > INT32_MAX) {
2784         env->vxsat = 0x1;
2785         return INT32_MAX;
2786     } else if (res < INT32_MIN) {
2787         env->vxsat = 0x1;
2788         return INT32_MIN;
2789     } else {
2790         return res;
2791     }
2792 }
2793 
2794 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2795 {
2796     uint8_t round;
2797     uint64_t hi_64, lo_64;
2798     int64_t res;
2799 
2800     if (a == INT64_MIN && b == INT64_MIN) {
2801         env->vxsat = 1;
2802         return INT64_MAX;
2803     }
2804 
2805     muls64(&lo_64, &hi_64, a, b);
2806     round = get_round(vxrm, lo_64, 63);
2807     /*
2808      * Cannot overflow: excluding the INT64_MIN * INT64_MIN case above,
2809      * the product always has at least 2 sign bits after the multiply.
2810      */
2811     res = (hi_64 << 1) | (lo_64 >> 63);
2812     if (round) {
2813         if (res == INT64_MAX) {
2814             env->vxsat = 1;
2815         } else {
2816             res += 1;
2817         }
2818     }
2819     return res;
2820 }
2821 
2822 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2823 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2824 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2825 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2826 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2827 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2828 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2829 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2830 
2831 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2832 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2833 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2834 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2835 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2836 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2837 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2838 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2839 
2840 /* Vector Single-Width Scaling Shift Instructions */
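/*
 * vssrl/vssra right-shift each element by the low log2(SEW) bits of the
 * other operand and add the vxrm rounding increment from get_round();
 * unlike the narrowing clips below they never saturate, e.g. with
 * round-to-nearest-up, vssrl8 on a = 0x17, shift = 3 yields
 * (23 >> 3) + 1 = 3, i.e. 23 / 8 = 2.875 rounded to 3.
 */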
2841 static inline uint8_t
2842 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2843 {
2844     uint8_t round, shift = b & 0x7;
2845     uint8_t res;
2846 
2847     round = get_round(vxrm, a, shift);
2848     res = (a >> shift) + round;
2849     return res;
2850 }
2851 static inline uint16_t
2852 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2853 {
2854     uint8_t round, shift = b & 0xf;
2855 
2856     round = get_round(vxrm, a, shift);
2857     return (a >> shift) + round;
2858 }
2859 static inline uint32_t
2860 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2861 {
2862     uint8_t round, shift = b & 0x1f;
2863 
2864     round = get_round(vxrm, a, shift);
2865     return (a >> shift) + round;
2866 }
2867 static inline uint64_t
2868 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2869 {
2870     uint8_t round, shift = b & 0x3f;
2871 
2872     round = get_round(vxrm, a, shift);
2873     return (a >> shift) + round;
2874 }
2875 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2876 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2877 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2878 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2879 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2880 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2881 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2882 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2883 
2884 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2885 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2886 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2887 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2888 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2889 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2890 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2891 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2892 
2893 static inline int8_t
2894 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2895 {
2896     uint8_t round, shift = b & 0x7;
2897 
2898     round = get_round(vxrm, a, shift);
2899     return (a >> shift) + round;
2900 }
2901 static inline int16_t
2902 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2903 {
2904     uint8_t round, shift = b & 0xf;
2905 
2906     round = get_round(vxrm, a, shift);
2907     return (a >> shift) + round;
2908 }
2909 static inline int32_t
2910 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2911 {
2912     uint8_t round, shift = b & 0x1f;
2913 
2914     round = get_round(vxrm, a, shift);
2915     return (a >> shift) + round;
2916 }
2917 static inline int64_t
2918 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2919 {
2920     uint8_t round, shift = b & 0x3f;
2921 
2922     round = get_round(vxrm, a, shift);
2923     return (a >> shift) + round;
2924 }
2925 
2926 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2927 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2928 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2929 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2930 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2931 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2932 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2933 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2934 
2935 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2936 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2937 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2938 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2939 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2940 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2941 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2942 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2943 
2944 /* Vector Narrowing Fixed-Point Clip Instructions */
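/*
 * The narrowing clips shift a 2*SEW-wide source right by the low
 * log2(2*SEW) bits of the shift operand, round per vxrm, then saturate
 * to the SEW-wide range, e.g. with round-to-nearest-up, vnclip8 on
 * a = 0x1234, shift = 4 gives 0x123 (291), which exceeds INT8_MAX and is
 * clipped to 127 with vxsat set.
 */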
2945 static inline int8_t
2946 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2947 {
2948     uint8_t round, shift = b & 0xf;
2949     int16_t res;
2950 
2951     round = get_round(vxrm, a, shift);
2952     res = (a >> shift) + round;
2953     if (res > INT8_MAX) {
2954         env->vxsat = 0x1;
2955         return INT8_MAX;
2956     } else if (res < INT8_MIN) {
2957         env->vxsat = 0x1;
2958         return INT8_MIN;
2959     } else {
2960         return res;
2961     }
2962 }
2963 
2964 static inline int16_t
2965 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2966 {
2967     uint8_t round, shift = b & 0x1f;
2968     int32_t res;
2969 
2970     round = get_round(vxrm, a, shift);
2971     res = (a >> shift) + round;
2972     if (res > INT16_MAX) {
2973         env->vxsat = 0x1;
2974         return INT16_MAX;
2975     } else if (res < INT16_MIN) {
2976         env->vxsat = 0x1;
2977         return INT16_MIN;
2978     } else {
2979         return res;
2980     }
2981 }
2982 
2983 static inline int32_t
2984 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2985 {
2986     uint8_t round, shift = b & 0x3f;
2987     int64_t res;
2988 
2989     round = get_round(vxrm, a, shift);
2990     res = (a >> shift) + round;
2991     if (res > INT32_MAX) {
2992         env->vxsat = 0x1;
2993         return INT32_MAX;
2994     } else if (res < INT32_MIN) {
2995         env->vxsat = 0x1;
2996         return INT32_MIN;
2997     } else {
2998         return res;
2999     }
3000 }
3001 
3002 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3003 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3004 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3005 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3006 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3007 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3008 
3009 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3010 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3011 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3012 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3013 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3014 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3015 
3016 static inline uint8_t
3017 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3018 {
3019     uint8_t round, shift = b & 0xf;
3020     uint16_t res;
3021 
3022     round = get_round(vxrm, a, shift);
3023     res = (a >> shift) + round;
3024     if (res > UINT8_MAX) {
3025         env->vxsat = 0x1;
3026         return UINT8_MAX;
3027     } else {
3028         return res;
3029     }
3030 }
3031 
3032 static inline uint16_t
3033 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3034 {
3035     uint8_t round, shift = b & 0x1f;
3036     uint32_t res;
3037 
3038     round = get_round(vxrm, a, shift);
3039     res = (a >> shift) + round;
3040     if (res > UINT16_MAX) {
3041         env->vxsat = 0x1;
3042         return UINT16_MAX;
3043     } else {
3044         return res;
3045     }
3046 }
3047 
3048 static inline uint32_t
3049 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3050 {
3051     uint8_t round, shift = b & 0x3f;
3052     uint64_t res;
3053 
3054     round = get_round(vxrm, a, shift);
3055     res = (a >> shift) + round;
3056     if (res > UINT32_MAX) {
3057         env->vxsat = 0x1;
3058         return UINT32_MAX;
3059     } else {
3060         return res;
3061     }
3062 }
3063 
3064 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3065 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3066 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3067 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3068 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3069 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3070 
3071 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3072 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3073 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3074 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3075 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3076 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3077 
3078 /*
3079  * Vector Floating-Point Arithmetic Instructions
3080  */
3081 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3082 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3083 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3084                       CPURISCVState *env)                      \
3085 {                                                              \
3086     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3087     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3088     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3089 }
3090 
3091 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3092 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3093                   void *vs2, CPURISCVState *env,          \
3094                   uint32_t desc)                          \
3095 {                                                         \
3096     uint32_t vm = vext_vm(desc);                          \
3097     uint32_t vl = env->vl;                                \
3098     uint32_t total_elems =                                \
3099         vext_get_total_elems(env, desc, ESZ);             \
3100     uint32_t vta = vext_vta(desc);                        \
3101     uint32_t vma = vext_vma(desc);                        \
3102     uint32_t i;                                           \
3103                                                           \
3104     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3105                                                           \
3106     for (i = env->vstart; i < vl; i++) {                  \
3107         if (!vm && !vext_elem_mask(v0, i)) {              \
3108             /* set masked-off elements to 1s */           \
3109             vext_set_elems_1s(vd, vma, i * ESZ,           \
3110                               (i + 1) * ESZ);             \
3111             continue;                                     \
3112         }                                                 \
3113         do_##NAME(vd, vs1, vs2, i, env);                  \
3114     }                                                     \
3115     env->vstart = 0;                                      \
3116     /* set tail elements to 1s */                         \
3117     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3118                       total_elems * ESZ);                 \
3119 }
3120 
3121 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3122 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3123 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3124 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3125 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3126 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
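
/*
 * For reference, assuming RVVCALL(macro, ...) simply forwards to
 * macro(...) and OP_UUU_H supplies uint16_t for every type parameter,
 * the vfadd_vv_h pair above expands to roughly:
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * plus the HELPER(vfadd_vv_h) loop generated by GEN_VEXT_VV_ENV, which
 * applies the mask/tail policies around the per-element calls.
 */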
3127 
3128 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3129 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3130                       CPURISCVState *env)                      \
3131 {                                                              \
3132     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3133     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3134 }
3135 
3136 #define GEN_VEXT_VF(NAME, ESZ)                            \
3137 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3138                   void *vs2, CPURISCVState *env,          \
3139                   uint32_t desc)                          \
3140 {                                                         \
3141     uint32_t vm = vext_vm(desc);                          \
3142     uint32_t vl = env->vl;                                \
3143     uint32_t total_elems =                                \
3144         vext_get_total_elems(env, desc, ESZ);             \
3145     uint32_t vta = vext_vta(desc);                        \
3146     uint32_t vma = vext_vma(desc);                        \
3147     uint32_t i;                                           \
3148                                                           \
3149     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3150                                                           \
3151     for (i = env->vstart; i < vl; i++) {                  \
3152         if (!vm && !vext_elem_mask(v0, i)) {              \
3153             /* set masked-off elements to 1s */           \
3154             vext_set_elems_1s(vd, vma, i * ESZ,           \
3155                               (i + 1) * ESZ);             \
3156             continue;                                     \
3157         }                                                 \
3158         do_##NAME(vd, s1, vs2, i, env);                   \
3159     }                                                     \
3160     env->vstart = 0;                                      \
3161     /* set tail elements to 1s */                         \
3162     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3163                       total_elems * ESZ);                 \
3164 }
3165 
3166 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3167 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3168 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3169 GEN_VEXT_VF(vfadd_vf_h, 2)
3170 GEN_VEXT_VF(vfadd_vf_w, 4)
3171 GEN_VEXT_VF(vfadd_vf_d, 8)
3172 
3173 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3174 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3175 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3176 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3177 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3178 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3179 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3180 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3181 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3182 GEN_VEXT_VF(vfsub_vf_h, 2)
3183 GEN_VEXT_VF(vfsub_vf_w, 4)
3184 GEN_VEXT_VF(vfsub_vf_d, 8)
3185 
3186 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3187 {
3188     return float16_sub(b, a, s);
3189 }
3190 
3191 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3192 {
3193     return float32_sub(b, a, s);
3194 }
3195 
3196 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3197 {
3198     return float64_sub(b, a, s);
3199 }
3200 
3201 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3202 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3203 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3204 GEN_VEXT_VF(vfrsub_vf_h, 2)
3205 GEN_VEXT_VF(vfrsub_vf_w, 4)
3206 GEN_VEXT_VF(vfrsub_vf_d, 8)
3207 
3208 /* Vector Widening Floating-Point Add/Subtract Instructions */
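/*
 * The widening forms convert the narrow operands exactly (IEEE half to
 * single, single to double) and perform a single operation in the wider
 * format, so rounding happens only once, in the destination precision.
 */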
3209 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3210 {
3211     return float32_add(float16_to_float32(a, true, s),
3212                        float16_to_float32(b, true, s), s);
3213 }
3214 
3215 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3216 {
3217     return float64_add(float32_to_float64(a, s),
3218                        float32_to_float64(b, s), s);
3219 
3220 }
3221 
3222 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3223 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3224 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3225 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3226 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3227 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3228 GEN_VEXT_VF(vfwadd_vf_h, 4)
3229 GEN_VEXT_VF(vfwadd_vf_w, 8)
3230 
3231 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3232 {
3233     return float32_sub(float16_to_float32(a, true, s),
3234                        float16_to_float32(b, true, s), s);
3235 }
3236 
3237 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3238 {
3239     return float64_sub(float32_to_float64(a, s),
3240                        float32_to_float64(b, s), s);
3241 
3242 }
3243 
3244 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3245 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3246 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3247 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3248 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3249 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3250 GEN_VEXT_VF(vfwsub_vf_h, 4)
3251 GEN_VEXT_VF(vfwsub_vf_w, 8)
3252 
3253 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3254 {
3255     return float32_add(a, float16_to_float32(b, true, s), s);
3256 }
3257 
3258 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3259 {
3260     return float64_add(a, float32_to_float64(b, s), s);
3261 }
3262 
3263 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3264 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3265 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3266 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3267 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3268 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3269 GEN_VEXT_VF(vfwadd_wf_h, 4)
3270 GEN_VEXT_VF(vfwadd_wf_w, 8)
3271 
3272 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3273 {
3274     return float32_sub(a, float16_to_float32(b, true, s), s);
3275 }
3276 
3277 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3278 {
3279     return float64_sub(a, float32_to_float64(b, s), s);
3280 }
3281 
3282 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3283 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3284 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3285 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3286 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3287 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3288 GEN_VEXT_VF(vfwsub_wf_h, 4)
3289 GEN_VEXT_VF(vfwsub_wf_w, 8)
3290 
3291 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3292 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3293 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3294 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3295 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3296 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3297 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3298 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3299 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3300 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3301 GEN_VEXT_VF(vfmul_vf_h, 2)
3302 GEN_VEXT_VF(vfmul_vf_w, 4)
3303 GEN_VEXT_VF(vfmul_vf_d, 8)
3304 
3305 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3306 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3307 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3308 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3309 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3310 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3311 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3312 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3313 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3314 GEN_VEXT_VF(vfdiv_vf_h, 2)
3315 GEN_VEXT_VF(vfdiv_vf_w, 4)
3316 GEN_VEXT_VF(vfdiv_vf_d, 8)
3317 
3318 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3319 {
3320     return float16_div(b, a, s);
3321 }
3322 
3323 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3324 {
3325     return float32_div(b, a, s);
3326 }
3327 
3328 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3329 {
3330     return float64_div(b, a, s);
3331 }
3332 
3333 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3334 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3335 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3336 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3337 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3338 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3339 
3340 /* Vector Widening Floating-Point Multiply */
3341 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3342 {
3343     return float32_mul(float16_to_float32(a, true, s),
3344                        float16_to_float32(b, true, s), s);
3345 }
3346 
3347 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3348 {
3349     return float64_mul(float32_to_float64(a, s),
3350                        float32_to_float64(b, s), s);
3351 
3352 }
3353 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3354 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3355 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3356 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3357 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3358 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3359 GEN_VEXT_VF(vfwmul_vf_h, 4)
3360 GEN_VEXT_VF(vfwmul_vf_w, 8)
3361 
3362 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3363 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3364 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3365                       CPURISCVState *env)                          \
3366 {                                                                  \
3367     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3368     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3369     TD d = *((TD *)vd + HD(i));                                    \
3370     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3371 }
3372 
3373 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3374 {
3375     return float16_muladd(a, b, d, 0, s);
3376 }
3377 
3378 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3379 {
3380     return float32_muladd(a, b, d, 0, s);
3381 }
3382 
3383 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3384 {
3385     return float64_muladd(a, b, d, 0, s);
3386 }
3387 
3388 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3389 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3390 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3391 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3392 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3393 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3394 
3395 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3396 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3397                       CPURISCVState *env)                         \
3398 {                                                                 \
3399     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3400     TD d = *((TD *)vd + HD(i));                                   \
3401     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3402 }
3403 
3404 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3405 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3406 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3407 GEN_VEXT_VF(vfmacc_vf_h, 2)
3408 GEN_VEXT_VF(vfmacc_vf_w, 4)
3409 GEN_VEXT_VF(vfmacc_vf_d, 8)
3410 
3411 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3412 {
3413     return float16_muladd(a, b, d, float_muladd_negate_c |
3414                                    float_muladd_negate_product, s);
3415 }
3416 
3417 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3418 {
3419     return float32_muladd(a, b, d, float_muladd_negate_c |
3420                                    float_muladd_negate_product, s);
3421 }
3422 
3423 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3424 {
3425     return float64_muladd(a, b, d, float_muladd_negate_c |
3426                                    float_muladd_negate_product, s);
3427 }
3428 
3429 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3430 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3431 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3432 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3433 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3434 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3435 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3436 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3437 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3438 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3439 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3440 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3441 
3442 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3443 {
3444     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3445 }
3446 
3447 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3448 {
3449     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3450 }
3451 
3452 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3453 {
3454     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3455 }
3456 
3457 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3458 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3459 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3460 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3461 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3462 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3463 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3464 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3465 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3466 GEN_VEXT_VF(vfmsac_vf_h, 2)
3467 GEN_VEXT_VF(vfmsac_vf_w, 4)
3468 GEN_VEXT_VF(vfmsac_vf_d, 8)
3469 
3470 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3471 {
3472     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3473 }
3474 
3475 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3476 {
3477     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3478 }
3479 
3480 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3481 {
3482     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3483 }
3484 
3485 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3486 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3487 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3488 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3489 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3490 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3491 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3492 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3493 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3494 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3495 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3496 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3497 
3498 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3499 {
3500     return float16_muladd(d, b, a, 0, s);
3501 }
3502 
3503 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3504 {
3505     return float32_muladd(d, b, a, 0, s);
3506 }
3507 
3508 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3509 {
3510     return float64_muladd(d, b, a, 0, s);
3511 }
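
/*
 * Note the operand order: OPFVV3/OPFVF3 pass (vs2, vs1, vd) to OP, so the
 * fmacc* helpers above compute vs1 * vs2 + vd (vfmacc), while the fmadd*
 * helpers swap the accumulator and the addend via muladd(d, b, a) to get
 * vd * vs1 + vs2 (vfmadd).  The negate_product/negate_c flags then give
 * the nmacc/msac/nmsac and nmadd/msub/nmsub variants.
 */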
3512 
3513 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3514 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3515 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3516 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3517 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3518 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3519 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3520 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3521 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3522 GEN_VEXT_VF(vfmadd_vf_h, 2)
3523 GEN_VEXT_VF(vfmadd_vf_w, 4)
3524 GEN_VEXT_VF(vfmadd_vf_d, 8)
3525 
3526 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3527 {
3528     return float16_muladd(d, b, a, float_muladd_negate_c |
3529                                    float_muladd_negate_product, s);
3530 }
3531 
3532 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3533 {
3534     return float32_muladd(d, b, a, float_muladd_negate_c |
3535                                    float_muladd_negate_product, s);
3536 }
3537 
3538 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3539 {
3540     return float64_muladd(d, b, a, float_muladd_negate_c |
3541                                    float_muladd_negate_product, s);
3542 }
3543 
3544 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3545 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3546 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3547 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3548 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3549 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3550 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3551 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3552 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3553 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3554 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3555 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3556 
3557 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3558 {
3559     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3560 }
3561 
3562 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3563 {
3564     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3565 }
3566 
3567 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3568 {
3569     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3570 }
3571 
3572 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3573 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3574 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3575 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3576 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3577 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3578 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3579 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3580 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3581 GEN_VEXT_VF(vfmsub_vf_h, 2)
3582 GEN_VEXT_VF(vfmsub_vf_w, 4)
3583 GEN_VEXT_VF(vfmsub_vf_d, 8)
3584 
3585 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3586 {
3587     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3588 }
3589 
3590 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3591 {
3592     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3593 }
3594 
3595 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3596 {
3597     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3598 }
3599 
3600 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3601 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3602 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3603 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3604 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3605 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3606 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3607 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3608 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3609 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3610 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3611 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3612 
3613 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3614 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3615 {
3616     return float32_muladd(float16_to_float32(a, true, s),
3617                           float16_to_float32(b, true, s), d, 0, s);
3618 }
3619 
3620 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3621 {
3622     return float64_muladd(float32_to_float64(a, s),
3623                           float32_to_float64(b, s), d, 0, s);
3624 }
3625 
3626 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3627 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3628 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3629 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3630 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3631 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3632 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3633 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3634 
3635 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3636 {
3637     return float32_muladd(bfloat16_to_float32(a, s),
3638                           bfloat16_to_float32(b, s), d, 0, s);
3639 }
3640 
3641 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3642 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3643 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3644 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3645 
3646 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3647 {
3648     return float32_muladd(float16_to_float32(a, true, s),
3649                           float16_to_float32(b, true, s), d,
3650                           float_muladd_negate_c | float_muladd_negate_product,
3651                           s);
3652 }
3653 
3654 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3655 {
3656     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3657                           d, float_muladd_negate_c |
3658                              float_muladd_negate_product, s);
3659 }
3660 
3661 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3662 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3663 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3664 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3665 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3666 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3667 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3668 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3669 
3670 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3671 {
3672     return float32_muladd(float16_to_float32(a, true, s),
3673                           float16_to_float32(b, true, s), d,
3674                           float_muladd_negate_c, s);
3675 }
3676 
3677 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3678 {
3679     return float64_muladd(float32_to_float64(a, s),
3680                           float32_to_float64(b, s), d,
3681                           float_muladd_negate_c, s);
3682 }
3683 
3684 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3685 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3686 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3687 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3688 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3689 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3690 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3691 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3692 
3693 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3694 {
3695     return float32_muladd(float16_to_float32(a, true, s),
3696                           float16_to_float32(b, true, s), d,
3697                           float_muladd_negate_product, s);
3698 }
3699 
3700 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3701 {
3702     return float64_muladd(float32_to_float64(a, s),
3703                           float32_to_float64(b, s), d,
3704                           float_muladd_negate_product, s);
3705 }
3706 
3707 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3708 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3709 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3710 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3711 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3712 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3713 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3714 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3715 
3716 /* Vector Floating-Point Square-Root Instruction */
3717 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3718 static void do_##NAME(void *vd, void *vs2, int i,      \
3719                       CPURISCVState *env)              \
3720 {                                                      \
3721     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3722     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3723 }
3724 
3725 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3726 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3727                   CPURISCVState *env, uint32_t desc)   \
3728 {                                                      \
3729     uint32_t vm = vext_vm(desc);                       \
3730     uint32_t vl = env->vl;                             \
3731     uint32_t total_elems =                             \
3732         vext_get_total_elems(env, desc, ESZ);          \
3733     uint32_t vta = vext_vta(desc);                     \
3734     uint32_t vma = vext_vma(desc);                     \
3735     uint32_t i;                                        \
3736                                                        \
3737     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3738                                                        \
3739     if (vl == 0) {                                     \
3740         return;                                        \
3741     }                                                  \
3742     for (i = env->vstart; i < vl; i++) {               \
3743         if (!vm && !vext_elem_mask(v0, i)) {           \
3744             /* set masked-off elements to 1s */        \
3745             vext_set_elems_1s(vd, vma, i * ESZ,        \
3746                               (i + 1) * ESZ);          \
3747             continue;                                  \
3748         }                                              \
3749         do_##NAME(vd, vs2, i, env);                    \
3750     }                                                  \
3751     env->vstart = 0;                                   \
3752     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3753                       total_elems * ESZ);              \
3754 }
3755 
3756 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3757 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3758 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3759 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3760 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3761 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3762 
3763 /*
3764  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3765  *
3766  * Adapted from riscv-v-spec recip.c:
3767  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3768  */
3769 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3770 {
3771     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3772     uint64_t exp = extract64(f, frac_size, exp_size);
3773     uint64_t frac = extract64(f, 0, frac_size);
3774 
3775     const uint8_t lookup_table[] = {
3776         52, 51, 50, 48, 47, 46, 44, 43,
3777         42, 41, 40, 39, 38, 36, 35, 34,
3778         33, 32, 31, 30, 30, 29, 28, 27,
3779         26, 25, 24, 23, 23, 22, 21, 20,
3780         19, 19, 18, 17, 16, 16, 15, 14,
3781         14, 13, 12, 12, 11, 10, 10, 9,
3782         9, 8, 7, 7, 6, 6, 5, 4,
3783         4, 3, 3, 2, 2, 1, 1, 0,
3784         127, 125, 123, 121, 119, 118, 116, 114,
3785         113, 111, 109, 108, 106, 105, 103, 102,
3786         100, 99, 97, 96, 95, 93, 92, 91,
3787         90, 88, 87, 86, 85, 84, 83, 82,
3788         80, 79, 78, 77, 76, 75, 74, 73,
3789         72, 71, 70, 70, 69, 68, 67, 66,
3790         65, 64, 63, 63, 62, 61, 60, 59,
3791         59, 58, 57, 56, 56, 55, 54, 53
3792     };
3793     const int precision = 7;
3794 
3795     if (exp == 0 && frac != 0) { /* subnormal */
3796         /* Normalize the subnormal. */
3797         while (extract64(frac, frac_size - 1, 1) == 0) {
3798             exp--;
3799             frac <<= 1;
3800         }
3801 
3802         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3803     }
3804 
3805     int idx = ((exp & 1) << (precision - 1)) |
3806               (frac >> (frac_size - precision + 1));
3807     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3808                         (frac_size - precision);
3809     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3810 
3811     uint64_t val = 0;
3812     val = deposit64(val, 0, frac_size, out_frac);
3813     val = deposit64(val, frac_size, exp_size, out_exp);
3814     val = deposit64(val, frac_size + exp_size, 1, sign);
3815     return val;
3816 }
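
/*
 * Example: for a float32 input of 4.0 (exp = 129, frac = 0) the index is
 * ((129 & 1) << 6) | 0 = 64, lookup_table[64] = 127, and the output
 * exponent is (3 * 127 - 129 - 1) / 2 = 125, giving about 0.498 against
 * the exact 1/sqrt(4) = 0.5, i.e. within the 7-bit estimate precision.
 */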
3817 
3818 static float16 frsqrt7_h(float16 f, float_status *s)
3819 {
3820     int exp_size = 5, frac_size = 10;
3821     bool sign = float16_is_neg(f);
3822 
3823     /*
3824      * frsqrt7(sNaN) = canonical NaN
3825      * frsqrt7(-inf) = canonical NaN
3826      * frsqrt7(-normal) = canonical NaN
3827      * frsqrt7(-subnormal) = canonical NaN
3828      */
3829     if (float16_is_signaling_nan(f, s) ||
3830         (float16_is_infinity(f) && sign) ||
3831         (float16_is_normal(f) && sign) ||
3832         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3833         s->float_exception_flags |= float_flag_invalid;
3834         return float16_default_nan(s);
3835     }
3836 
3837     /* frsqrt7(qNaN) = canonical NaN */
3838     if (float16_is_quiet_nan(f, s)) {
3839         return float16_default_nan(s);
3840     }
3841 
3842     /* frsqrt7(+-0) = +-inf */
3843     if (float16_is_zero(f)) {
3844         s->float_exception_flags |= float_flag_divbyzero;
3845         return float16_set_sign(float16_infinity, sign);
3846     }
3847 
3848     /* frsqrt7(+inf) = +0 */
3849     if (float16_is_infinity(f) && !sign) {
3850         return float16_set_sign(float16_zero, sign);
3851     }
3852 
3853     /* +normal, +subnormal */
3854     uint64_t val = frsqrt7(f, exp_size, frac_size);
3855     return make_float16(val);
3856 }
3857 
3858 static float32 frsqrt7_s(float32 f, float_status *s)
3859 {
3860     int exp_size = 8, frac_size = 23;
3861     bool sign = float32_is_neg(f);
3862 
3863     /*
3864      * frsqrt7(sNaN) = canonical NaN
3865      * frsqrt7(-inf) = canonical NaN
3866      * frsqrt7(-normal) = canonical NaN
3867      * frsqrt7(-subnormal) = canonical NaN
3868      */
3869     if (float32_is_signaling_nan(f, s) ||
3870         (float32_is_infinity(f) && sign) ||
3871         (float32_is_normal(f) && sign) ||
3872         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3873         s->float_exception_flags |= float_flag_invalid;
3874         return float32_default_nan(s);
3875     }
3876 
3877     /* frsqrt7(qNaN) = canonical NaN */
3878     if (float32_is_quiet_nan(f, s)) {
3879         return float32_default_nan(s);
3880     }
3881 
3882     /* frsqrt7(+-0) = +-inf */
3883     if (float32_is_zero(f)) {
3884         s->float_exception_flags |= float_flag_divbyzero;
3885         return float32_set_sign(float32_infinity, sign);
3886     }
3887 
3888     /* frsqrt7(+inf) = +0 */
3889     if (float32_is_infinity(f) && !sign) {
3890         return float32_set_sign(float32_zero, sign);
3891     }
3892 
3893     /* +normal, +subnormal */
3894     uint64_t val = frsqrt7(f, exp_size, frac_size);
3895     return make_float32(val);
3896 }
3897 
3898 static float64 frsqrt7_d(float64 f, float_status *s)
3899 {
3900     int exp_size = 11, frac_size = 52;
3901     bool sign = float64_is_neg(f);
3902 
3903     /*
3904      * frsqrt7(sNaN) = canonical NaN
3905      * frsqrt7(-inf) = canonical NaN
3906      * frsqrt7(-normal) = canonical NaN
3907      * frsqrt7(-subnormal) = canonical NaN
3908      */
3909     if (float64_is_signaling_nan(f, s) ||
3910         (float64_is_infinity(f) && sign) ||
3911         (float64_is_normal(f) && sign) ||
3912         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3913         s->float_exception_flags |= float_flag_invalid;
3914         return float64_default_nan(s);
3915     }
3916 
3917     /* frsqrt7(qNaN) = canonical NaN */
3918     if (float64_is_quiet_nan(f, s)) {
3919         return float64_default_nan(s);
3920     }
3921 
3922     /* frsqrt7(+-0) = +-inf */
3923     if (float64_is_zero(f)) {
3924         s->float_exception_flags |= float_flag_divbyzero;
3925         return float64_set_sign(float64_infinity, sign);
3926     }
3927 
3928     /* frsqrt7(+inf) = +0 */
3929     if (float64_is_infinity(f) && !sign) {
3930         return float64_set_sign(float64_zero, sign);
3931     }
3932 
3933     /* +normal, +subnormal */
3934     uint64_t val = frsqrt7(f, exp_size, frac_size);
3935     return make_float64(val);
3936 }
3937 
3938 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3939 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3940 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3941 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3942 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3943 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3944 
3945 /*
3946  * Vector Floating-Point Reciprocal Estimate Instruction
3947  *
3948  * Adapted from riscv-v-spec recip.c:
3949  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3950  */
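/*
 * Illustrative worked example (not from the spec text): for the float32
 * input 2.0 (sign 0, exp 128, frac 0) the table index below is 0, so
 * out_frac = 127 << 16 and out_exp wraps to 2 * 127 + ~128 = 125.  The
 * packed result is 0x3EFF0000 = 0.498046875, i.e. a reciprocal estimate
 * of 1/2 accurate to about 7 bits.
 */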
3951 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3952                       float_status *s)
3953 {
3954     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3955     uint64_t exp = extract64(f, frac_size, exp_size);
3956     uint64_t frac = extract64(f, 0, frac_size);
3957 
3958     const uint8_t lookup_table[] = {
3959         127, 125, 123, 121, 119, 117, 116, 114,
3960         112, 110, 109, 107, 105, 104, 102, 100,
3961         99, 97, 96, 94, 93, 91, 90, 88,
3962         87, 85, 84, 83, 81, 80, 79, 77,
3963         76, 75, 74, 72, 71, 70, 69, 68,
3964         66, 65, 64, 63, 62, 61, 60, 59,
3965         58, 57, 56, 55, 54, 53, 52, 51,
3966         50, 49, 48, 47, 46, 45, 44, 43,
3967         42, 41, 40, 40, 39, 38, 37, 36,
3968         35, 35, 34, 33, 32, 31, 31, 30,
3969         29, 28, 28, 27, 26, 25, 25, 24,
3970         23, 23, 22, 21, 21, 20, 19, 19,
3971         18, 17, 17, 16, 15, 15, 14, 14,
3972         13, 12, 12, 11, 11, 10, 9, 9,
3973         8, 8, 7, 7, 6, 5, 5, 4,
3974         4, 3, 3, 2, 2, 1, 1, 0
3975     };
3976     const int precision = 7;
3977 
3978     if (exp == 0 && frac != 0) { /* subnormal */
3979         /* Normalize the subnormal. */
3980         while (extract64(frac, frac_size - 1, 1) == 0) {
3981             exp--;
3982             frac <<= 1;
3983         }
3984 
3985         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3986 
3987         if (exp != 0 && exp != UINT64_MAX) {
3988             /*
3989              * Overflow to inf or max value of same sign,
3990              * depending on sign and rounding mode.
3991              */
3992             s->float_exception_flags |= (float_flag_inexact |
3993                                          float_flag_overflow);
3994 
3995             if ((s->float_rounding_mode == float_round_to_zero) ||
3996                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3997                 ((s->float_rounding_mode == float_round_up) && sign)) {
3998                 /* Return greatest/negative finite value. */
3999                 return (sign << (exp_size + frac_size)) |
4000                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4001             } else {
4002                 /* Return +-inf. */
4003                 return (sign << (exp_size + frac_size)) |
4004                        MAKE_64BIT_MASK(frac_size, exp_size);
4005             }
4006         }
4007     }
4008 
4009     int idx = frac >> (frac_size - precision);
4010     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4011                         (frac_size - precision);
4012     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4013 
4014     if (out_exp == 0 || out_exp == UINT64_MAX) {
4015         /*
4016          * The result is subnormal, but don't raise the underflow exception,
4017          * because there's no additional loss of precision.
4018          */
4019         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4020         if (out_exp == UINT64_MAX) {
4021             out_frac >>= 1;
4022             out_exp = 0;
4023         }
4024     }
4025 
4026     uint64_t val = 0;
4027     val = deposit64(val, 0, frac_size, out_frac);
4028     val = deposit64(val, frac_size, exp_size, out_exp);
4029     val = deposit64(val, frac_size + exp_size, 1, sign);
4030     return val;
4031 }
4032 
4033 static float16 frec7_h(float16 f, float_status *s)
4034 {
4035     int exp_size = 5, frac_size = 10;
4036     bool sign = float16_is_neg(f);
4037 
4038     /* frec7(+-inf) = +-0 */
4039     if (float16_is_infinity(f)) {
4040         return float16_set_sign(float16_zero, sign);
4041     }
4042 
4043     /* frec7(+-0) = +-inf */
4044     if (float16_is_zero(f)) {
4045         s->float_exception_flags |= float_flag_divbyzero;
4046         return float16_set_sign(float16_infinity, sign);
4047     }
4048 
4049     /* frec7(sNaN) = canonical NaN */
4050     if (float16_is_signaling_nan(f, s)) {
4051         s->float_exception_flags |= float_flag_invalid;
4052         return float16_default_nan(s);
4053     }
4054 
4055     /* frec7(qNaN) = canonical NaN */
4056     if (float16_is_quiet_nan(f, s)) {
4057         return float16_default_nan(s);
4058     }
4059 
4060     /* +-normal, +-subnormal */
4061     uint64_t val = frec7(f, exp_size, frac_size, s);
4062     return make_float16(val);
4063 }
4064 
4065 static float32 frec7_s(float32 f, float_status *s)
4066 {
4067     int exp_size = 8, frac_size = 23;
4068     bool sign = float32_is_neg(f);
4069 
4070     /* frec7(+-inf) = +-0 */
4071     if (float32_is_infinity(f)) {
4072         return float32_set_sign(float32_zero, sign);
4073     }
4074 
4075     /* frec7(+-0) = +-inf */
4076     if (float32_is_zero(f)) {
4077         s->float_exception_flags |= float_flag_divbyzero;
4078         return float32_set_sign(float32_infinity, sign);
4079     }
4080 
4081     /* frec7(sNaN) = canonical NaN */
4082     if (float32_is_signaling_nan(f, s)) {
4083         s->float_exception_flags |= float_flag_invalid;
4084         return float32_default_nan(s);
4085     }
4086 
4087     /* frec7(qNaN) = canonical NaN */
4088     if (float32_is_quiet_nan(f, s)) {
4089         return float32_default_nan(s);
4090     }
4091 
4092     /* +-normal, +-subnormal */
4093     uint64_t val = frec7(f, exp_size, frac_size, s);
4094     return make_float32(val);
4095 }
4096 
4097 static float64 frec7_d(float64 f, float_status *s)
4098 {
4099     int exp_size = 11, frac_size = 52;
4100     bool sign = float64_is_neg(f);
4101 
4102     /* frec7(+-inf) = +-0 */
4103     if (float64_is_infinity(f)) {
4104         return float64_set_sign(float64_zero, sign);
4105     }
4106 
4107     /* frec7(+-0) = +-inf */
4108     if (float64_is_zero(f)) {
4109         s->float_exception_flags |= float_flag_divbyzero;
4110         return float64_set_sign(float64_infinity, sign);
4111     }
4112 
4113     /* frec7(sNaN) = canonical NaN */
4114     if (float64_is_signaling_nan(f, s)) {
4115         s->float_exception_flags |= float_flag_invalid;
4116         return float64_default_nan(s);
4117     }
4118 
4119     /* frec7(qNaN) = canonical NaN */
4120     if (float64_is_quiet_nan(f, s)) {
4121         return float64_default_nan(s);
4122     }
4123 
4124     /* +-normal, +-subnormal */
4125     uint64_t val = frec7(f, exp_size, frac_size, s);
4126     return make_float64(val);
4127 }
4128 
4129 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4130 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4131 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4132 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4133 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4134 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4135 
4136 /* Vector Floating-Point MIN/MAX Instructions */
4137 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4138 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4139 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4140 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4141 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4142 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4143 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4144 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4145 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4146 GEN_VEXT_VF(vfmin_vf_h, 2)
4147 GEN_VEXT_VF(vfmin_vf_w, 4)
4148 GEN_VEXT_VF(vfmin_vf_d, 8)
4149 
4150 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4151 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4152 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4153 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4154 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4155 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4156 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4157 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4158 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4159 GEN_VEXT_VF(vfmax_vf_h, 2)
4160 GEN_VEXT_VF(vfmax_vf_w, 4)
4161 GEN_VEXT_VF(vfmax_vf_d, 8)
4162 
4163 /* Vector Floating-Point Sign-Injection Instructions */
4164 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4165 {
4166     return deposit64(b, 0, 15, a);
4167 }
4168 
4169 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4170 {
4171     return deposit64(b, 0, 31, a);
4172 }
4173 
4174 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4175 {
4176     return deposit64(b, 0, 63, a);
4177 }
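/*
 * Illustrative example: fsgnj16(a = 0x3C00 (+1.0), b = 0x8000 (-0.0))
 * keeps the sign bit of b and the exponent/mantissa of a, so
 * deposit64(0x8000, 0, 15, 0x3C00) = 0xBC00 = -1.0.
 */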
4178 
4179 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4180 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4181 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4182 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4183 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4184 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4185 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4186 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4187 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4188 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4189 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4190 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4191 
4192 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4193 {
4194     return deposit64(~b, 0, 15, a);
4195 }
4196 
4197 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4198 {
4199     return deposit64(~b, 0, 31, a);
4200 }
4201 
4202 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4203 {
4204     return deposit64(~b, 0, 63, a);
4205 }
4206 
4207 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4208 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4209 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4210 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4211 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4212 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4213 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4214 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4215 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4216 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4217 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4218 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4219 
4220 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4221 {
4222     return deposit64(b ^ a, 0, 15, a);
4223 }
4224 
4225 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4226 {
4227     return deposit64(b ^ a, 0, 31, a);
4228 }
4229 
4230 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4231 {
4232     return deposit64(b ^ a, 0, 63, a);
4233 }
4234 
4235 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4236 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4237 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4238 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4239 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4240 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4241 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4242 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4243 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4244 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4245 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4246 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4247 
4248 /* Vector Floating-Point Compare Instructions */
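/*
 * Note: vmfeq/vmfne below use the quiet compare predicates (no invalid
 * flag for quiet NaN operands), while vmflt/vmfle/vmfgt/vmfge use the
 * signaling compares, matching the IEEE 754 relational semantics.
 */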
4249 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4250 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4251                   CPURISCVState *env, uint32_t desc)          \
4252 {                                                             \
4253     uint32_t vm = vext_vm(desc);                              \
4254     uint32_t vl = env->vl;                                    \
4255     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4256     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4257     uint32_t vma = vext_vma(desc);                            \
4258     uint32_t i;                                               \
4259                                                               \
4260     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4261                                                               \
4262     for (i = env->vstart; i < vl; i++) {                      \
4263         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4264         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4265         if (!vm && !vext_elem_mask(v0, i)) {                  \
4266             /* set masked-off elements to 1s */               \
4267             if (vma) {                                        \
4268                 vext_set_elem_mask(vd, i, 1);                 \
4269             }                                                 \
4270             continue;                                         \
4271         }                                                     \
4272         vext_set_elem_mask(vd, i,                             \
4273                            DO_OP(s2, s1, &env->fp_status));   \
4274     }                                                         \
4275     env->vstart = 0;                                          \
4276     /*
4277      * mask destination registers are always tail-agnostic
4278      * set tail elements to 1s
4279      */                                                       \
4280     if (vta_all_1s) {                                         \
4281         for (; i < total_elems; i++) {                        \
4282             vext_set_elem_mask(vd, i, 1);                     \
4283         }                                                     \
4284     }                                                         \
4285 }
4286 
4287 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4288 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4289 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4290 
4291 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4292 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4293                   CPURISCVState *env, uint32_t desc)                \
4294 {                                                                   \
4295     uint32_t vm = vext_vm(desc);                                    \
4296     uint32_t vl = env->vl;                                          \
4297     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4298     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4299     uint32_t vma = vext_vma(desc);                                  \
4300     uint32_t i;                                                     \
4301                                                                     \
4302     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4303                                                                     \
4304     for (i = env->vstart; i < vl; i++) {                            \
4305         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4306         if (!vm && !vext_elem_mask(v0, i)) {                        \
4307             /* set masked-off elements to 1s */                     \
4308             if (vma) {                                              \
4309                 vext_set_elem_mask(vd, i, 1);                       \
4310             }                                                       \
4311             continue;                                               \
4312         }                                                           \
4313         vext_set_elem_mask(vd, i,                                   \
4314                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4315     }                                                               \
4316     env->vstart = 0;                                                \
4317     /*
4318      * mask destination registers are always tail-agnostic
4319      * set tail elements to 1s
4320      */                                                             \
4321     if (vta_all_1s) {                                               \
4322         for (; i < total_elems; i++) {                              \
4323             vext_set_elem_mask(vd, i, 1);                           \
4324         }                                                           \
4325     }                                                               \
4326 }
4327 
4328 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4329 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4330 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4331 
4332 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4333 {
4334     FloatRelation compare = float16_compare_quiet(a, b, s);
4335     return compare != float_relation_equal;
4336 }
4337 
4338 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4339 {
4340     FloatRelation compare = float32_compare_quiet(a, b, s);
4341     return compare != float_relation_equal;
4342 }
4343 
4344 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4345 {
4346     FloatRelation compare = float64_compare_quiet(a, b, s);
4347     return compare != float_relation_equal;
4348 }
4349 
4350 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4351 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4352 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4353 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4354 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4355 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4356 
4357 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4358 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4359 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4360 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4361 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4362 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4363 
4364 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4365 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4366 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4367 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4368 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4369 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4370 
4371 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4372 {
4373     FloatRelation compare = float16_compare(a, b, s);
4374     return compare == float_relation_greater;
4375 }
4376 
4377 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4378 {
4379     FloatRelation compare = float32_compare(a, b, s);
4380     return compare == float_relation_greater;
4381 }
4382 
4383 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4384 {
4385     FloatRelation compare = float64_compare(a, b, s);
4386     return compare == float_relation_greater;
4387 }
4388 
4389 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4390 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4391 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4392 
4393 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4394 {
4395     FloatRelation compare = float16_compare(a, b, s);
4396     return compare == float_relation_greater ||
4397            compare == float_relation_equal;
4398 }
4399 
4400 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4401 {
4402     FloatRelation compare = float32_compare(a, b, s);
4403     return compare == float_relation_greater ||
4404            compare == float_relation_equal;
4405 }
4406 
4407 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4408 {
4409     FloatRelation compare = float64_compare(a, b, s);
4410     return compare == float_relation_greater ||
4411            compare == float_relation_equal;
4412 }
4413 
4414 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4415 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4416 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4417 
4418 /* Vector Floating-Point Classify Instruction */
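/*
 * The result is the standard one-hot RISC-V fclass mask:
 *   bit 0: -inf         bit 5: +subnormal
 *   bit 1: -normal      bit 6: +normal
 *   bit 2: -subnormal   bit 7: +inf
 *   bit 3: -0           bit 8: signaling NaN
 *   bit 4: +0           bit 9: quiet NaN
 */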
4419 target_ulong fclass_h(uint64_t frs1)
4420 {
4421     float16 f = frs1;
4422     bool sign = float16_is_neg(f);
4423 
4424     if (float16_is_infinity(f)) {
4425         return sign ? 1 << 0 : 1 << 7;
4426     } else if (float16_is_zero(f)) {
4427         return sign ? 1 << 3 : 1 << 4;
4428     } else if (float16_is_zero_or_denormal(f)) {
4429         return sign ? 1 << 2 : 1 << 5;
4430     } else if (float16_is_any_nan(f)) {
4431         float_status s = { }; /* for snan_bit_is_one */
4432         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4433     } else {
4434         return sign ? 1 << 1 : 1 << 6;
4435     }
4436 }
4437 
4438 target_ulong fclass_s(uint64_t frs1)
4439 {
4440     float32 f = frs1;
4441     bool sign = float32_is_neg(f);
4442 
4443     if (float32_is_infinity(f)) {
4444         return sign ? 1 << 0 : 1 << 7;
4445     } else if (float32_is_zero(f)) {
4446         return sign ? 1 << 3 : 1 << 4;
4447     } else if (float32_is_zero_or_denormal(f)) {
4448         return sign ? 1 << 2 : 1 << 5;
4449     } else if (float32_is_any_nan(f)) {
4450         float_status s = { }; /* for snan_bit_is_one */
4451         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4452     } else {
4453         return sign ? 1 << 1 : 1 << 6;
4454     }
4455 }
4456 
4457 target_ulong fclass_d(uint64_t frs1)
4458 {
4459     float64 f = frs1;
4460     bool sign = float64_is_neg(f);
4461 
4462     if (float64_is_infinity(f)) {
4463         return sign ? 1 << 0 : 1 << 7;
4464     } else if (float64_is_zero(f)) {
4465         return sign ? 1 << 3 : 1 << 4;
4466     } else if (float64_is_zero_or_denormal(f)) {
4467         return sign ? 1 << 2 : 1 << 5;
4468     } else if (float64_is_any_nan(f)) {
4469         float_status s = { }; /* for snan_bit_is_one */
4470         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4471     } else {
4472         return sign ? 1 << 1 : 1 << 6;
4473     }
4474 }
4475 
4476 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4477 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4478 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4479 GEN_VEXT_V(vfclass_v_h, 2)
4480 GEN_VEXT_V(vfclass_v_w, 4)
4481 GEN_VEXT_V(vfclass_v_d, 8)
4482 
4483 /* Vector Floating-Point Merge Instruction */
4484 
4485 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4486 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4487                   CPURISCVState *env, uint32_t desc)          \
4488 {                                                             \
4489     uint32_t vm = vext_vm(desc);                              \
4490     uint32_t vl = env->vl;                                    \
4491     uint32_t esz = sizeof(ETYPE);                             \
4492     uint32_t total_elems =                                    \
4493         vext_get_total_elems(env, desc, esz);                 \
4494     uint32_t vta = vext_vta(desc);                            \
4495     uint32_t i;                                               \
4496                                                               \
4497     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4498                                                               \
4499     for (i = env->vstart; i < vl; i++) {                      \
4500         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4501         *((ETYPE *)vd + H(i)) =                               \
4502             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4503     }                                                         \
4504     env->vstart = 0;                                          \
4505     /* set tail elements to 1s */                             \
4506     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4507 }
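/*
 * Illustrative example: with mask v0 = {1, 0, 1, 0} and vl = 4,
 * vfmerge.vfm produces vd = {f[rs1], vs2[1], f[rs1], vs2[3]}.
 */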
4508 
4509 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4510 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4511 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4512 
4513 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4514 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4515 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4516 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4517 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4518 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4519 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4520 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4521 
4522 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4523 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4524 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4525 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4526 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4527 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4528 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4529 
4530 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4531 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4532 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4533 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4534 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4535 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4536 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4537 
4538 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4539 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4540 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4541 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4542 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4543 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4544 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4545 
4546 /* Widening Floating-Point/Integer Type-Convert Instructions */
4547 /* (TD, T2, TX2) */
4548 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4549 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4550 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4551 /*
4552  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4553  */
4554 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4555 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4556 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4557 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4558 
4559 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4560 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4561 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4562 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4563 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4564 
4565 /*
4566  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4567  */
4568 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4569 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4570 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4571 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4572 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4573 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4574 
4575 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4576 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4577 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4578 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4579 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4580 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4581 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4582 
4583 /*
4584  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4585  */
4586 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4587 {
4588     return float16_to_float32(a, true, s);
4589 }
4590 
4591 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4592 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4593 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4594 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4595 
4596 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4597 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4598 
4599 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4600 /* (TD, T2, TX2) */
4601 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4602 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4603 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4604 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4605 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4606 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4607 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4608 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4609 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4610 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4611 
4612 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4613 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4614 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4615 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4616 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4617 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4618 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4619 
4620 /*
4621  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4622  */
4623 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4624 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4625 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4626 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4627 
4628 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4629 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4630 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4631 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4632 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4633 
4634 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4635 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4636 {
4637     return float32_to_float16(a, true, s);
4638 }
4639 
4640 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4641 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4642 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4643 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4644 
4645 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4646 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4647 
4648 /*
4649  * Vector Reduction Operations
4650  */
4651 /* Vector Single-Width Integer Reduction Instructions */
4652 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4653 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4654                   void *vs2, CPURISCVState *env,          \
4655                   uint32_t desc)                          \
4656 {                                                         \
4657     uint32_t vm = vext_vm(desc);                          \
4658     uint32_t vl = env->vl;                                \
4659     uint32_t esz = sizeof(TD);                            \
4660     uint32_t vlenb = simd_maxsz(desc);                    \
4661     uint32_t vta = vext_vta(desc);                        \
4662     uint32_t i;                                           \
4663     TD s1 =  *((TD *)vs1 + HD(0));                        \
4664                                                           \
4665     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4666                                                           \
4667     for (i = env->vstart; i < vl; i++) {                  \
4668         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4669         if (!vm && !vext_elem_mask(v0, i)) {              \
4670             continue;                                     \
4671         }                                                 \
4672         s1 = OP(s1, (TD)s2);                              \
4673     }                                                     \
4674     if (vl > 0) {                                         \
4675         *((TD *)vd + HD(0)) = s1;                         \
4676     }                                                     \
4677     env->vstart = 0;                                      \
4678     /* set tail elements to 1s */                         \
4679     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4680 }
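/*
 * Illustrative example: with vl = 4, vs1[0] = 10 and vs2 = {1, 2, 3, 4},
 * vredsum.vs leaves vd[0] = 10 + 1 + 2 + 3 + 4 = 20; only element 0 of vd
 * carries the result, the remaining elements are treated as tail.
 */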
4681 
4682 /* vd[0] = sum(vs1[0], vs2[*]) */
4683 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4684 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4685 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4686 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4687 
4688 /* vd[0] = maxu(vs1[0], vs2[*]) */
4689 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4690 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4691 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4692 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4693 
4694 /* vd[0] = max(vs1[0], vs2[*]) */
4695 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4696 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4697 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4698 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4699 
4700 /* vd[0] = minu(vs1[0], vs2[*]) */
4701 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4702 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4703 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4704 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4705 
4706 /* vd[0] = min(vs1[0], vs2[*]) */
4707 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4708 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4709 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4710 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4711 
4712 /* vd[0] = and(vs1[0], vs2[*]) */
4713 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4714 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4715 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4716 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4717 
4718 /* vd[0] = or(vs1[0], vs2[*]) */
4719 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4720 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4721 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4722 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4723 
4724 /* vd[0] = xor(vs1[0], vs2[*]) */
4725 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4726 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4727 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4728 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4729 
4730 /* Vector Widening Integer Reduction Instructions */
4731 /* Signed sum reduction into double-width accumulator */
4732 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4733 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4734 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4735 
4736 /* Unsigned sum reduction into double-width accumulator */
4737 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4738 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4739 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4740 
4741 /* Vector Single-Width Floating-Point Reduction Instructions */
4742 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4743 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4744                   void *vs2, CPURISCVState *env,           \
4745                   uint32_t desc)                           \
4746 {                                                          \
4747     uint32_t vm = vext_vm(desc);                           \
4748     uint32_t vl = env->vl;                                 \
4749     uint32_t esz = sizeof(TD);                             \
4750     uint32_t vlenb = simd_maxsz(desc);                     \
4751     uint32_t vta = vext_vta(desc);                         \
4752     uint32_t i;                                            \
4753     TD s1 =  *((TD *)vs1 + HD(0));                         \
4754                                                            \
4755     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4756                                                            \
4757     for (i = env->vstart; i < vl; i++) {                   \
4758         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4759         if (!vm && !vext_elem_mask(v0, i)) {               \
4760             continue;                                      \
4761         }                                                  \
4762         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4763     }                                                      \
4764     if (vl > 0) {                                          \
4765         *((TD *)vd + HD(0)) = s1;                          \
4766     }                                                      \
4767     env->vstart = 0;                                       \
4768     /* set tail elements to 1s */                          \
4769     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4770 }
4771 
4772 /* Unordered sum */
4773 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4774 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4775 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4776 
4777 /* Ordered sum */
4778 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4779 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4780 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4781 
4782 /* Maximum value */
4783 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4784               float16_maximum_number)
4785 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4786               float32_maximum_number)
4787 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4788               float64_maximum_number)
4789 
4790 /* Minimum value */
4791 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4792               float16_minimum_number)
4793 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4794               float32_minimum_number)
4795 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4796               float64_minimum_number)
4797 
4798 /* Vector Widening Floating-Point Add Instructions */
4799 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4800 {
4801     return float32_add(a, float16_to_float32(b, true, s), s);
4802 }
4803 
4804 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4805 {
4806     return float64_add(a, float32_to_float64(b, s), s);
4807 }
4808 
4809 /* Vector Widening Floating-Point Reduction Instructions */
4810 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4811 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4812 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4813 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4814 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
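/*
 * Both the ordered and unordered widening sums reuse the same sequential
 * loop here; a strictly ordered accumulation is one of the reduction
 * orders the unordered form permits.
 */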
4815 
4816 /*
4817  * Vector Mask Operations
4818  */
4819 /* Vector Mask-Register Logical Instructions */
4820 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4821 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4822                   void *vs2, CPURISCVState *env,          \
4823                   uint32_t desc)                          \
4824 {                                                         \
4825     uint32_t vl = env->vl;                                \
4826     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4827     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4828     uint32_t i;                                           \
4829     int a, b;                                             \
4830                                                           \
4831     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4832                                                           \
4833     for (i = env->vstart; i < vl; i++) {                  \
4834         a = vext_elem_mask(vs1, i);                       \
4835         b = vext_elem_mask(vs2, i);                       \
4836         vext_set_elem_mask(vd, i, OP(b, a));              \
4837     }                                                     \
4838     env->vstart = 0;                                      \
4839     /*
4840      * mask destination registers are always tail-agnostic
4841      * set tail elements to 1s
4842      */                                                   \
4843     if (vta_all_1s) {                                     \
4844         for (; i < total_elems; i++) {                    \
4845             vext_set_elem_mask(vd, i, 1);                 \
4846         }                                                 \
4847     }                                                     \
4848 }
4849 
4850 #define DO_NAND(N, M)  (!(N & M))
4851 #define DO_ANDNOT(N, M)  (N & !M)
4852 #define DO_NOR(N, M)  (!(N | M))
4853 #define DO_ORNOT(N, M)  (N | !M)
4854 #define DO_XNOR(N, M)  (!(N ^ M))
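/*
 * The operands are single mask bits (0 or 1), so logical negation is used
 * rather than bitwise complement to keep each result in {0, 1}.
 */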
4855 
4856 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4857 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4858 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4859 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4860 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4861 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4862 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4863 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4864 
4865 /* Vector count population in mask (vcpop.m) */
4866 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4867                              uint32_t desc)
4868 {
4869     target_ulong cnt = 0;
4870     uint32_t vm = vext_vm(desc);
4871     uint32_t vl = env->vl;
4872     int i;
4873 
4874     for (i = env->vstart; i < vl; i++) {
4875         if (vm || vext_elem_mask(v0, i)) {
4876             if (vext_elem_mask(vs2, i)) {
4877                 cnt++;
4878             }
4879         }
4880     }
4881     env->vstart = 0;
4882     return cnt;
4883 }
4884 
4885 /* vfirst find-first-set mask bit */
4886 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4887                               uint32_t desc)
4888 {
4889     uint32_t vm = vext_vm(desc);
4890     uint32_t vl = env->vl;
4891     int i;
4892 
4893     for (i = env->vstart; i < vl; i++) {
4894         if (vm || vext_elem_mask(v0, i)) {
4895             if (vext_elem_mask(vs2, i)) {
4896                 return i;
4897             }
4898         }
4899     }
4900     env->vstart = 0;
4901     return -1LL;
4902 }
4903 
4904 enum set_mask_type {
4905     ONLY_FIRST = 1,
4906     INCLUDE_FIRST,
4907     BEFORE_FIRST,
4908 };
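/*
 * Illustrative example, element 0 rightmost, first set element of vs2 at
 * index 4:
 *   vs2:     ... 0 1 0 1 0 0 0 0
 *   vmsbf.m: ... 0 0 0 0 1 1 1 1   (BEFORE_FIRST)
 *   vmsif.m: ... 0 0 0 1 1 1 1 1   (INCLUDE_FIRST)
 *   vmsof.m: ... 0 0 0 1 0 0 0 0   (ONLY_FIRST)
 */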
4909 
4910 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4911                    uint32_t desc, enum set_mask_type type)
4912 {
4913     uint32_t vm = vext_vm(desc);
4914     uint32_t vl = env->vl;
4915     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4916     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4917     uint32_t vma = vext_vma(desc);
4918     int i;
4919     bool first_mask_bit = false;
4920 
4921     VSTART_CHECK_EARLY_EXIT(env, vl);
4922 
4923     for (i = env->vstart; i < vl; i++) {
4924         if (!vm && !vext_elem_mask(v0, i)) {
4925             /* set masked-off elements to 1s */
4926             if (vma) {
4927                 vext_set_elem_mask(vd, i, 1);
4928             }
4929             continue;
4930         }
4931         /* write a zero to all following active elements */
4932         if (first_mask_bit) {
4933             vext_set_elem_mask(vd, i, 0);
4934             continue;
4935         }
4936         if (vext_elem_mask(vs2, i)) {
4937             first_mask_bit = true;
4938             if (type == BEFORE_FIRST) {
4939                 vext_set_elem_mask(vd, i, 0);
4940             } else {
4941                 vext_set_elem_mask(vd, i, 1);
4942             }
4943         } else {
4944             if (type == ONLY_FIRST) {
4945                 vext_set_elem_mask(vd, i, 0);
4946             } else {
4947                 vext_set_elem_mask(vd, i, 1);
4948             }
4949         }
4950     }
4951     env->vstart = 0;
4952     /*
4953      * mask destination registers are always tail-agnostic
4954      * set tail elements to 1s
4955      */
4956     if (vta_all_1s) {
4957         for (; i < total_elems; i++) {
4958             vext_set_elem_mask(vd, i, 1);
4959         }
4960     }
4961 }
4962 
4963 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4964                      uint32_t desc)
4965 {
4966     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4967 }
4968 
4969 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4970                      uint32_t desc)
4971 {
4972     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4973 }
4974 
4975 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4976                      uint32_t desc)
4977 {
4978     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4979 }
4980 
4981 /* Vector Iota Instruction */
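/*
 * Illustrative example: with all elements active and vs2 mask bits
 * {1, 0, 0, 1, 0, 1}, viota.m writes the prefix population count that
 * excludes the current element: vd = {0, 1, 1, 1, 2, 2}.
 */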
4982 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4983 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4984                   uint32_t desc)                                          \
4985 {                                                                         \
4986     uint32_t vm = vext_vm(desc);                                          \
4987     uint32_t vl = env->vl;                                                \
4988     uint32_t esz = sizeof(ETYPE);                                         \
4989     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4990     uint32_t vta = vext_vta(desc);                                        \
4991     uint32_t vma = vext_vma(desc);                                        \
4992     uint32_t sum = 0;                                                     \
4993     int i;                                                                \
4994                                                                           \
4995     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
4996                                                                           \
4997     for (i = env->vstart; i < vl; i++) {                                  \
4998         if (!vm && !vext_elem_mask(v0, i)) {                              \
4999             /* set masked-off elements to 1s */                           \
5000             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5001             continue;                                                     \
5002         }                                                                 \
5003         *((ETYPE *)vd + H(i)) = sum;                                      \
5004         if (vext_elem_mask(vs2, i)) {                                     \
5005             sum++;                                                        \
5006         }                                                                 \
5007     }                                                                     \
5008     env->vstart = 0;                                                      \
5009     /* set tail elements to 1s */                                         \
5010     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5011 }
5012 
5013 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5014 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5015 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5016 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
5017 
5018 /* Vector Element Index Instruction */
5019 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5020 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5021 {                                                                         \
5022     uint32_t vm = vext_vm(desc);                                          \
5023     uint32_t vl = env->vl;                                                \
5024     uint32_t esz = sizeof(ETYPE);                                         \
5025     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5026     uint32_t vta = vext_vta(desc);                                        \
5027     uint32_t vma = vext_vma(desc);                                        \
5028     int i;                                                                \
5029                                                                           \
5030     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5031                                                                           \
5032     for (i = env->vstart; i < vl; i++) {                                  \
5033         if (!vm && !vext_elem_mask(v0, i)) {                              \
5034             /* set masked-off elements to 1s */                           \
5035             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5036             continue;                                                     \
5037         }                                                                 \
5038         *((ETYPE *)vd + H(i)) = i;                                        \
5039     }                                                                     \
5040     env->vstart = 0;                                                      \
5041     /* set tail elements to 1s */                                         \
5042     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5043 }
5044 
5045 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5046 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5047 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5048 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5049 
5050 /*
5051  * Vector Permutation Instructions
5052  */
5053 
5054 /* Vector Slide Instructions */
5055 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5056 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5057                   CPURISCVState *env, uint32_t desc)                      \
5058 {                                                                         \
5059     uint32_t vm = vext_vm(desc);                                          \
5060     uint32_t vl = env->vl;                                                \
5061     uint32_t esz = sizeof(ETYPE);                                         \
5062     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5063     uint32_t vta = vext_vta(desc);                                        \
5064     uint32_t vma = vext_vma(desc);                                        \
5065     target_ulong offset = s1, i_min, i;                                   \
5066                                                                           \
5067     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5068                                                                           \
5069     i_min = MAX(env->vstart, offset);                                     \
5070     for (i = i_min; i < vl; i++) {                                        \
5071         if (!vm && !vext_elem_mask(v0, i)) {                              \
5072             /* set masked-off elements to 1s */                           \
5073             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5074             continue;                                                     \
5075         }                                                                 \
5076         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5077     }                                                                     \
5078     env->vstart = 0;                                                      \
5079     /* set tail elements to 1s */                                         \
5080     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5081 }
5082 
5083 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5084 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5085 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5086 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5087 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
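/*
 * Illustrative example: with vl = 4 and rs1 = 2, vslideup.vx writes
 * vd[2] = vs2[0] and vd[3] = vs2[1]; elements below the offset (vd[0] and
 * vd[1]) are left unchanged.
 */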
5088 
5089 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5090 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5091                   CPURISCVState *env, uint32_t desc)                      \
5092 {                                                                         \
5093     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5094     uint32_t vm = vext_vm(desc);                                          \
5095     uint32_t vl = env->vl;                                                \
5096     uint32_t esz = sizeof(ETYPE);                                         \
5097     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5098     uint32_t vta = vext_vta(desc);                                        \
5099     uint32_t vma = vext_vma(desc);                                        \
5100     target_ulong i_max, i_min, i;                                         \
5101                                                                           \
5102     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5103                                                                           \
5104     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5105     i_max = MAX(i_min, env->vstart);                                      \
5106     for (i = env->vstart; i < i_max; ++i) {                               \
5107         if (!vm && !vext_elem_mask(v0, i)) {                              \
5108             /* set masked-off elements to 1s */                           \
5109             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5110             continue;                                                     \
5111         }                                                                 \
5112         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5113     }                                                                     \
5114                                                                           \
5115     for (i = i_max; i < vl; ++i) {                                        \
5116         if (vm || vext_elem_mask(v0, i)) {                                \
5117             *((ETYPE *)vd + H(i)) = 0;                                    \
5118         }                                                                 \
5119     }                                                                     \
5120                                                                           \
5121     env->vstart = 0;                                                      \
5122     /* set tail elements to 1s */                                         \
5123     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5124 }
5125 
5126 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5127 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5128 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5129 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5130 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
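/*
 * Illustrative example: with vl = VLMAX = 4 and rs1 = 2, vslidedown.vx
 * gives vd = {vs2[2], vs2[3], 0, 0}; source indices at or beyond VLMAX
 * read as zero.
 */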
5131 
5132 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5133 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5134                                  void *vs2, CPURISCVState *env,             \
5135                                  uint32_t desc)                             \
5136 {                                                                           \
5137     typedef uint##BITWIDTH##_t ETYPE;                                       \
5138     uint32_t vm = vext_vm(desc);                                            \
5139     uint32_t vl = env->vl;                                                  \
5140     uint32_t esz = sizeof(ETYPE);                                           \
5141     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5142     uint32_t vta = vext_vta(desc);                                          \
5143     uint32_t vma = vext_vma(desc);                                          \
5144     uint32_t i;                                                             \
5145                                                                             \
5146     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5147                                                                             \
5148     for (i = env->vstart; i < vl; i++) {                                    \
5149         if (!vm && !vext_elem_mask(v0, i)) {                                \
5150             /* set masked-off elements to 1s */                             \
5151             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5152             continue;                                                       \
5153         }                                                                   \
5154         if (i == 0) {                                                       \
5155             *((ETYPE *)vd + H(i)) = s1;                                     \
5156         } else {                                                            \
5157             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5158         }                                                                   \
5159     }                                                                       \
5160     env->vstart = 0;                                                        \
5161     /* set tail elements to 1s */                                           \
5162     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5163 }
5164 
5165 GEN_VEXT_VSLIDE1UP(8,  H1)
5166 GEN_VEXT_VSLIDE1UP(16, H2)
5167 GEN_VEXT_VSLIDE1UP(32, H4)
5168 GEN_VEXT_VSLIDE1UP(64, H8)
5169 
5170 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5171 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5172                   CPURISCVState *env, uint32_t desc)              \
5173 {                                                                 \
5174     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5175 }
5176 
5177 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5178 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5179 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5180 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5181 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
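
/*
 * Illustrative sketch with hypothetical names: the unmasked
 * vslide1up.vx semantics on a plain 32-bit array.  Element 0 receives
 * the scalar and every other element is its predecessor from vs2;
 * mask, tail and vstart handling are left out.
 */
static inline void ref_vslide1up_u32(uint32_t *vd, const uint32_t *vs2,
                                     uint32_t s1, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (i == 0) ? s1 : vs2[i - 1];
    }
}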
5182 
5183 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5184 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5185                                    void *vs2, CPURISCVState *env,             \
5186                                    uint32_t desc)                             \
5187 {                                                                             \
5188     typedef uint##BITWIDTH##_t ETYPE;                                         \
5189     uint32_t vm = vext_vm(desc);                                              \
5190     uint32_t vl = env->vl;                                                    \
5191     uint32_t esz = sizeof(ETYPE);                                             \
5192     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5193     uint32_t vta = vext_vta(desc);                                            \
5194     uint32_t vma = vext_vma(desc);                                            \
5195     uint32_t i;                                                               \
5196                                                                               \
5197     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5198                                                                               \
5199     for (i = env->vstart; i < vl; i++) {                                      \
5200         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5201             /* set masked-off elements to 1s */                               \
5202             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5203             continue;                                                         \
5204         }                                                                     \
5205         if (i == vl - 1) {                                                    \
5206             *((ETYPE *)vd + H(i)) = s1;                                       \
5207         } else {                                                              \
5208             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5209         }                                                                     \
5210     }                                                                         \
5211     env->vstart = 0;                                                          \
5212     /* set tail elements to 1s */                                             \
5213     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5214 }
5215 
5216 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5217 GEN_VEXT_VSLIDE1DOWN(16, H2)
5218 GEN_VEXT_VSLIDE1DOWN(32, H4)
5219 GEN_VEXT_VSLIDE1DOWN(64, H8)
5220 
5221 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5222 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5223                   CPURISCVState *env, uint32_t desc)              \
5224 {                                                                 \
5225     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5226 }
5227 
5228 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5229 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5230 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5231 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5232 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
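
/*
 * Illustrative sketch with hypothetical names: the unmasked
 * vslide1down.vx semantics on a plain 32-bit array.  The last active
 * element receives the scalar and every other element is its
 * successor from vs2; mask, tail and vstart handling are left out.
 */
static inline void ref_vslide1down_u32(uint32_t *vd, const uint32_t *vs2,
                                       uint32_t s1, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (i == vl - 1) ? s1 : vs2[i + 1];
    }
}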
5233 
5234 /* Vector Floating-Point Slide Instructions */
5235 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5236 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5237                   CPURISCVState *env, uint32_t desc)          \
5238 {                                                             \
5239     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5240 }
5241 
5242 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5243 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5244 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5245 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5246 
5247 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5248 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5249                   CPURISCVState *env, uint32_t desc)          \
5250 {                                                             \
5251     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5252 }
5253 
5254 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5255 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5256 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5257 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
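
/*
 * Illustrative sketch with hypothetical names: the vfslide1* helpers
 * above reuse the integer slide1 routines on the raw bit pattern of
 * f[rs1], so no FP arithmetic or exception flags are involved.  A
 * single-precision reference model of the unmasked behaviour:
 */
static inline void ref_vfslide1up_f32(uint32_t *vd, const uint32_t *vs2,
                                      float fs1, uint32_t vl)
{
    uint32_t bits, i;

    memcpy(&bits, &fs1, sizeof(bits));    /* move the scalar as raw bits */
    for (i = 0; i < vl; i++) {
        vd[i] = (i == 0) ? bits : vs2[i - 1];
    }
}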
5258 
5259 /* Vector Register Gather Instructions */
5260 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5261 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5262                   CPURISCVState *env, uint32_t desc)                      \
5263 {                                                                         \
5264     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5265     uint32_t vm = vext_vm(desc);                                          \
5266     uint32_t vl = env->vl;                                                \
5267     uint32_t esz = sizeof(TS2);                                           \
5268     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5269     uint32_t vta = vext_vta(desc);                                        \
5270     uint32_t vma = vext_vma(desc);                                        \
5271     uint64_t index;                                                       \
5272     uint32_t i;                                                           \
5273                                                                           \
5274     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5275                                                                           \
5276     for (i = env->vstart; i < vl; i++) {                                  \
5277         if (!vm && !vext_elem_mask(v0, i)) {                              \
5278             /* set masked-off elements to 1s */                           \
5279             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5280             continue;                                                     \
5281         }                                                                 \
5282         index = *((TS1 *)vs1 + HS1(i));                                   \
5283         if (index >= vlmax) {                                             \
5284             *((TS2 *)vd + HS2(i)) = 0;                                    \
5285         } else {                                                          \
5286             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5287         }                                                                 \
5288     }                                                                     \
5289     env->vstart = 0;                                                      \
5290     /* set tail elements to 1s */                                         \
5291     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5292 }
5293 
5294 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5295 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5296 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5297 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5298 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5299 
5300 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5301 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5302 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5303 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
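
/*
 * Illustrative sketch with hypothetical names: vrgatherei16.vv with
 * SEW=32 on plain arrays.  Each destination element is selected from
 * vs2 by the 16-bit index in vs1, with out-of-range indices yielding
 * zero.  Source and destination must not overlap; mask, tail and
 * vstart handling are omitted.
 */
static inline void ref_vrgatherei16_u32(uint32_t *vd, const uint16_t *vs1,
                                        const uint32_t *vs2, uint32_t vl,
                                        uint32_t vlmax)
{
    uint64_t index;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        index = vs1[i];
        vd[i] = (index >= vlmax) ? 0 : vs2[index];
    }
}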
5304 
5305 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5306 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5307                   CPURISCVState *env, uint32_t desc)                      \
5308 {                                                                         \
5309     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5310     uint32_t vm = vext_vm(desc);                                          \
5311     uint32_t vl = env->vl;                                                \
5312     uint32_t esz = sizeof(ETYPE);                                         \
5313     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5314     uint32_t vta = vext_vta(desc);                                        \
5315     uint32_t vma = vext_vma(desc);                                        \
5316     uint64_t index = s1;                                                  \
5317     uint32_t i;                                                           \
5318                                                                           \
5319     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5320                                                                           \
5321     for (i = env->vstart; i < vl; i++) {                                  \
5322         if (!vm && !vext_elem_mask(v0, i)) {                              \
5323             /* set masked-off elements to 1s */                           \
5324             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5325             continue;                                                     \
5326         }                                                                 \
5327         if (index >= vlmax) {                                             \
5328             *((ETYPE *)vd + H(i)) = 0;                                    \
5329         } else {                                                          \
5330             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5331         }                                                                 \
5332     }                                                                     \
5333     env->vstart = 0;                                                      \
5334     /* set tail elements to 1s */                                         \
5335     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5336 }
5337 
5338 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5339 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5340 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5341 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5342 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
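
/*
 * Illustrative sketch with hypothetical names: vrgather.vx broadcasts a
 * single source element, vs2[x[rs1]], to every active destination
 * element, or zero when the index is out of range.
 */
static inline void ref_vrgather_vx_u32(uint32_t *vd, const uint32_t *vs2,
                                       uint64_t index, uint32_t vl,
                                       uint32_t vlmax)
{
    uint32_t val = (index >= vlmax) ? 0 : vs2[index];
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = val;
    }
}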
5343 
5344 /* Vector Compress Instruction */
5345 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5346 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5347                   CPURISCVState *env, uint32_t desc)                      \
5348 {                                                                         \
5349     uint32_t vl = env->vl;                                                \
5350     uint32_t esz = sizeof(ETYPE);                                         \
5351     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5352     uint32_t vta = vext_vta(desc);                                        \
5353     uint32_t num = 0, i;                                                  \
5354                                                                           \
5355     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5356                                                                           \
5357     for (i = env->vstart; i < vl; i++) {                                  \
5358         if (!vext_elem_mask(vs1, i)) {                                    \
5359             continue;                                                     \
5360         }                                                                 \
5361         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5362         num++;                                                            \
5363     }                                                                     \
5364     env->vstart = 0;                                                      \
5365     /* set tail elements to 1s */                                         \
5366     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5367 }
5368 
5369 /* Compress into vd the elements of vs2 whose corresponding bit in vs1 is set */
5370 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5371 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5372 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5373 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
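
/*
 * Illustrative sketch with hypothetical names: vcompress.vm on plain
 * arrays, assuming the mask is packed one bit per element, LSB first.
 * Active elements of vs2 are written contiguously to vd from index 0;
 * the return value is the number of elements produced (the remainder
 * of vd is tail).
 */
static inline uint32_t ref_vcompress_u32(uint32_t *vd, const uint8_t *mask,
                                         const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (mask[i / 8] & (1u << (i % 8))) { /* bit i of vs1 selects vs2[i] */
            vd[num++] = vs2[i];
        }
    }
    return num;
}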
5374 
5375 /* Vector Whole Register Move */
5376 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5377 {
5378     /* EEW = SEW */
5379     uint32_t maxsz = simd_maxsz(desc);
5380     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5381     uint32_t startb = env->vstart * sewb;
5382     uint32_t i = startb;
5383 
5384     if (startb >= maxsz) {
5385         env->vstart = 0;
5386         return;
5387     }
5388 
5389     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5390         uint32_t j = ROUND_UP(i, 8);
5391         memcpy((uint8_t *)vd + H1(j - 1),
5392                (uint8_t *)vs2 + H1(j - 1),
5393                j - i);
5394         i = j;
5395     }
5396 
5397     memcpy((uint8_t *)vd + H1(i),
5398            (uint8_t *)vs2 + H1(i),
5399            maxsz - i);
5400 
5401     env->vstart = 0;
5402 }
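
/*
 * Illustrative sketch with hypothetical parameter names: on a
 * little-endian host the whole-register move above reduces to a single
 * byte copy of the remaining NREG * VLENB bytes, resuming at
 * vstart * SEW / 8.  The big-endian H1() fixup is omitted.
 */
static inline void ref_vmvr(uint8_t *vd, const uint8_t *vs2,
                            uint32_t startb, uint32_t maxsz)
{
    if (startb < maxsz) {
        memcpy(vd + startb, vs2 + startb, maxsz - startb);
    }
}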
5403 
5404 /* Vector Integer Extension */
5405 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5406 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5407                   CPURISCVState *env, uint32_t desc)             \
5408 {                                                                \
5409     uint32_t vl = env->vl;                                       \
5410     uint32_t vm = vext_vm(desc);                                 \
5411     uint32_t esz = sizeof(ETYPE);                                \
5412     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5413     uint32_t vta = vext_vta(desc);                               \
5414     uint32_t vma = vext_vma(desc);                               \
5415     uint32_t i;                                                  \
5416                                                                  \
5417     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5418                                                                  \
5419     for (i = env->vstart; i < vl; i++) {                         \
5420         if (!vm && !vext_elem_mask(v0, i)) {                     \
5421             /* set masked-off elements to 1s */                  \
5422             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5423             continue;                                            \
5424         }                                                        \
5425         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5426     }                                                            \
5427     env->vstart = 0;                                             \
5428     /* set tail elements to 1s */                                \
5429     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5430 }
5431 
5432 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5433 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5434 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5435 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5436 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5437 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5438 
5439 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5440 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5441 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5442 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5443 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5444 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
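
/*
 * Illustrative sketch with hypothetical names: vzext.vf2/vsext.vf2 with
 * SEW=32 on plain arrays.  The widening is performed by ordinary C
 * integer conversion on assignment; mask, tail and vstart handling are
 * omitted.
 */
static inline void ref_vzext_vf2_w(uint32_t *vd, const uint16_t *vs2,
                                   uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = vs2[i];              /* zero extension via unsigned type */
    }
}

static inline void ref_vsext_vf2_w(int32_t *vd, const int16_t *vs2,
                                   uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = vs2[i];              /* sign extension via signed type */
    }
}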
5445