xref: /qemu/target/riscv/vector_helper.c (revision e240f6cc25917f3138d9e95e0343ae23b63a3f8c)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "accel/tcg/cpu-ldst.h"
25 #include "accel/tcg/probe.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "exec/tlb-flags.h"
29 #include "exec/target_page.h"
30 #include "exec/tswap.h"
31 #include "fpu/softfloat.h"
32 #include "tcg/tcg-gvec-desc.h"
33 #include "internals.h"
34 #include "vector_internals.h"
35 #include <math.h>
36 
37 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
38                             target_ulong s2, target_ulong x0)
39 {
40     int vlmax, vl;
41     RISCVCPU *cpu = env_archcpu(env);
42     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
43     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
44     uint16_t sew = 8 << vsew;
45     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
46     int xlen = riscv_cpu_xlen(env);
47     bool vill = (s2 >> (xlen - 1)) & 0x1;
48     target_ulong reserved = s2 &
49                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
50                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
51     uint16_t vlen = cpu->cfg.vlenb << 3;
52     int8_t lmul;
53 
54     if (vlmul & 4) {
55         /*
56          * Fractional LMUL, check:
57          *
58          * VLEN * LMUL >= SEW
59          * VLEN >> (8 - lmul) >= sew
60          * (vlenb << 3) >> (8 - lmul) >= sew
61          */
62         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
63             vill = true;
64         }
65     }
66 
67     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
68         /* only set vill bit. */
69         env->vill = 1;
70         env->vtype = 0;
71         env->vl = 0;
72         env->vstart = 0;
73         return 0;
74     }
75 
76     /* lmul encoded as in DisasContext::lmul */
77     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
78     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
79     if (s1 <= vlmax) {
80         vl = s1;
81     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
82         vl = (s1 + 1) >> 1;
83     } else {
84         vl = vlmax;
85     }
86 
87     if (cpu->cfg.rvv_vsetvl_x0_vill && x0 && (env->vl != vl)) {
88         /* only set vill bit. */
89         env->vill = 1;
90         env->vtype = 0;
91         env->vl = 0;
92         env->vstart = 0;
93         return 0;
94     }
95 
96     env->vl = vl;
97     env->vtype = s2;
98     env->vstart = 0;
99     env->vill = 0;
100     return vl;
101 }
102 
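/*
 * Illustrative sketch (not part of the upstream QEMU source): the AVL-to-VL
 * mapping implemented above, pulled out into a hypothetical standalone helper.
 * With VLEN = 128, SEW = 32 and LMUL = 2, VLMAX = (VLEN / SEW) * LMUL = 8, so
 * an AVL of 10 yields vl = 5 when rvv_vl_half_avl is set (ceil(10 / 2)) and
 * vl = 8 otherwise.
 */
#if 0   /* sketch only, not compiled */
static int vsetvl_example_vl(int avl, int vlmax, bool rvv_vl_half_avl)
{
    if (avl <= vlmax) {
        return avl;                /* AVL fits in one vector */
    } else if (avl < 2 * vlmax && rvv_vl_half_avl) {
        return (avl + 1) >> 1;     /* ceil(AVL / 2), balances the two strips */
    }
    return vlmax;                  /* otherwise clamp to VLMAX */
}
/* vsetvl_example_vl(10, 8, true) == 5, vsetvl_example_vl(10, 8, false) == 8 */
#endif
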
103 /*
104  * Get the maximum number of elements that can be operated on.
105  *
106  * log2_esz: log2 of element size in bytes.
107  */
108 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
109 {
110     /*
111      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
112      * so vlen in bytes (vlenb) is encoded as maxsz.
113      */
114     uint32_t vlenb = simd_maxsz(desc);
115 
116     /* Return VLMAX */
117     int scale = vext_lmul(desc) - log2_esz;
118     return scale < 0 ? vlenb >> -scale : vlenb << scale;
119 }
120 
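/*
 * Illustrative sketch (not part of the upstream source): the shift used above,
 * as a hypothetical helper. With vlenb = 16 (VLEN = 128) and 4-byte elements
 * (log2_esz = 2), LMUL = 1 is encoded by vext_lmul() as 0, so scale = -2 and
 * VLMAX = 16 >> 2 = 4; LMUL = 8 is encoded as 3, so scale = 1 and VLMAX = 32.
 */
#if 0   /* sketch only, not compiled */
static uint32_t vlmax_example(uint32_t vlenb, int lmul_log2, uint32_t log2_esz)
{
    int scale = lmul_log2 - (int)log2_esz;
    return scale < 0 ? vlenb >> -scale : vlenb << scale;
}
/* vlmax_example(16, 0, 2) == 4, vlmax_example(16, 3, 2) == 32 */
#endif
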
121 /*
122  * This function checks watchpoints before the real load operation.
123  *
124  * In system mode, the TLB API probe_access is enough for the watchpoint check.
125  * In user mode, there is currently no watchpoint support.
126  *
127  * It will trigger an exception if there is no mapping in the TLB
128  * and the page table walk can't fill the TLB entry. The guest
129  * software can then return here after processing the exception, or never return.
130  *
131  * This function can also be used when direct access to probe_access_flags is
132  * needed in order to access the flags. If a pointer to a flags operand is
133  * provided, the function will call probe_access_flags instead, using nonfault
134  * and updating host and flags.
135  */
136 static void probe_pages(CPURISCVState *env, target_ulong addr, target_ulong len,
137                         uintptr_t ra, MMUAccessType access_type, int mmu_index,
138                         void **host, int *flags, bool nonfault)
139 {
140     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
141     target_ulong curlen = MIN(pagelen, len);
142 
143     if (flags != NULL) {
144         *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
145                                     access_type, mmu_index, nonfault, host, ra);
146     } else {
147         probe_access(env, adjust_addr(env, addr), curlen, access_type,
148                      mmu_index, ra);
149     }
150 
151     if (len > curlen) {
152         addr += curlen;
153         curlen = len - curlen;
154         if (flags != NULL) {
155             *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
156                                         access_type, mmu_index, nonfault,
157                                         host, ra);
158         } else {
159             probe_access(env, adjust_addr(env, addr), curlen, access_type,
160                          mmu_index, ra);
161         }
162     }
163 }
164 
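/*
 * Illustrative sketch (not part of the upstream source) of the page-split
 * arithmetic used by probe_pages() above: pagelen = -(addr | TARGET_PAGE_MASK)
 * is the number of bytes left in the current page. EXAMPLE_PAGE_MASK and
 * probe_split_example() are hypothetical; with 4 KiB pages, a 16-byte access
 * at 0x1ff8 is probed as 8 bytes at 0x1ff8 followed by 8 bytes at 0x2000.
 */
#if 0   /* sketch only, not compiled */
#define EXAMPLE_PAGE_MASK ((target_ulong)-4096)          /* assumes 4 KiB pages */

static void probe_split_example(target_ulong addr, target_ulong len,
                                target_ulong *first, target_ulong *second)
{
    target_ulong pagelen = -(addr | EXAMPLE_PAGE_MASK);  /* bytes left in page */
    *first = MIN(pagelen, len);                          /* probed at addr */
    *second = len - *first;                              /* probed at addr + *first */
}
/* probe_split_example(0x1ff8, 16, &a, &b) yields a == 8, b == 8 */
#endif
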
165 
166 static inline void vext_set_elem_mask(void *v0, int index,
167                                       uint8_t value)
168 {
169     int idx = index / 64;
170     int pos = index % 64;
171     uint64_t old = ((uint64_t *)v0)[idx];
172     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
173 }
174 
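/*
 * Illustrative sketch (not part of the upstream source) of the mask layout
 * assumed by vext_set_elem_mask() above: mask element i is bit (i % 64) of
 * 64-bit word i / 64 of v0, so element 70 is bit 6 of word 1.
 */
#if 0   /* sketch only, not compiled */
static bool mask_bit_example(const uint64_t *v0, int index)
{
    return (v0[index / 64] >> (index % 64)) & 1;
}
#endif
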
175 /* element operations for load and store */
176 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
177                                    uint32_t idx, void *vd, uintptr_t retaddr);
178 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
179 
180 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
181 static inline QEMU_ALWAYS_INLINE                            \
182 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
183                 uint32_t idx, void *vd, uintptr_t retaddr)  \
184 {                                                           \
185     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
186     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
187 }                                                           \
188                                                             \
189 static inline QEMU_ALWAYS_INLINE                            \
190 void NAME##_host(void *vd, uint32_t idx, void *host)        \
191 {                                                           \
192     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
193     *cur = (ETYPE)LDSUF##_p(host);                          \
194 }
195 
196 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
197 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
198 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
199 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
200 
201 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
202 static inline QEMU_ALWAYS_INLINE                            \
203 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
204                 uint32_t idx, void *vd, uintptr_t retaddr)  \
205 {                                                           \
206     ETYPE data = *((ETYPE *)vd + H(idx));                   \
207     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
208 }                                                           \
209                                                             \
210 static inline QEMU_ALWAYS_INLINE                            \
211 void NAME##_host(void *vd, uint32_t idx, void *host)        \
212 {                                                           \
213     ETYPE data = *((ETYPE *)vd + H(idx));                   \
214     STSUF##_p(host, data);                                  \
215 }
216 
217 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
218 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
219 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
220 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
221 
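/*
 * For reference, an editor-expanded view (not literally present in the source)
 * of what GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw) above produces: one
 * element loader that goes through the TLB helpers and one that dereferences a
 * host pointer directly.
 */
#if 0   /* expansion shown for exposition only, not compiled */
static inline QEMU_ALWAYS_INLINE
void lde_h_tlb(CPURISCVState *env, abi_ptr addr,
               uint32_t idx, void *vd, uintptr_t retaddr)
{
    uint16_t *cur = ((uint16_t *)vd + H2(idx));
    *cur = cpu_lduw_data_ra(env, addr, retaddr);    /* MMU/TLB path */
}

static inline QEMU_ALWAYS_INLINE
void lde_h_host(void *vd, uint32_t idx, void *host)
{
    uint16_t *cur = ((uint16_t *)vd + H2(idx));
    *cur = (uint16_t)lduw_p(host);                  /* direct host-pointer path */
}
#endif
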
222 static inline QEMU_ALWAYS_INLINE void
223 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
224                        void *vd, uint32_t evl, target_ulong addr,
225                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
226                        bool is_load)
227 {
228     uint32_t i;
229     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
230         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
231     }
232 }
233 
234 static inline QEMU_ALWAYS_INLINE void
235 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
236                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
237                         uint32_t esz, bool is_load)
238 {
239 #if HOST_BIG_ENDIAN
240     for (; reg_start < evl; reg_start++, host += esz) {
241         ldst_host(vd, reg_start, host);
242     }
243 #else
244     if (esz == 1) {
245         uint32_t byte_offset = reg_start * esz;
246         uint32_t size = (evl - reg_start) * esz;
247 
248         if (is_load) {
249             memcpy(vd + byte_offset, host, size);
250         } else {
251             memcpy(host, vd + byte_offset, size);
252         }
253     } else {
254         for (; reg_start < evl; reg_start++, host += esz) {
255             ldst_host(vd, reg_start, host);
256         }
257     }
258 #endif
259 }
260 
261 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
262                                    uint32_t desc, uint32_t nf,
263                                    uint32_t esz, uint32_t max_elems)
264 {
265     uint32_t vta = vext_vta(desc);
266     int k;
267 
268     if (vta == 0) {
269         return;
270     }
271 
272     for (k = 0; k < nf; ++k) {
273         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
274                           (k * max_elems + max_elems) * esz);
275     }
276 }
277 
278 /*
279  * stride: access vector elements from strided memory
280  */
281 static void
282 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
283                  CPURISCVState *env, uint32_t desc, uint32_t vm,
284                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
285                  uintptr_t ra)
286 {
287     uint32_t i, k;
288     uint32_t nf = vext_nf(desc);
289     uint32_t max_elems = vext_max_elems(desc, log2_esz);
290     uint32_t esz = 1 << log2_esz;
291     uint32_t vma = vext_vma(desc);
292 
293     VSTART_CHECK_EARLY_EXIT(env, env->vl);
294 
295     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
296         k = 0;
297         while (k < nf) {
298             if (!vm && !vext_elem_mask(v0, i)) {
299                 /* set masked-off elements to 1s */
300                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
301                                   (i + k * max_elems + 1) * esz);
302                 k++;
303                 continue;
304             }
305             target_ulong addr = base + stride * i + (k << log2_esz);
306             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
307             k++;
308         }
309     }
310     env->vstart = 0;
311 
312     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
313 }
314 
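/*
 * Illustrative sketch (not part of the upstream source) of the effective
 * address formed in the loop above: field k of element i of a strided segment
 * access lives at base + stride * i + k * esz. stride_addr_example() is
 * hypothetical; with esz = 4, nf = 2 and stride = 64, the second field of
 * element 3 sits at base + 196.
 */
#if 0   /* sketch only, not compiled */
static target_ulong stride_addr_example(target_ulong base, target_ulong stride,
                                        uint32_t i, uint32_t k, uint32_t log2_esz)
{
    return base + stride * i + (k << log2_esz);     /* same formula as above */
}
/* stride_addr_example(base, 64, 3, 1, 2) == base + 196 */
#endif
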
315 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
316 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
317                   target_ulong stride, CPURISCVState *env,              \
318                   uint32_t desc)                                        \
319 {                                                                       \
320     uint32_t vm = vext_vm(desc);                                        \
321     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
322                      ctzl(sizeof(ETYPE)), GETPC());                     \
323 }
324 
325 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
326 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
327 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
328 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
329 
330 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
331 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
332                   target_ulong stride, CPURISCVState *env,              \
333                   uint32_t desc)                                        \
334 {                                                                       \
335     uint32_t vm = vext_vm(desc);                                        \
336     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
337                      ctzl(sizeof(ETYPE)), GETPC());                     \
338 }
339 
340 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
341 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
342 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
343 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
344 
345 /*
346  * unit-stride: access elements stored contiguously in memory
347  */
348 
349 /* unmasked unit-stride load and store operation */
350 static inline QEMU_ALWAYS_INLINE void
351 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
352                   uint32_t elems, uint32_t nf, uint32_t max_elems,
353                   uint32_t log2_esz, bool is_load, int mmu_index,
354                   vext_ldst_elem_fn_tlb *ldst_tlb,
355                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
356 {
357     void *host;
358     int i, k, flags;
359     uint32_t esz = 1 << log2_esz;
360     uint32_t size = (elems * nf) << log2_esz;
361     uint32_t evl = env->vstart + elems;
362     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
363 
364     /* Check page permission/pmp/watchpoint/etc. */
365     probe_pages(env, addr, size, ra, access_type, mmu_index, &host, &flags,
366                 true);
367 
368     if (flags == 0) {
369         if (nf == 1) {
370             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
371                                       host, esz, is_load);
372         } else {
373             for (i = env->vstart; i < evl; ++i) {
374                 k = 0;
375                 while (k < nf) {
376                     ldst_host(vd, i + k * max_elems, host);
377                     host += esz;
378                     k++;
379                 }
380             }
381         }
382         env->vstart += elems;
383     } else {
384         if (nf == 1) {
385             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
386                                    ra, esz, is_load);
387         } else {
388             /* load bytes from guest memory */
389             for (i = env->vstart; i < evl; env->vstart = ++i) {
390                 k = 0;
391                 while (k < nf) {
392                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
393                              vd, ra);
394                     addr += esz;
395                     k++;
396                 }
397             }
398         }
399     }
400 }
401 
402 static inline QEMU_ALWAYS_INLINE void
403 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
404              vext_ldst_elem_fn_tlb *ldst_tlb,
405              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
406              uint32_t evl, uintptr_t ra, bool is_load)
407 {
408     uint32_t k;
409     target_ulong page_split, elems, addr;
410     uint32_t nf = vext_nf(desc);
411     uint32_t max_elems = vext_max_elems(desc, log2_esz);
412     uint32_t esz = 1 << log2_esz;
413     uint32_t msize = nf * esz;
414     int mmu_index = riscv_env_mmu_index(env, false);
415 
416     VSTART_CHECK_EARLY_EXIT(env, evl);
417 
418 #if defined(CONFIG_USER_ONLY)
419     /*
420      * For data sizes <= 6 bytes we get better performance by simply calling
421      * vext_continuous_ldst_tlb
422      */
423     if (nf == 1 && (evl << log2_esz) <= 6) {
424         addr = base + (env->vstart << log2_esz);
425         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
426                                  esz, is_load);
427 
428         env->vstart = 0;
429         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
430         return;
431     }
432 #endif
433 
434     /* Calculate the page range of first page */
435     addr = base + ((env->vstart * nf) << log2_esz);
436     page_split = -(addr | TARGET_PAGE_MASK);
437     /* Get number of elements */
438     elems = page_split / msize;
439     if (unlikely(env->vstart + elems >= evl)) {
440         elems = evl - env->vstart;
441     }
442 
443     /* Load/store elements in the first page */
444     if (likely(elems)) {
445         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
446                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
447     }
448 
449     /* Load/store elements in the second page */
450     if (unlikely(env->vstart < evl)) {
451         /* Cross page element */
452         if (unlikely(page_split % msize)) {
453             for (k = 0; k < nf; k++) {
454                 addr = base + ((env->vstart * nf + k) << log2_esz);
455                 ldst_tlb(env, adjust_addr(env, addr),
456                         env->vstart + k * max_elems, vd, ra);
457             }
458             env->vstart++;
459         }
460 
461         addr = base + ((env->vstart * nf) << log2_esz);
462         /* Get number of elements of second page */
463         elems = evl - env->vstart;
464 
465         /* Load/store elements in the second page */
466         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
467                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
468     }
469 
470     env->vstart = 0;
471     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
472 }
473 
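/*
 * Illustrative sketch (not part of the upstream source) of the first-page
 * split above. us_split_example() is hypothetical; with 4 KiB pages, nf = 2
 * and esz = 4 (msize = 8), an access starting 20 bytes before a page boundary
 * gives page_split = 20 and 20 / 8 = 2 whole segments in the first page. The
 * remaining 4 bytes mean page_split % msize != 0, so one segment straddles the
 * boundary and is handled field by field via ldst_tlb before the second page.
 */
#if 0   /* sketch only, not compiled */
static void us_split_example(target_ulong page_split, uint32_t nf, uint32_t esz,
                             uint32_t *first_page_segs, bool *crosses_page)
{
    uint32_t msize = nf * esz;
    *first_page_segs = page_split / msize;       /* whole segments before the boundary */
    *crosses_page = (page_split % msize) != 0;   /* one segment straddles the page */
}
/* us_split_example(20, 2, 4, &n, &c) yields n == 2, c == true */
#endif
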
474 /*
475  * A masked unit-stride load or store operation is a special case of the
476  * strided operation, with stride = NF * sizeof(ETYPE).
477  */
478 
479 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
480 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
481                          CPURISCVState *env, uint32_t desc)         \
482 {                                                                   \
483     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
484     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
485                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
486 }                                                                   \
487                                                                     \
488 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
489                   CPURISCVState *env, uint32_t desc)                \
490 {                                                                   \
491     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
492                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
493 }
494 
495 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
496 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
497 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
498 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
499 
500 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
501 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
502                          CPURISCVState *env, uint32_t desc)              \
503 {                                                                        \
504     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
505     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
506                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
507 }                                                                        \
508                                                                          \
509 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
510                   CPURISCVState *env, uint32_t desc)                     \
511 {                                                                        \
512     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
513                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
514 }
515 
516 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
517 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
518 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
519 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
520 
521 /*
522  * unit-stride mask load and store, EEW = 1
523  */
524 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
525                     CPURISCVState *env, uint32_t desc)
526 {
527     /* evl = ceil(vl/8) */
528     uint8_t evl = (env->vl + 7) >> 3;
529     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
530                  0, evl, GETPC(), true);
531 }
532 
533 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
534                     CPURISCVState *env, uint32_t desc)
535 {
536     /* evl = ceil(vl/8) */
537     uint8_t evl = (env->vl + 7) >> 3;
538     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
539                  0, evl, GETPC(), false);
540 }
541 
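/*
 * Worked example (not part of the upstream source) of the effective length
 * used by vlm_v/vsm_v above: the mask occupies ceil(vl / 8) bytes, so vl = 17
 * gives evl = (17 + 7) >> 3 = 3.
 */
#if 0   /* sketch only, not compiled */
static uint32_t mask_evl_example(uint32_t vl)
{
    return (vl + 7) >> 3;       /* evl = ceil(vl / 8) bytes of mask */
}
/* mask_evl_example(17) == 3 */
#endif
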
542 /*
543  * index: access vector elements from indexed memory
544  */
545 typedef target_ulong vext_get_index_addr(target_ulong base,
546         uint32_t idx, void *vs2);
547 
548 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
549 static target_ulong NAME(target_ulong base,            \
550                          uint32_t idx, void *vs2)      \
551 {                                                      \
552     return (base + *((ETYPE *)vs2 + H(idx)));          \
553 }
554 
555 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
556 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
557 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
558 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
559 
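/*
 * Illustrative sketch (not part of the upstream source) of the address formed
 * for indexed accesses: the byte offset comes from vs2, read as an unsigned
 * integer of the index EEW, so for a 16-bit index the address of field k of
 * element i is base + ((uint16_t *)vs2)[i] + k * esz. idx_h_addr_example() is
 * hypothetical and omits the host-endian element ordering done by H2() above.
 */
#if 0   /* sketch only, not compiled */
static target_ulong idx_h_addr_example(target_ulong base, const uint16_t *vs2,
                                       uint32_t i, uint32_t k, uint32_t log2_esz)
{
    return base + vs2[i] + (k << log2_esz);
}
#endif
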
560 static inline void
561 vext_ldst_index(void *vd, void *v0, target_ulong base,
562                 void *vs2, CPURISCVState *env, uint32_t desc,
563                 vext_get_index_addr get_index_addr,
564                 vext_ldst_elem_fn_tlb *ldst_elem,
565                 uint32_t log2_esz, uintptr_t ra)
566 {
567     uint32_t i, k;
568     uint32_t nf = vext_nf(desc);
569     uint32_t vm = vext_vm(desc);
570     uint32_t max_elems = vext_max_elems(desc, log2_esz);
571     uint32_t esz = 1 << log2_esz;
572     uint32_t vma = vext_vma(desc);
573 
574     VSTART_CHECK_EARLY_EXIT(env, env->vl);
575 
576     /* load bytes from guest memory */
577     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
578         k = 0;
579         while (k < nf) {
580             if (!vm && !vext_elem_mask(v0, i)) {
581                 /* set masked-off elements to 1s */
582                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
583                                   (i + k * max_elems + 1) * esz);
584                 k++;
585                 continue;
586             }
587             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
588             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
589             k++;
590         }
591     }
592     env->vstart = 0;
593 
594     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
595 }
596 
597 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
598 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
599                   void *vs2, CPURISCVState *env, uint32_t desc)            \
600 {                                                                          \
601     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
602                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
603 }
604 
605 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
606 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
607 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
608 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
609 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
610 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
611 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
612 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
613 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
614 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
615 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
616 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
617 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
618 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
619 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
620 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
621 
622 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
623 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
624                   void *vs2, CPURISCVState *env, uint32_t desc)  \
625 {                                                                \
626     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
627                     STORE_FN, ctzl(sizeof(ETYPE)),               \
628                     GETPC());                                    \
629 }
630 
631 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
632 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
633 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
634 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
635 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
636 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
637 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
638 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
639 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
640 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
641 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
642 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
643 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
644 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
645 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
646 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
647 
648 /*
649  * unit-stride fault-only-first load instructions
650  */
651 static inline void
652 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
653           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
654           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
655 {
656     uint32_t i, k, vl = 0;
657     uint32_t nf = vext_nf(desc);
658     uint32_t vm = vext_vm(desc);
659     uint32_t max_elems = vext_max_elems(desc, log2_esz);
660     uint32_t esz = 1 << log2_esz;
661     uint32_t msize = nf * esz;
662     uint32_t vma = vext_vma(desc);
663     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
664     int mmu_index = riscv_env_mmu_index(env, false);
665     int flags, probe_flags;
666     void *host;
667 
668     VSTART_CHECK_EARLY_EXIT(env, env->vl);
669 
670     addr = base + ((env->vstart * nf) << log2_esz);
671     page_split = -(addr | TARGET_PAGE_MASK);
672     /* Get number of elements */
673     elems = page_split / msize;
674     if (unlikely(env->vstart + elems >= env->vl)) {
675         elems = env->vl - env->vstart;
676     }
677 
678     /* Check page permission/pmp/watchpoint/etc. */
679     probe_pages(env, addr, elems * msize, ra, MMU_DATA_LOAD, mmu_index, &host,
680                 &flags, true);
681 
682     /* If we are crossing a page check also the second page. */
683     if (env->vl > elems) {
684         addr_probe = addr + (elems << log2_esz);
685         probe_pages(env, addr_probe, elems * msize, ra, MMU_DATA_LOAD,
686                     mmu_index, &host, &probe_flags, true);
687         flags |= probe_flags;
688     }
689 
690     if (flags & ~TLB_WATCHPOINT) {
691         /* probe every access */
692         for (i = env->vstart; i < env->vl; i++) {
693             if (!vm && !vext_elem_mask(v0, i)) {
694                 continue;
695             }
696             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
697             if (i == 0) {
698                 /* Allow fault on first element. */
699                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD,
700                             mmu_index, &host, NULL, false);
701             } else {
702                 remain = nf << log2_esz;
703                 while (remain > 0) {
704                     offset = -(addr_i | TARGET_PAGE_MASK);
705 
706                     /* Probe nonfault on subsequent elements. */
707                     probe_pages(env, addr_i, offset, 0, MMU_DATA_LOAD,
708                                 mmu_index, &host, &flags, true);
709 
710                     /*
711                      * Stop if invalid (unmapped) or mmio (transaction may
712                      * fail). Do not stop if watchpoint, as the spec says that
713                      * first-fault should continue to access the same
714                      * elements regardless of any watchpoint.
715                      */
716                     if (flags & ~TLB_WATCHPOINT) {
717                         vl = i;
718                         goto ProbeSuccess;
719                     }
720                     if (remain <= offset) {
721                         break;
722                     }
723                     remain -= offset;
724                     addr_i = adjust_addr(env, addr_i + offset);
725                 }
726             }
727         }
728     }
729 ProbeSuccess:
730     /* load bytes from guest memory */
731     if (vl != 0) {
732         env->vl = vl;
733     }
734 
735     if (env->vstart < env->vl) {
736         if (vm) {
737             /* Load/store elements in the first page */
738             if (likely(elems)) {
739                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
740                                   log2_esz, true, mmu_index, ldst_tlb,
741                                   ldst_host, ra);
742             }
743 
744             /* Load/store elements in the second page */
745             if (unlikely(env->vstart < env->vl)) {
746                 /* Cross page element */
747                 if (unlikely(page_split % msize)) {
748                     for (k = 0; k < nf; k++) {
749                         addr = base + ((env->vstart * nf + k) << log2_esz);
750                         ldst_tlb(env, adjust_addr(env, addr),
751                                  env->vstart + k * max_elems, vd, ra);
752                     }
753                     env->vstart++;
754                 }
755 
756                 addr = base + ((env->vstart * nf) << log2_esz);
757                 /* Get number of elements of second page */
758                 elems = env->vl - env->vstart;
759 
760                 /* Load/store elements in the second page */
761                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
762                                   log2_esz, true, mmu_index, ldst_tlb,
763                                   ldst_host, ra);
764             }
765         } else {
766             for (i = env->vstart; i < env->vl; i++) {
767                 k = 0;
768                 while (k < nf) {
769                     if (!vext_elem_mask(v0, i)) {
770                         /* set masked-off elements to 1s */
771                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
772                                           (i + k * max_elems + 1) * esz);
773                         k++;
774                         continue;
775                     }
776                     addr = base + ((i * nf + k) << log2_esz);
777                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
778                              vd, ra);
779                     k++;
780                 }
781             }
782         }
783     }
784     env->vstart = 0;
785 
786     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
787 }
788 
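/*
 * Illustrative sketch (not part of the upstream source) of the fault-only-first
 * behaviour implemented above: a fault on element 0 traps as usual, while a
 * fault on any later element only truncates vl to that element's index and the
 * instruction completes without a trap. ldff_new_vl_example() is hypothetical.
 */
#if 0   /* sketch only, not compiled */
static uint32_t ldff_new_vl_example(uint32_t vl, uint32_t first_fault_idx)
{
    if (first_fault_idx == 0 || first_fault_idx >= vl) {
        return vl;              /* element 0 traps instead; no fault keeps vl */
    }
    return first_fault_idx;     /* trim vl; later elements are not accessed */
}
#endif
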
789 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
790 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
791                   CPURISCVState *env, uint32_t desc)            \
792 {                                                               \
793     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
794               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
795 }
796 
797 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
798 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
799 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
800 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
801 
802 #define DO_SWAP(N, M) (M)
803 #define DO_AND(N, M)  (N & M)
804 #define DO_XOR(N, M)  (N ^ M)
805 #define DO_OR(N, M)   (N | M)
806 #define DO_ADD(N, M)  (N + M)
807 
808 /* Signed min/max */
809 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
810 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
811 
812 /*
813  * load and store whole register instructions
814  */
815 static inline QEMU_ALWAYS_INLINE void
816 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
817                 vext_ldst_elem_fn_tlb *ldst_tlb,
818                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
819                 uintptr_t ra, bool is_load)
820 {
821     target_ulong page_split, elems, addr;
822     uint32_t nf = vext_nf(desc);
823     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
824     uint32_t max_elems = vlenb >> log2_esz;
825     uint32_t evl = nf * max_elems;
826     uint32_t esz = 1 << log2_esz;
827     int mmu_index = riscv_env_mmu_index(env, false);
828 
829     /* Calculate the page range of first page */
830     addr = base + (env->vstart << log2_esz);
831     page_split = -(addr | TARGET_PAGE_MASK);
832     /* Get number of elements */
833     elems = page_split / esz;
834     if (unlikely(env->vstart + elems >= evl)) {
835         elems = evl - env->vstart;
836     }
837 
838     /* Load/store elements in the first page */
839     if (likely(elems)) {
840         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
841                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
842     }
843 
844     /* Load/store elements in the second page */
845     if (unlikely(env->vstart < evl)) {
846         /* Cross page element */
847         if (unlikely(page_split % esz)) {
848             addr = base + (env->vstart << log2_esz);
849             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
850             env->vstart++;
851         }
852 
853         addr = base + (env->vstart << log2_esz);
854         /* Get number of elements of second page */
855         elems = evl - env->vstart;
856 
857         /* Load/store elements in the second page */
858         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
859                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
860     }
861 
862     env->vstart = 0;
863 }
864 
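/*
 * Worked example (not part of the upstream source) of the element count used
 * above: whole-register accesses ignore vl/vtype and always move nf * VLENB
 * bytes, so vl8re32.v with vlenb = 16 has max_elems = 16 >> 2 = 4 and
 * evl = 8 * 4 = 32 elements. whole_reg_evl_example() is hypothetical.
 */
#if 0   /* sketch only, not compiled */
static uint32_t whole_reg_evl_example(uint32_t nf, uint32_t vlenb, uint32_t log2_esz)
{
    return nf * (vlenb >> log2_esz);    /* element count is independent of vl */
}
/* whole_reg_evl_example(8, 16, 2) == 32 */
#endif
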
865 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
866 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
867                   uint32_t desc)                                    \
868 {                                                                   \
869     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
870                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
871 }
872 
873 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
874 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
875 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
876 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
877 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
878 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
879 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
880 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
881 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
882 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
883 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
884 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
885 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
886 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
887 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
888 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
889 
890 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
891 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
892                   uint32_t desc)                                        \
893 {                                                                       \
894     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
895                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
896 }
897 
898 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
899 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
900 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
901 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
902 
903 /*
904  * Vector Integer Arithmetic Instructions
905  */
906 
907 /* (TD, T1, T2, TX1, TX2) */
908 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
909 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
910 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
911 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
912 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
913 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
914 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
915 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
916 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
917 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
918 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
919 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
920 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
921 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
922 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
923 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
924 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
925 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
926 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
927 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
928 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
929 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
930 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
931 
932 #define DO_SUB(N, M) (N - M)
933 #define DO_RSUB(N, M) (M - N)
934 
935 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
936 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
937 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
938 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
939 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
940 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
941 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
942 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
943 
944 GEN_VEXT_VV(vadd_vv_b, 1)
945 GEN_VEXT_VV(vadd_vv_h, 2)
946 GEN_VEXT_VV(vadd_vv_w, 4)
947 GEN_VEXT_VV(vadd_vv_d, 8)
948 GEN_VEXT_VV(vsub_vv_b, 1)
949 GEN_VEXT_VV(vsub_vv_h, 2)
950 GEN_VEXT_VV(vsub_vv_w, 4)
951 GEN_VEXT_VV(vsub_vv_d, 8)
952 
953 
954 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
955 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
956 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
957 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
958 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
959 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
960 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
961 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
962 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
963 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
964 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
965 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
966 
967 GEN_VEXT_VX(vadd_vx_b, 1)
968 GEN_VEXT_VX(vadd_vx_h, 2)
969 GEN_VEXT_VX(vadd_vx_w, 4)
970 GEN_VEXT_VX(vadd_vx_d, 8)
971 GEN_VEXT_VX(vsub_vx_b, 1)
972 GEN_VEXT_VX(vsub_vx_h, 2)
973 GEN_VEXT_VX(vsub_vx_w, 4)
974 GEN_VEXT_VX(vsub_vx_d, 8)
975 GEN_VEXT_VX(vrsub_vx_b, 1)
976 GEN_VEXT_VX(vrsub_vx_h, 2)
977 GEN_VEXT_VX(vrsub_vx_w, 4)
978 GEN_VEXT_VX(vrsub_vx_d, 8)
979 
980 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
981 {
982     intptr_t oprsz = simd_oprsz(desc);
983     intptr_t i;
984 
985     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
986         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
987     }
988 }
989 
990 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
991 {
992     intptr_t oprsz = simd_oprsz(desc);
993     intptr_t i;
994 
995     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
996         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
997     }
998 }
999 
1000 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
1001 {
1002     intptr_t oprsz = simd_oprsz(desc);
1003     intptr_t i;
1004 
1005     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1006         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
1007     }
1008 }
1009 
1010 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
1011 {
1012     intptr_t oprsz = simd_oprsz(desc);
1013     intptr_t i;
1014 
1015     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1016         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
1017     }
1018 }
1019 
1020 /* Vector Widening Integer Add/Subtract */
1021 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
1022 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
1023 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
1024 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
1025 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
1026 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
1027 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1028 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1029 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1030 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1031 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1032 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1033 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1034 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1035 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1036 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1037 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1038 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1039 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1040 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1041 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1042 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1043 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1044 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1045 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1046 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1047 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1048 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1049 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1050 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1051 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1052 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1053 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1054 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1055 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1056 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1057 GEN_VEXT_VV(vwaddu_vv_b, 2)
1058 GEN_VEXT_VV(vwaddu_vv_h, 4)
1059 GEN_VEXT_VV(vwaddu_vv_w, 8)
1060 GEN_VEXT_VV(vwsubu_vv_b, 2)
1061 GEN_VEXT_VV(vwsubu_vv_h, 4)
1062 GEN_VEXT_VV(vwsubu_vv_w, 8)
1063 GEN_VEXT_VV(vwadd_vv_b, 2)
1064 GEN_VEXT_VV(vwadd_vv_h, 4)
1065 GEN_VEXT_VV(vwadd_vv_w, 8)
1066 GEN_VEXT_VV(vwsub_vv_b, 2)
1067 GEN_VEXT_VV(vwsub_vv_h, 4)
1068 GEN_VEXT_VV(vwsub_vv_w, 8)
1069 GEN_VEXT_VV(vwaddu_wv_b, 2)
1070 GEN_VEXT_VV(vwaddu_wv_h, 4)
1071 GEN_VEXT_VV(vwaddu_wv_w, 8)
1072 GEN_VEXT_VV(vwsubu_wv_b, 2)
1073 GEN_VEXT_VV(vwsubu_wv_h, 4)
1074 GEN_VEXT_VV(vwsubu_wv_w, 8)
1075 GEN_VEXT_VV(vwadd_wv_b, 2)
1076 GEN_VEXT_VV(vwadd_wv_h, 4)
1077 GEN_VEXT_VV(vwadd_wv_w, 8)
1078 GEN_VEXT_VV(vwsub_wv_b, 2)
1079 GEN_VEXT_VV(vwsub_wv_h, 4)
1080 GEN_VEXT_VV(vwsub_wv_w, 8)
1081 
1082 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1083 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1084 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1085 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1086 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1087 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1088 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1089 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1090 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1091 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1092 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1093 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1094 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1095 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1096 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1097 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1098 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1099 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1100 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1101 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1102 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1103 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1104 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1105 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1106 GEN_VEXT_VX(vwaddu_vx_b, 2)
1107 GEN_VEXT_VX(vwaddu_vx_h, 4)
1108 GEN_VEXT_VX(vwaddu_vx_w, 8)
1109 GEN_VEXT_VX(vwsubu_vx_b, 2)
1110 GEN_VEXT_VX(vwsubu_vx_h, 4)
1111 GEN_VEXT_VX(vwsubu_vx_w, 8)
1112 GEN_VEXT_VX(vwadd_vx_b, 2)
1113 GEN_VEXT_VX(vwadd_vx_h, 4)
1114 GEN_VEXT_VX(vwadd_vx_w, 8)
1115 GEN_VEXT_VX(vwsub_vx_b, 2)
1116 GEN_VEXT_VX(vwsub_vx_h, 4)
1117 GEN_VEXT_VX(vwsub_vx_w, 8)
1118 GEN_VEXT_VX(vwaddu_wx_b, 2)
1119 GEN_VEXT_VX(vwaddu_wx_h, 4)
1120 GEN_VEXT_VX(vwaddu_wx_w, 8)
1121 GEN_VEXT_VX(vwsubu_wx_b, 2)
1122 GEN_VEXT_VX(vwsubu_wx_h, 4)
1123 GEN_VEXT_VX(vwsubu_wx_w, 8)
1124 GEN_VEXT_VX(vwadd_wx_b, 2)
1125 GEN_VEXT_VX(vwadd_wx_h, 4)
1126 GEN_VEXT_VX(vwadd_wx_w, 8)
1127 GEN_VEXT_VX(vwsub_wx_b, 2)
1128 GEN_VEXT_VX(vwsub_wx_h, 4)
1129 GEN_VEXT_VX(vwsub_wx_w, 8)
1130 
1131 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1132 #define DO_VADC(N, M, C) (N + M + C)
1133 #define DO_VSBC(N, M, C) (N - M - C)
1134 
1135 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1136 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1137                   CPURISCVState *env, uint32_t desc)          \
1138 {                                                             \
1139     uint32_t vl = env->vl;                                    \
1140     uint32_t esz = sizeof(ETYPE);                             \
1141     uint32_t total_elems =                                    \
1142         vext_get_total_elems(env, desc, esz);                 \
1143     uint32_t vta = vext_vta(desc);                            \
1144     uint32_t i;                                               \
1145                                                               \
1146     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1147                                                               \
1148     for (i = env->vstart; i < vl; i++) {                      \
1149         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1150         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1151         ETYPE carry = vext_elem_mask(v0, i);                  \
1152                                                               \
1153         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1154     }                                                         \
1155     env->vstart = 0;                                          \
1156     /* set tail elements to 1s */                             \
1157     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1158 }
1159 
1160 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1161 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1162 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1163 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1164 
1165 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1166 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1167 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1168 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1169 
1170 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1171 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1172                   CPURISCVState *env, uint32_t desc)                     \
1173 {                                                                        \
1174     uint32_t vl = env->vl;                                               \
1175     uint32_t esz = sizeof(ETYPE);                                        \
1176     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1177     uint32_t vta = vext_vta(desc);                                       \
1178     uint32_t i;                                                          \
1179                                                                          \
1180     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1181                                                                          \
1182     for (i = env->vstart; i < vl; i++) {                                 \
1183         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1184         ETYPE carry = vext_elem_mask(v0, i);                             \
1185                                                                          \
1186         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1187     }                                                                    \
1188     env->vstart = 0;                                                     \
1189     /* set tail elements to 1s */                                        \
1190     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1191 }
1192 
1193 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1194 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1195 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1196 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1197 
1198 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1199 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1200 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1201 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1202 
1203 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1204                           (__typeof(N))(N + M) < N)
1205 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
1206 
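/*
 * Worked example (not part of the upstream source) of the carry/borrow
 * detection above, which avoids needing a wider type: for uint8_t,
 * 200 + 100 wraps to 44 < 200, so DO_MADC reports a carry-out; DO_MSBC
 * reports a borrow when N < M (or N <= M with a borrow-in).
 * madc_u8_example() is a hypothetical specialisation for uint8_t.
 */
#if 0   /* sketch only, not compiled */
static bool madc_u8_example(uint8_t n, uint8_t m, bool carry_in)
{
    return carry_in ? (uint8_t)(n + m + 1) <= n     /* wrapped sum not above n => carry */
                    : (uint8_t)(n + m) < n;
}
/* madc_u8_example(200, 100, false) == true, madc_u8_example(1, 2, false) == false */
#endif
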
1207 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1208 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1209                   CPURISCVState *env, uint32_t desc)          \
1210 {                                                             \
1211     uint32_t vl = env->vl;                                    \
1212     uint32_t vm = vext_vm(desc);                              \
1213     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1214     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1215     uint32_t i;                                               \
1216                                                               \
1217     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1218                                                               \
1219     for (i = env->vstart; i < vl; i++) {                      \
1220         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1221         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1222         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1223         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1224     }                                                         \
1225     env->vstart = 0;                                          \
1226     /*
1227      * mask destination register is always tail-agnostic
1228      * set tail elements to 1s
1229      */                                                       \
1230     if (vta_all_1s) {                                         \
1231         for (; i < total_elems; i++) {                        \
1232             vext_set_elem_mask(vd, i, 1);                     \
1233         }                                                     \
1234     }                                                         \
1235 }
1236 
1237 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1238 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1239 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1240 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1241 
1242 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1243 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1244 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1245 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1246 
1247 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1248 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1249                   void *vs2, CPURISCVState *env, uint32_t desc) \
1250 {                                                               \
1251     uint32_t vl = env->vl;                                      \
1252     uint32_t vm = vext_vm(desc);                                \
1253     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1254     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1255     uint32_t i;                                                 \
1256                                                                 \
1257     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1258                                                                 \
1259     for (i = env->vstart; i < vl; i++) {                        \
1260         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1261         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1262         vext_set_elem_mask(vd, i,                               \
1263                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1264     }                                                           \
1265     env->vstart = 0;                                            \
1266     /*
1267      * mask destination registers are always tail-agnostic
1268      * set tail elements to 1s
1269      */                                                         \
1270     if (vta_all_1s) {                                           \
1271         for (; i < total_elems; i++) {                          \
1272             vext_set_elem_mask(vd, i, 1);                       \
1273         }                                                       \
1274     }                                                           \
1275 }
1276 
1277 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1278 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1279 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1280 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1281 
1282 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1283 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1284 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1285 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1286 
1287 /* Vector Bitwise Logical Instructions */
1288 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1289 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1290 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1291 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1292 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1293 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1294 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1295 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1296 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1297 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1298 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1299 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1300 GEN_VEXT_VV(vand_vv_b, 1)
1301 GEN_VEXT_VV(vand_vv_h, 2)
1302 GEN_VEXT_VV(vand_vv_w, 4)
1303 GEN_VEXT_VV(vand_vv_d, 8)
1304 GEN_VEXT_VV(vor_vv_b, 1)
1305 GEN_VEXT_VV(vor_vv_h, 2)
1306 GEN_VEXT_VV(vor_vv_w, 4)
1307 GEN_VEXT_VV(vor_vv_d, 8)
1308 GEN_VEXT_VV(vxor_vv_b, 1)
1309 GEN_VEXT_VV(vxor_vv_h, 2)
1310 GEN_VEXT_VV(vxor_vv_w, 4)
1311 GEN_VEXT_VV(vxor_vv_d, 8)
1312 
1313 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1314 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1315 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1316 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1317 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1318 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1319 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1320 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1321 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1322 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1323 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1324 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1325 GEN_VEXT_VX(vand_vx_b, 1)
1326 GEN_VEXT_VX(vand_vx_h, 2)
1327 GEN_VEXT_VX(vand_vx_w, 4)
1328 GEN_VEXT_VX(vand_vx_d, 8)
1329 GEN_VEXT_VX(vor_vx_b, 1)
1330 GEN_VEXT_VX(vor_vx_h, 2)
1331 GEN_VEXT_VX(vor_vx_w, 4)
1332 GEN_VEXT_VX(vor_vx_d, 8)
1333 GEN_VEXT_VX(vxor_vx_b, 1)
1334 GEN_VEXT_VX(vxor_vx_h, 2)
1335 GEN_VEXT_VX(vxor_vx_w, 4)
1336 GEN_VEXT_VX(vxor_vx_d, 8)
1337 
1338 /* Vector Single-Width Bit Shift Instructions */
1339 #define DO_SLL(N, M)  (N << (M))
1340 #define DO_SRL(N, M)  (N >> (M))
1341 
1342 /* generate the helpers for shift instructions with two vector operands */
1343 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1344 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1345                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1346 {                                                                         \
1347     uint32_t vm = vext_vm(desc);                                          \
1348     uint32_t vl = env->vl;                                                \
1349     uint32_t esz = sizeof(TS1);                                           \
1350     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1351     uint32_t vta = vext_vta(desc);                                        \
1352     uint32_t vma = vext_vma(desc);                                        \
1353     uint32_t i;                                                           \
1354                                                                           \
1355     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1356                                                                           \
1357     for (i = env->vstart; i < vl; i++) {                                  \
1358         if (!vm && !vext_elem_mask(v0, i)) {                              \
1359             /* set masked-off elements to 1s */                           \
1360             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1361             continue;                                                     \
1362         }                                                                 \
1363         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1364         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1365         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1366     }                                                                     \
1367     env->vstart = 0;                                                      \
1368     /* set tail elements to 1s */                                         \
1369     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1370 }
1371 
1372 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1373 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1374 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1375 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1376 
1377 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1378 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1379 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1380 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1381 
1382 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1383 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1384 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1385 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
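
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file.  Both vsrl and vsra reuse DO_SRL above; the logical vs. arithmetic
 * behaviour comes from TS2 being unsigned or signed, since on the compilers
 * QEMU supports a right shift of a negative signed value shifts in sign
 * bits.  MASK keeps only the low log2(SEW) bits of the shift amount, as the
 * vector spec requires.  Guarded with #if 0 so it never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static void toy_check_shift(void)
{
    uint8_t u = 0x80;
    int8_t  s = (int8_t)0x80;                    /* -128 */

    assert((uint8_t)(u >> 1) == 0x40);           /* vsrl: zeroes shifted in */
    assert((int8_t)(s >> 1) == (int8_t)0xc0);    /* vsra: sign bits shifted in */
    assert((uint8_t)(u >> (9 & 0x7)) == 0x40);   /* shift amount 9 masked to 1 */
}
#endif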
1386 
1387 /*
1388  * generate the helpers for shift instructions with one vector and one scalar
1389  */
1390 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1391 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1392                   void *vs2, CPURISCVState *env,            \
1393                   uint32_t desc)                            \
1394 {                                                           \
1395     uint32_t vm = vext_vm(desc);                            \
1396     uint32_t vl = env->vl;                                  \
1397     uint32_t esz = sizeof(TD);                              \
1398     uint32_t total_elems =                                  \
1399         vext_get_total_elems(env, desc, esz);               \
1400     uint32_t vta = vext_vta(desc);                          \
1401     uint32_t vma = vext_vma(desc);                          \
1402     uint32_t i;                                             \
1403                                                             \
1404     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1405                                                             \
1406     for (i = env->vstart; i < vl; i++) {                    \
1407         if (!vm && !vext_elem_mask(v0, i)) {                \
1408             /* set masked-off elements to 1s */             \
1409             vext_set_elems_1s(vd, vma, i * esz,             \
1410                               (i + 1) * esz);               \
1411             continue;                                       \
1412         }                                                   \
1413         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1414         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1415     }                                                       \
1416     env->vstart = 0;                                        \
1417     /* set tail elements to 1s */                           \
1418     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1419 }
1420 
1421 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1422 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1423 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1424 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1425 
1426 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1427 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1428 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1429 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1430 
1431 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1432 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1433 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1434 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1435 
1436 /* Vector Narrowing Integer Right Shift Instructions */
1437 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1438 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1439 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1440 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1441 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1442 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1443 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1444 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1445 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1446 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1447 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1448 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
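
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file.  The narrowing forms above read a 2*SEW-wide source element (TS2)
 * and write an SEW-wide result, so the shift-amount mask covers log2(2*SEW)
 * bits (0xf for byte results, 0x1f for halfword results, and so on).
 * Guarded with #if 0 so it never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static void toy_check_narrowing_shift(void)
{
    uint16_t wide = 0xab12;                      /* 2*SEW = 16-bit source */

    /* vnsrl.wx with shift 8: keep the upper byte */
    assert((uint8_t)(wide >> (8 & 0xf)) == 0xab);

    /* vnsra.wx: same mask, but the source is treated as signed */
    int16_t swide = (int16_t)0x8000;             /* -32768 */
    assert((uint8_t)(swide >> (12 & 0xf)) == 0xf8);
}
#endif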
1449 
1450 /* Vector Integer Comparison Instructions */
1451 #define DO_MSEQ(N, M) (N == M)
1452 #define DO_MSNE(N, M) (N != M)
1453 #define DO_MSLT(N, M) (N < M)
1454 #define DO_MSLE(N, M) (N <= M)
1455 #define DO_MSGT(N, M) (N > M)
1456 
1457 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1458 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1459                   CPURISCVState *env, uint32_t desc)          \
1460 {                                                             \
1461     uint32_t vm = vext_vm(desc);                              \
1462     uint32_t vl = env->vl;                                    \
1463     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1464     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1465     uint32_t vma = vext_vma(desc);                            \
1466     uint32_t i;                                               \
1467                                                               \
1468     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1469                                                               \
1470     for (i = env->vstart; i < vl; i++) {                      \
1471         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1472         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1473         if (!vm && !vext_elem_mask(v0, i)) {                  \
1474             /* set masked-off elements to 1s */               \
1475             if (vma) {                                        \
1476                 vext_set_elem_mask(vd, i, 1);                 \
1477             }                                                 \
1478             continue;                                         \
1479         }                                                     \
1480         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1481     }                                                         \
1482     env->vstart = 0;                                          \
1483     /*
1484      * mask destination registers are always tail-agnostic
1485      * set tail elements to 1s
1486      */                                                       \
1487     if (vta_all_1s) {                                         \
1488         for (; i < total_elems; i++) {                        \
1489             vext_set_elem_mask(vd, i, 1);                     \
1490         }                                                     \
1491     }                                                         \
1492 }
1493 
1494 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1495 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1496 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1497 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1498 
1499 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1500 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1501 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1502 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1503 
1504 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1505 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1506 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1507 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1508 
1509 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1510 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1511 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1512 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1513 
1514 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1515 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1516 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1517 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1518 
1519 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1520 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1521 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1522 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1523 
1524 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1525 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1526                   CPURISCVState *env, uint32_t desc)                \
1527 {                                                                   \
1528     uint32_t vm = vext_vm(desc);                                    \
1529     uint32_t vl = env->vl;                                          \
1530     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1531     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1532     uint32_t vma = vext_vma(desc);                                  \
1533     uint32_t i;                                                     \
1534                                                                     \
1535     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1536                                                                     \
1537     for (i = env->vstart; i < vl; i++) {                            \
1538         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1539         if (!vm && !vext_elem_mask(v0, i)) {                        \
1540             /* set masked-off elements to 1s */                     \
1541             if (vma) {                                              \
1542                 vext_set_elem_mask(vd, i, 1);                       \
1543             }                                                       \
1544             continue;                                               \
1545         }                                                           \
1546         vext_set_elem_mask(vd, i,                                   \
1547                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1548     }                                                               \
1549     env->vstart = 0;                                                \
1550     /*
1551      * mask destination registers are always tail-agnostic
1552      * set tail elements to 1s
1553      */                                                             \
1554     if (vta_all_1s) {                                               \
1555         for (; i < total_elems; i++) {                              \
1556             vext_set_elem_mask(vd, i, 1);                           \
1557         }                                                           \
1558     }                                                               \
1559 }
1560 
1561 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1562 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1563 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1564 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1565 
1566 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1567 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1568 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1569 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1570 
1571 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1572 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1573 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1574 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1575 
1576 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1577 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1578 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1579 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1580 
1581 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1582 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1583 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1584 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1585 
1586 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1587 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1588 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1589 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1590 
1591 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1592 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1593 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1594 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1595 
1596 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1597 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1598 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1599 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1600 
1601 /* Vector Integer Min/Max Instructions */
1602 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1603 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1604 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1605 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1606 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1607 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1608 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1609 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1610 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1611 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1612 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1613 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1614 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1615 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1616 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1617 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1618 GEN_VEXT_VV(vminu_vv_b, 1)
1619 GEN_VEXT_VV(vminu_vv_h, 2)
1620 GEN_VEXT_VV(vminu_vv_w, 4)
1621 GEN_VEXT_VV(vminu_vv_d, 8)
1622 GEN_VEXT_VV(vmin_vv_b, 1)
1623 GEN_VEXT_VV(vmin_vv_h, 2)
1624 GEN_VEXT_VV(vmin_vv_w, 4)
1625 GEN_VEXT_VV(vmin_vv_d, 8)
1626 GEN_VEXT_VV(vmaxu_vv_b, 1)
1627 GEN_VEXT_VV(vmaxu_vv_h, 2)
1628 GEN_VEXT_VV(vmaxu_vv_w, 4)
1629 GEN_VEXT_VV(vmaxu_vv_d, 8)
1630 GEN_VEXT_VV(vmax_vv_b, 1)
1631 GEN_VEXT_VV(vmax_vv_h, 2)
1632 GEN_VEXT_VV(vmax_vv_w, 4)
1633 GEN_VEXT_VV(vmax_vv_d, 8)
1634 
1635 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1636 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1637 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1638 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1639 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1640 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1641 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1642 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1643 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1644 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1645 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1646 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1647 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1648 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1649 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1650 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1651 GEN_VEXT_VX(vminu_vx_b, 1)
1652 GEN_VEXT_VX(vminu_vx_h, 2)
1653 GEN_VEXT_VX(vminu_vx_w, 4)
1654 GEN_VEXT_VX(vminu_vx_d, 8)
1655 GEN_VEXT_VX(vmin_vx_b, 1)
1656 GEN_VEXT_VX(vmin_vx_h, 2)
1657 GEN_VEXT_VX(vmin_vx_w, 4)
1658 GEN_VEXT_VX(vmin_vx_d, 8)
1659 GEN_VEXT_VX(vmaxu_vx_b, 1)
1660 GEN_VEXT_VX(vmaxu_vx_h, 2)
1661 GEN_VEXT_VX(vmaxu_vx_w, 4)
1662 GEN_VEXT_VX(vmaxu_vx_d, 8)
1663 GEN_VEXT_VX(vmax_vx_b, 1)
1664 GEN_VEXT_VX(vmax_vx_h, 2)
1665 GEN_VEXT_VX(vmax_vx_w, 4)
1666 GEN_VEXT_VX(vmax_vx_d, 8)
1667 
1668 /* Vector Single-Width Integer Multiply Instructions */
1669 #define DO_MUL(N, M) (N * M)
1670 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1671 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1672 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1673 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1674 GEN_VEXT_VV(vmul_vv_b, 1)
1675 GEN_VEXT_VV(vmul_vv_h, 2)
1676 GEN_VEXT_VV(vmul_vv_w, 4)
1677 GEN_VEXT_VV(vmul_vv_d, 8)
1678 
1679 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1680 {
1681     return (int16_t)s2 * (int16_t)s1 >> 8;
1682 }
1683 
1684 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1685 {
1686     return (int32_t)s2 * (int32_t)s1 >> 16;
1687 }
1688 
1689 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1690 {
1691     return (int64_t)s2 * (int64_t)s1 >> 32;
1692 }
1693 
1694 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1695 {
1696     uint64_t hi_64, lo_64;
1697 
1698     muls64(&lo_64, &hi_64, s1, s2);
1699     return hi_64;
1700 }
1701 
1702 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1703 {
1704     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1705 }
1706 
1707 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1708 {
1709     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1710 }
1711 
1712 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1713 {
1714     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1715 }
1716 
1717 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1718 {
1719     uint64_t hi_64, lo_64;
1720 
1721     mulu64(&lo_64, &hi_64, s2, s1);
1722     return hi_64;
1723 }
1724 
1725 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1726 {
1727     return (int16_t)s2 * (uint16_t)s1 >> 8;
1728 }
1729 
1730 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1731 {
1732     return (int32_t)s2 * (uint32_t)s1 >> 16;
1733 }
1734 
1735 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1736 {
1737     return (int64_t)s2 * (uint64_t)s1 >> 32;
1738 }
1739 
1740 /*
1741  * Let  A = signed operand,
1742  *      B = unsigned operand
1743  *      P = mulu64(A, B), unsigned product
1744  *
1745  * LET  X = 2 ** 64  - A, 2's complement of A
1746  *      SP = signed product
1747  * THEN
1748  *      IF A < 0
1749  *          SP = -X * B
1750  *             = -(2 ** 64 - A) * B
1751  *             = A * B - 2 ** 64 * B
1752  *             = P - 2 ** 64 * B
1753  *      ELSE
1754  *          SP = P
1755  * THEN
1756  *      HI_P -= (A < 0 ? B : 0)
1757  */
1758 
1759 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1760 {
1761     uint64_t hi_64, lo_64;
1762 
1763     mulu64(&lo_64, &hi_64, s2, s1);
1764 
1765     hi_64 -= s2 < 0 ? s1 : 0;
1766     return hi_64;
1767 }
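
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file.  It re-derives the recipe used by do_mulhsu_d() above -- multiply the
 * raw bits unsigned, then subtract the unsigned operand from the high half
 * when the signed operand is negative -- and checks it against the compiler's
 * 128-bit type (a GCC/Clang extension, with arithmetic right shift assumed
 * for the signed reference).  Guarded with #if 0 so it never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static int64_t toy_mulhsu64(int64_t a, uint64_t b)
{
    unsigned __int128 p = (unsigned __int128)(uint64_t)a * b;
    uint64_t hi = (uint64_t)(p >> 64);

    if (a < 0) {
        hi -= b;                 /* remove the 2**64 * b bias */
    }
    return (int64_t)hi;
}

static int64_t ref_mulhsu64(int64_t a, uint64_t b)
{
    /* the exact signed x unsigned product fits in a signed 128-bit value */
    return (int64_t)(((__int128)a * (__int128)b) >> 64);
}

static void toy_check_mulhsu(void)
{
    assert(toy_mulhsu64(-1, 2) == ref_mulhsu64(-1, 2));            /* -1 */
    assert(toy_mulhsu64(INT64_MIN, UINT64_MAX) ==
           ref_mulhsu64(INT64_MIN, UINT64_MAX));
    assert(toy_mulhsu64(3, 5) == ref_mulhsu64(3, 5));              /*  0 */
}
#endif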
1768 
1769 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1770 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1771 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1772 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1773 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1774 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1775 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1776 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1777 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1778 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1779 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1780 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1781 GEN_VEXT_VV(vmulh_vv_b, 1)
1782 GEN_VEXT_VV(vmulh_vv_h, 2)
1783 GEN_VEXT_VV(vmulh_vv_w, 4)
1784 GEN_VEXT_VV(vmulh_vv_d, 8)
1785 GEN_VEXT_VV(vmulhu_vv_b, 1)
1786 GEN_VEXT_VV(vmulhu_vv_h, 2)
1787 GEN_VEXT_VV(vmulhu_vv_w, 4)
1788 GEN_VEXT_VV(vmulhu_vv_d, 8)
1789 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1790 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1791 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1792 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1793 
1794 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1795 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1796 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1797 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1798 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1799 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1800 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1801 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1802 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1803 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1804 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1805 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1806 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1807 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1808 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1809 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1810 GEN_VEXT_VX(vmul_vx_b, 1)
1811 GEN_VEXT_VX(vmul_vx_h, 2)
1812 GEN_VEXT_VX(vmul_vx_w, 4)
1813 GEN_VEXT_VX(vmul_vx_d, 8)
1814 GEN_VEXT_VX(vmulh_vx_b, 1)
1815 GEN_VEXT_VX(vmulh_vx_h, 2)
1816 GEN_VEXT_VX(vmulh_vx_w, 4)
1817 GEN_VEXT_VX(vmulh_vx_d, 8)
1818 GEN_VEXT_VX(vmulhu_vx_b, 1)
1819 GEN_VEXT_VX(vmulhu_vx_h, 2)
1820 GEN_VEXT_VX(vmulhu_vx_w, 4)
1821 GEN_VEXT_VX(vmulhu_vx_d, 8)
1822 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1823 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1824 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1825 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1826 
1827 /* Vector Integer Divide Instructions */
1828 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1829 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1830 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1831         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1832 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1833         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
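
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file (toy_* names are hypothetical).  The divide/remainder macros above
 * implement the vector spec's no-trap policy: division by zero yields an
 * all-ones quotient and leaves the remainder equal to the dividend, and the
 * signed overflow case (most-negative value divided by -1) yields the
 * most-negative quotient and a zero remainder.  Guarded with #if 0 so it
 * never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static int8_t toy_vdiv_elem(int8_t n, int8_t m)
{
    if (m == 0) {
        return -1;                      /* all ones, no trap */
    }
    if (n == INT8_MIN && m == -1) {
        return INT8_MIN;                /* overflow case, no trap */
    }
    return n / m;                       /* C already truncates toward zero */
}

static void toy_check_div(void)
{
    assert(toy_vdiv_elem(7, 0) == -1);
    assert(toy_vdiv_elem(INT8_MIN, -1) == INT8_MIN);
    assert(toy_vdiv_elem(-7, 2) == -3);
    assert((uint8_t)-1 == 0xff);        /* the unsigned x/0 result */
}
#endif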
1834 
1835 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1836 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1837 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1838 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1839 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1840 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1841 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1842 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1843 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1844 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1845 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1846 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1847 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1848 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1849 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1850 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1851 GEN_VEXT_VV(vdivu_vv_b, 1)
1852 GEN_VEXT_VV(vdivu_vv_h, 2)
1853 GEN_VEXT_VV(vdivu_vv_w, 4)
1854 GEN_VEXT_VV(vdivu_vv_d, 8)
1855 GEN_VEXT_VV(vdiv_vv_b, 1)
1856 GEN_VEXT_VV(vdiv_vv_h, 2)
1857 GEN_VEXT_VV(vdiv_vv_w, 4)
1858 GEN_VEXT_VV(vdiv_vv_d, 8)
1859 GEN_VEXT_VV(vremu_vv_b, 1)
1860 GEN_VEXT_VV(vremu_vv_h, 2)
1861 GEN_VEXT_VV(vremu_vv_w, 4)
1862 GEN_VEXT_VV(vremu_vv_d, 8)
1863 GEN_VEXT_VV(vrem_vv_b, 1)
1864 GEN_VEXT_VV(vrem_vv_h, 2)
1865 GEN_VEXT_VV(vrem_vv_w, 4)
1866 GEN_VEXT_VV(vrem_vv_d, 8)
1867 
1868 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1869 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1870 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1871 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1872 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1873 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1874 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1875 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1876 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1877 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1878 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1879 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1880 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1881 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1882 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1883 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1884 GEN_VEXT_VX(vdivu_vx_b, 1)
1885 GEN_VEXT_VX(vdivu_vx_h, 2)
1886 GEN_VEXT_VX(vdivu_vx_w, 4)
1887 GEN_VEXT_VX(vdivu_vx_d, 8)
1888 GEN_VEXT_VX(vdiv_vx_b, 1)
1889 GEN_VEXT_VX(vdiv_vx_h, 2)
1890 GEN_VEXT_VX(vdiv_vx_w, 4)
1891 GEN_VEXT_VX(vdiv_vx_d, 8)
1892 GEN_VEXT_VX(vremu_vx_b, 1)
1893 GEN_VEXT_VX(vremu_vx_h, 2)
1894 GEN_VEXT_VX(vremu_vx_w, 4)
1895 GEN_VEXT_VX(vremu_vx_d, 8)
1896 GEN_VEXT_VX(vrem_vx_b, 1)
1897 GEN_VEXT_VX(vrem_vx_h, 2)
1898 GEN_VEXT_VX(vrem_vx_w, 4)
1899 GEN_VEXT_VX(vrem_vx_d, 8)
1900 
1901 /* Vector Widening Integer Multiply Instructions */
1902 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1903 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1904 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1905 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1906 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1907 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1908 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1909 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1910 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1911 GEN_VEXT_VV(vwmul_vv_b, 2)
1912 GEN_VEXT_VV(vwmul_vv_h, 4)
1913 GEN_VEXT_VV(vwmul_vv_w, 8)
1914 GEN_VEXT_VV(vwmulu_vv_b, 2)
1915 GEN_VEXT_VV(vwmulu_vv_h, 4)
1916 GEN_VEXT_VV(vwmulu_vv_w, 8)
1917 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1918 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1919 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1920 
1921 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1922 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1923 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1924 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1925 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1926 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1927 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1928 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1929 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1930 GEN_VEXT_VX(vwmul_vx_b, 2)
1931 GEN_VEXT_VX(vwmul_vx_h, 4)
1932 GEN_VEXT_VX(vwmul_vx_w, 8)
1933 GEN_VEXT_VX(vwmulu_vx_b, 2)
1934 GEN_VEXT_VX(vwmulu_vx_h, 4)
1935 GEN_VEXT_VX(vwmulu_vx_w, 8)
1936 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1937 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1938 GEN_VEXT_VX(vwmulsu_vx_w, 8)
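
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file.  The widening multiplies produce a 2*SEW result; the WOP_* type
 * lists used above decide how each operand is widened first: sign-extended
 * when signed, zero-extended when unsigned, with vwmulsu mixing the two
 * (signed vs2, unsigned vs1).  Guarded with #if 0 so it never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static void toy_check_wmulsu(void)
{
    int8_t  vs2 = -2;          /* signed operand   */
    uint8_t vs1 = 200;         /* unsigned operand */

    /* widen each operand according to its own signedness, then multiply in
     * the double-width type */
    int16_t wide = (int16_t)((int16_t)vs2 * (int16_t)(uint16_t)vs1);
    assert(wide == -400);

    /* sign-extending the unsigned operand instead gives a different answer:
     * (int8_t)200 is -56, and -2 * -56 is 112 */
    int16_t wrong = (int16_t)((int16_t)vs2 * (int16_t)(int8_t)vs1);
    assert(wrong == 112);
}
#endif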
1939 
1940 /* Vector Single-Width Integer Multiply-Add Instructions */
1941 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1942 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1943 {                                                                  \
1944     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1945     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1946     TD d = *((TD *)vd + HD(i));                                    \
1947     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1948 }
1949 
1950 #define DO_MACC(N, M, D) (M * N + D)
1951 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1952 #define DO_MADD(N, M, D) (M * D + N)
1953 #define DO_NMSUB(N, M, D) (-(M * D) + N)
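
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file.  OPIVV3 hands the op macro (s2, s1, d), so DO_MACC makes vd the
 * addend (vd = vs1 * vs2 + vd) while DO_MADD makes vd a multiplicand
 * (vd = vs1 * vd + vs2); DO_NMSAC and DO_NMSUB are the same shapes with the
 * product negated.  Guarded with #if 0 so it never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static void toy_check_macc_vs_madd(void)
{
    int32_t vs1 = 3, vs2 = 4, vd = 10;

    assert(vs1 * vs2 + vd == 22);        /* vmacc  */
    assert(vs1 * vd + vs2 == 34);        /* vmadd  */
    assert(-(vs1 * vs2) + vd == -2);     /* vnmsac */
    assert(-(vs1 * vd) + vs2 == -26);    /* vnmsub */
}
#endif
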
1954 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1955 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1956 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1957 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1958 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1959 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1960 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1961 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1962 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1963 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1964 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1965 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1966 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1967 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1968 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1969 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1970 GEN_VEXT_VV(vmacc_vv_b, 1)
1971 GEN_VEXT_VV(vmacc_vv_h, 2)
1972 GEN_VEXT_VV(vmacc_vv_w, 4)
1973 GEN_VEXT_VV(vmacc_vv_d, 8)
1974 GEN_VEXT_VV(vnmsac_vv_b, 1)
1975 GEN_VEXT_VV(vnmsac_vv_h, 2)
1976 GEN_VEXT_VV(vnmsac_vv_w, 4)
1977 GEN_VEXT_VV(vnmsac_vv_d, 8)
1978 GEN_VEXT_VV(vmadd_vv_b, 1)
1979 GEN_VEXT_VV(vmadd_vv_h, 2)
1980 GEN_VEXT_VV(vmadd_vv_w, 4)
1981 GEN_VEXT_VV(vmadd_vv_d, 8)
1982 GEN_VEXT_VV(vnmsub_vv_b, 1)
1983 GEN_VEXT_VV(vnmsub_vv_h, 2)
1984 GEN_VEXT_VV(vnmsub_vv_w, 4)
1985 GEN_VEXT_VV(vnmsub_vv_d, 8)
1986 
1987 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1988 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1989 {                                                                   \
1990     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1991     TD d = *((TD *)vd + HD(i));                                     \
1992     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1993 }
1994 
1995 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1996 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1997 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1998 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1999 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
2000 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
2001 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
2002 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
2003 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
2004 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
2005 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
2006 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
2007 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
2008 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
2009 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
2010 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
2011 GEN_VEXT_VX(vmacc_vx_b, 1)
2012 GEN_VEXT_VX(vmacc_vx_h, 2)
2013 GEN_VEXT_VX(vmacc_vx_w, 4)
2014 GEN_VEXT_VX(vmacc_vx_d, 8)
2015 GEN_VEXT_VX(vnmsac_vx_b, 1)
2016 GEN_VEXT_VX(vnmsac_vx_h, 2)
2017 GEN_VEXT_VX(vnmsac_vx_w, 4)
2018 GEN_VEXT_VX(vnmsac_vx_d, 8)
2019 GEN_VEXT_VX(vmadd_vx_b, 1)
2020 GEN_VEXT_VX(vmadd_vx_h, 2)
2021 GEN_VEXT_VX(vmadd_vx_w, 4)
2022 GEN_VEXT_VX(vmadd_vx_d, 8)
2023 GEN_VEXT_VX(vnmsub_vx_b, 1)
2024 GEN_VEXT_VX(vnmsub_vx_h, 2)
2025 GEN_VEXT_VX(vnmsub_vx_w, 4)
2026 GEN_VEXT_VX(vnmsub_vx_d, 8)
2027 
2028 /* Vector Widening Integer Multiply-Add Instructions */
2029 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2030 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2031 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2032 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2033 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2034 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2035 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2036 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2037 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2038 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2039 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2040 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2041 GEN_VEXT_VV(vwmacc_vv_b, 2)
2042 GEN_VEXT_VV(vwmacc_vv_h, 4)
2043 GEN_VEXT_VV(vwmacc_vv_w, 8)
2044 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2045 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2046 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2047 
2048 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2049 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2050 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2051 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2052 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2053 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2054 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2055 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2056 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2057 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2058 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2059 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2060 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2061 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2062 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2063 GEN_VEXT_VX(vwmacc_vx_b, 2)
2064 GEN_VEXT_VX(vwmacc_vx_h, 4)
2065 GEN_VEXT_VX(vwmacc_vx_w, 8)
2066 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2067 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2068 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2069 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2070 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2071 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2072 
2073 /* Vector Integer Merge and Move Instructions */
2074 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2075 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2076                   uint32_t desc)                                     \
2077 {                                                                    \
2078     uint32_t vl = env->vl;                                           \
2079     uint32_t esz = sizeof(ETYPE);                                    \
2080     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2081     uint32_t vta = vext_vta(desc);                                   \
2082     uint32_t i;                                                      \
2083                                                                      \
2084     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2085                                                                      \
2086     for (i = env->vstart; i < vl; i++) {                             \
2087         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2088         *((ETYPE *)vd + H(i)) = s1;                                  \
2089     }                                                                \
2090     env->vstart = 0;                                                 \
2091     /* set tail elements to 1s */                                    \
2092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2093 }
2094 
2095 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2096 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2097 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2098 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2099 
2100 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2101 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2102                   uint32_t desc)                                     \
2103 {                                                                    \
2104     uint32_t vl = env->vl;                                           \
2105     uint32_t esz = sizeof(ETYPE);                                    \
2106     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2107     uint32_t vta = vext_vta(desc);                                   \
2108     uint32_t i;                                                      \
2109                                                                      \
2110     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2111                                                                      \
2112     for (i = env->vstart; i < vl; i++) {                             \
2113         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2114     }                                                                \
2115     env->vstart = 0;                                                 \
2116     /* set tail elements to 1s */                                    \
2117     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2118 }
2119 
2120 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2121 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2122 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2123 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2124 
2125 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2126 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2127                   CPURISCVState *env, uint32_t desc)                 \
2128 {                                                                    \
2129     uint32_t vl = env->vl;                                           \
2130     uint32_t esz = sizeof(ETYPE);                                    \
2131     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2132     uint32_t vta = vext_vta(desc);                                   \
2133     uint32_t i;                                                      \
2134                                                                      \
2135     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2136                                                                      \
2137     for (i = env->vstart; i < vl; i++) {                             \
2138         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2139         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2140     }                                                                \
2141     env->vstart = 0;                                                 \
2142     /* set tail elements to 1s */                                    \
2143     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2144 }
2145 
2146 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2147 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2148 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2149 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2150 
2151 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2152 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2153                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2154 {                                                                    \
2155     uint32_t vl = env->vl;                                           \
2156     uint32_t esz = sizeof(ETYPE);                                    \
2157     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2158     uint32_t vta = vext_vta(desc);                                   \
2159     uint32_t i;                                                      \
2160                                                                      \
2161     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2162                                                                      \
2163     for (i = env->vstart; i < vl; i++) {                             \
2164         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2165         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2166                    (ETYPE)(target_long)s1);                          \
2167         *((ETYPE *)vd + H(i)) = d;                                   \
2168     }                                                                \
2169     env->vstart = 0;                                                 \
2170     /* set tail elements to 1s */                                    \
2171     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2172 }
2173 
2174 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2175 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2176 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2177 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2178 
2179 /*
2180  * Vector Fixed-Point Arithmetic Instructions
2181  */
2182 
2183 /* Vector Single-Width Saturating Add and Subtract */
2184 
2185 /*
2186  * As fixed-point instructions generally have a rounding mode and saturation,
2187  * define common macros for fixed point here.
2188  */
2189 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2190                           CPURISCVState *env, int vxrm);
2191 
2192 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2193 static inline void                                                  \
2194 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2195           CPURISCVState *env, int vxrm)                             \
2196 {                                                                   \
2197     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2198     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2199     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2200 }
2201 
2202 static inline void
2203 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2204              CPURISCVState *env,
2205              uint32_t vl, uint32_t vm, int vxrm,
2206              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2207 {
2208     for (uint32_t i = env->vstart; i < vl; i++) {
2209         if (!vm && !vext_elem_mask(v0, i)) {
2210             /* set masked-off elements to 1s */
2211             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2212             continue;
2213         }
2214         fn(vd, vs1, vs2, i, env, vxrm);
2215     }
2216     env->vstart = 0;
2217 }
2218 
2219 static inline void
2220 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2221              CPURISCVState *env,
2222              uint32_t desc,
2223              opivv2_rm_fn *fn, uint32_t esz)
2224 {
2225     uint32_t vm = vext_vm(desc);
2226     uint32_t vl = env->vl;
2227     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2228     uint32_t vta = vext_vta(desc);
2229     uint32_t vma = vext_vma(desc);
2230 
2231     VSTART_CHECK_EARLY_EXIT(env, vl);
2232 
2233     switch (env->vxrm) {
2234     case 0: /* rnu */
2235         vext_vv_rm_1(vd, v0, vs1, vs2,
2236                      env, vl, vm, 0, fn, vma, esz);
2237         break;
2238     case 1: /* rne */
2239         vext_vv_rm_1(vd, v0, vs1, vs2,
2240                      env, vl, vm, 1, fn, vma, esz);
2241         break;
2242     case 2: /* rdn */
2243         vext_vv_rm_1(vd, v0, vs1, vs2,
2244                      env, vl, vm, 2, fn, vma, esz);
2245         break;
2246     default: /* rod */
2247         vext_vv_rm_1(vd, v0, vs1, vs2,
2248                      env, vl, vm, 3, fn, vma, esz);
2249         break;
2250     }
2251     /* set tail elements to 1s */
2252     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2253 }
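
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file.  It shows the specialisation pattern vext_vv_rm_2() above appears to
 * use: dispatching once on the runtime rounding mode and passing it on as a
 * literal lets the inlined per-element code see a compile-time constant, so
 * mode-dependent branches inside the loop can be folded away, one specialised
 * loop per mode.  Guarded with #if 0 so it never enters a build.
 */
#if 0
#include <stdint.h>

static inline uint32_t toy_loop(const uint32_t *in, uint32_t n, int mode)
{
    uint32_t acc = 0;

    for (uint32_t i = 0; i < n; i++) {
        /* with a literal 'mode' this test disappears after inlining */
        acc += (mode == 0) ? in[i] : in[i] + 1;
    }
    return acc;
}

static uint32_t toy_dispatch(const uint32_t *in, uint32_t n, int runtime_mode)
{
    switch (runtime_mode) {
    case 0:
        return toy_loop(in, n, 0);
    default:
        return toy_loop(in, n, 1);
    }
}
#endif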
2254 
2255 /* generate helpers for fixed point instructions with OPIVV format */
2256 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2257 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2258                   CPURISCVState *env, uint32_t desc)            \
2259 {                                                               \
2260     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2261                  do_##NAME, ESZ);                               \
2262 }
2263 
2264 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2265                              uint8_t b)
2266 {
2267     uint8_t res = a + b;
2268     if (res < a) {
2269         res = UINT8_MAX;
2270         env->vxsat = 0x1;
2271     }
2272     return res;
2273 }
2274 
2275 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2276                                uint16_t b)
2277 {
2278     uint16_t res = a + b;
2279     if (res < a) {
2280         res = UINT16_MAX;
2281         env->vxsat = 0x1;
2282     }
2283     return res;
2284 }
2285 
2286 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2287                                uint32_t b)
2288 {
2289     uint32_t res = a + b;
2290     if (res < a) {
2291         res = UINT32_MAX;
2292         env->vxsat = 0x1;
2293     }
2294     return res;
2295 }
2296 
2297 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2298                                uint64_t b)
2299 {
2300     uint64_t res = a + b;
2301     if (res < a) {
2302         res = UINT64_MAX;
2303         env->vxsat = 0x1;
2304     }
2305     return res;
2306 }
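
/*
 * Editor's note: illustrative, standalone sketch, not part of the upstream
 * file (toy_* names are hypothetical).  The saddu* helpers above rely on the
 * fact that for unsigned addition the truncated sum is smaller than an
 * operand exactly when the addition wrapped; on overflow the result clamps
 * to the type maximum and the sticky vxsat flag is set.  Guarded with #if 0
 * so it never enters a build.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static uint8_t toy_saddu8(uint8_t a, uint8_t b, int *sat)
{
    uint8_t res = a + b;

    if (res < a) {              /* wrapped, so saturate */
        res = UINT8_MAX;
        *sat = 1;
    }
    return res;
}

static void toy_check_saddu(void)
{
    int sat = 0;

    assert(toy_saddu8(250, 10, &sat) == 255 && sat == 1);
    sat = 0;
    assert(toy_saddu8(100, 10, &sat) == 110 && sat == 0);
}
#endif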
2307 
2308 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2309 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2310 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2311 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2312 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2313 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2314 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2315 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2316 
2317 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2318                           CPURISCVState *env, int vxrm);
2319 
2320 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2321 static inline void                                                  \
2322 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2323           CPURISCVState *env, int vxrm)                             \
2324 {                                                                   \
2325     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2326     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2327 }
2328 
2329 static inline void
2330 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2331              CPURISCVState *env,
2332              uint32_t vl, uint32_t vm, int vxrm,
2333              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2334 {
2335     for (uint32_t i = env->vstart; i < vl; i++) {
2336         if (!vm && !vext_elem_mask(v0, i)) {
2337             /* set masked-off elements to 1s */
2338             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2339             continue;
2340         }
2341         fn(vd, s1, vs2, i, env, vxrm);
2342     }
2343     env->vstart = 0;
2344 }
2345 
2346 static inline void
2347 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2348              CPURISCVState *env,
2349              uint32_t desc,
2350              opivx2_rm_fn *fn, uint32_t esz)
2351 {
2352     uint32_t vm = vext_vm(desc);
2353     uint32_t vl = env->vl;
2354     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2355     uint32_t vta = vext_vta(desc);
2356     uint32_t vma = vext_vma(desc);
2357 
2358     VSTART_CHECK_EARLY_EXIT(env, vl);
2359 
2360     switch (env->vxrm) {
2361     case 0: /* rnu */
2362         vext_vx_rm_1(vd, v0, s1, vs2,
2363                      env, vl, vm, 0, fn, vma, esz);
2364         break;
2365     case 1: /* rne */
2366         vext_vx_rm_1(vd, v0, s1, vs2,
2367                      env, vl, vm, 1, fn, vma, esz);
2368         break;
2369     case 2: /* rdn */
2370         vext_vx_rm_1(vd, v0, s1, vs2,
2371                      env, vl, vm, 2, fn, vma, esz);
2372         break;
2373     default: /* rod */
2374         vext_vx_rm_1(vd, v0, s1, vs2,
2375                      env, vl, vm, 3, fn, vma, esz);
2376         break;
2377     }
2378     /* set tail elements to 1s */
2379     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2380 }
2381 
2382 /* generate helpers for fixed point instructions with OPIVX format */
2383 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2384 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2385                   void *vs2, CPURISCVState *env,          \
2386                   uint32_t desc)                          \
2387 {                                                         \
2388     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2389                  do_##NAME, ESZ);                         \
2390 }
2391 
2392 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2393 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2394 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2395 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2396 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2397 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2398 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2399 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2400 
2401 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2402 {
2403     int8_t res = a + b;
2404     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2405         res = a > 0 ? INT8_MAX : INT8_MIN;
2406         env->vxsat = 0x1;
2407     }
2408     return res;
2409 }
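/*
 * (res ^ a) & (res ^ b) has the sign bit set exactly when a and b share
 * a sign but the wrapped sum does not, i.e. on signed overflow.  For
 * example, sadd8 with a = 0x70 and b = 0x20 wraps to res = 0x90; both
 * XOR terms have bit 7 set, so the result saturates to INT8_MAX.
 */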
2410 
2411 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2412                              int16_t b)
2413 {
2414     int16_t res = a + b;
2415     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2416         res = a > 0 ? INT16_MAX : INT16_MIN;
2417         env->vxsat = 0x1;
2418     }
2419     return res;
2420 }
2421 
2422 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2423                              int32_t b)
2424 {
2425     int32_t res = a + b;
2426     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2427         res = a > 0 ? INT32_MAX : INT32_MIN;
2428         env->vxsat = 0x1;
2429     }
2430     return res;
2431 }
2432 
2433 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2434                              int64_t b)
2435 {
2436     int64_t res = a + b;
2437     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2438         res = a > 0 ? INT64_MAX : INT64_MIN;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2445 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2446 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2447 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2448 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2449 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2450 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2451 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2452 
2453 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2454 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2455 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2456 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2457 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2458 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2459 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2460 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2461 
2462 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2463                              uint8_t b)
2464 {
2465     uint8_t res = a - b;
2466     if (res > a) {
2467         res = 0;
2468         env->vxsat = 0x1;
2469     }
2470     return res;
2471 }
2472 
2473 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2474                                uint16_t b)
2475 {
2476     uint16_t res = a - b;
2477     if (res > a) {
2478         res = 0;
2479         env->vxsat = 0x1;
2480     }
2481     return res;
2482 }
2483 
2484 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2485                                uint32_t b)
2486 {
2487     uint32_t res = a - b;
2488     if (res > a) {
2489         res = 0;
2490         env->vxsat = 0x1;
2491     }
2492     return res;
2493 }
2494 
2495 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2496                                uint64_t b)
2497 {
2498     uint64_t res = a - b;
2499     if (res > a) {
2500         res = 0;
2501         env->vxsat = 0x1;
2502     }
2503     return res;
2504 }
2505 
2506 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2507 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2508 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2509 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2510 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2511 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2512 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2513 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2514 
2515 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2516 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2517 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2518 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2519 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2520 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2521 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2522 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2523 
2524 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2525 {
2526     int8_t res = a - b;
2527     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2528         res = a >= 0 ? INT8_MAX : INT8_MIN;
2529         env->vxsat = 0x1;
2530     }
2531     return res;
2532 }
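/*
 * For subtraction the test is (res ^ a) & (a ^ b): overflow is only
 * possible when a and b differ in sign, and it happened when the wrapped
 * difference also differs in sign from a, so the result saturates toward
 * a's sign (INT8_MAX for a >= 0, INT8_MIN otherwise).
 */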
2533 
2534 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2535                              int16_t b)
2536 {
2537     int16_t res = a - b;
2538     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2539         res = a >= 0 ? INT16_MAX : INT16_MIN;
2540         env->vxsat = 0x1;
2541     }
2542     return res;
2543 }
2544 
2545 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2546                              int32_t b)
2547 {
2548     int32_t res = a - b;
2549     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2550         res = a >= 0 ? INT32_MAX : INT32_MIN;
2551         env->vxsat = 0x1;
2552     }
2553     return res;
2554 }
2555 
2556 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2557                              int64_t b)
2558 {
2559     int64_t res = a - b;
2560     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2561         res = a >= 0 ? INT64_MAX : INT64_MIN;
2562         env->vxsat = 0x1;
2563     }
2564     return res;
2565 }
2566 
2567 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2568 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2569 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2570 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2571 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2572 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2573 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2574 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2575 
2576 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2577 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2578 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2579 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2580 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2581 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2582 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2583 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2584 
2585 /* Vector Single-Width Averaging Add and Subtract */
2586 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2587 {
2588     uint8_t d = extract64(v, shift, 1);
2589     uint8_t d1;
2590     uint64_t D1, D2;
2591 
2592     if (shift == 0 || shift > 64) {
2593         return 0;
2594     }
2595 
2596     d1 = extract64(v, shift - 1, 1);
2597     D1 = extract64(v, 0, shift);
2598     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2599         return d1;
2600     } else if (vxrm == 1) { /* round-to-nearest-even */
2601         if (shift > 1) {
2602             D2 = extract64(v, 0, shift - 1);
2603             return d1 & ((D2 != 0) | d);
2604         } else {
2605             return d1 & d;
2606         }
2607     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2608         return !d & (D1 != 0);
2609     }
2610     return 0; /* round-down (truncate) */
2611 }
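/*
 * Example: get_round(vxrm, 0b1011, 2) decides how to round 11/4 = 2.75
 * after the caller shifts right by 2 (truncating to 2): rnu and rne both
 * return 1 (round to 3), rdn returns 0 (truncate to 2), and rod returns 1
 * because bits were shifted out while the kept LSB is 0 (jam to 3).
 */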
2612 
2613 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2614                              int32_t b)
2615 {
2616     int64_t res = (int64_t)a + b;
2617     uint8_t round = get_round(vxrm, res, 1);
2618 
2619     return (res >> 1) + round;
2620 }
2621 
2622 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2623                              int64_t b)
2624 {
2625     int64_t res = a + b;
2626     uint8_t round = get_round(vxrm, res, 1);
2627     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2628 
2629     /* With signed overflow, bit 64 is inverse of bit 63. */
2630     return ((res >> 1) ^ over) + round;
2631 }
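/*
 * Example: aadd64(INT64_MAX, INT64_MAX) wraps to res = -2 with 'over'
 * set; the arithmetic shift gives -1, XOR-ing with 'over' restores the
 * true bit 63 of the 65-bit sum, yielding INT64_MAX, the exact average
 * (no rounding increment is added for this input in any vxrm mode).
 */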
2632 
2633 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2634 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2635 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2636 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2637 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2638 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2639 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2640 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2641 
2642 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2643 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2644 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2645 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2646 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2647 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2648 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2649 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2650 
2651 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2652                                uint32_t a, uint32_t b)
2653 {
2654     uint64_t res = (uint64_t)a + b;
2655     uint8_t round = get_round(vxrm, res, 1);
2656 
2657     return (res >> 1) + round;
2658 }
2659 
2660 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2661                                uint64_t a, uint64_t b)
2662 {
2663     uint64_t res = a + b;
2664     uint8_t round = get_round(vxrm, res, 1);
2665     uint64_t over = (uint64_t)(res < a) << 63;
2666 
2667     return ((res >> 1) | over) + round;
2668 }
2669 
2670 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2671 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2672 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2673 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2674 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2675 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2676 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2677 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2678 
2679 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2680 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2681 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2682 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2683 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2684 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2685 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2686 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2687 
2688 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2689                              int32_t b)
2690 {
2691     int64_t res = (int64_t)a - b;
2692     uint8_t round = get_round(vxrm, res, 1);
2693 
2694     return (res >> 1) + round;
2695 }
2696 
2697 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2698                              int64_t b)
2699 {
2700     int64_t res = (int64_t)a - b;
2701     uint8_t round = get_round(vxrm, res, 1);
2702     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2703 
2704     /* With signed overflow, bit 64 is inverse of bit 63. */
2705     return ((res >> 1) ^ over) + round;
2706 }
2707 
2708 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2709 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2710 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2711 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2712 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2713 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2714 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2715 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2716 
2717 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2718 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2719 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2720 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2721 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2722 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2723 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2724 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2725 
2726 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2727                                uint32_t a, uint32_t b)
2728 {
2729     int64_t res = (int64_t)a - b;
2730     uint8_t round = get_round(vxrm, res, 1);
2731 
2732     return (res >> 1) + round;
2733 }
2734 
2735 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2736                                uint64_t a, uint64_t b)
2737 {
2738     uint64_t res = (uint64_t)a - b;
2739     uint8_t round = get_round(vxrm, res, 1);
2740     uint64_t over = (uint64_t)(res > a) << 63;
2741 
2742     return ((res >> 1) | over) + round;
2743 }
2744 
2745 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2746 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2747 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2748 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2749 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2750 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2751 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2752 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2753 
2754 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2755 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2756 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2757 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2758 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2759 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2760 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2761 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2762 
2763 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
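/*
 * Per the RVV spec, vsmul produces the double-width product shifted
 * right by SEW - 1 with vxrm rounding, i.e. roughly (vs2 * vs1) >>
 * (SEW - 1); only the "both operands most-negative" case can overflow,
 * and it saturates to the maximum value with vxsat set.
 */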
2764 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2765 {
2766     uint8_t round;
2767     int16_t res;
2768 
2769     res = (int16_t)a * (int16_t)b;
2770     round = get_round(vxrm, res, 7);
2771     res = (res >> 7) + round;
2772 
2773     if (res > INT8_MAX) {
2774         env->vxsat = 0x1;
2775         return INT8_MAX;
2776     } else if (res < INT8_MIN) {
2777         env->vxsat = 0x1;
2778         return INT8_MIN;
2779     } else {
2780         return res;
2781     }
2782 }
2783 
2784 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2785 {
2786     uint8_t round;
2787     int32_t res;
2788 
2789     res = (int32_t)a * (int32_t)b;
2790     round = get_round(vxrm, res, 15);
2791     res = (res >> 15) + round;
2792 
2793     if (res > INT16_MAX) {
2794         env->vxsat = 0x1;
2795         return INT16_MAX;
2796     } else if (res < INT16_MIN) {
2797         env->vxsat = 0x1;
2798         return INT16_MIN;
2799     } else {
2800         return res;
2801     }
2802 }
2803 
2804 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2805 {
2806     uint8_t round;
2807     int64_t res;
2808 
2809     res = (int64_t)a * (int64_t)b;
2810     round = get_round(vxrm, res, 31);
2811     res = (res >> 31) + round;
2812 
2813     if (res > INT32_MAX) {
2814         env->vxsat = 0x1;
2815         return INT32_MAX;
2816     } else if (res < INT32_MIN) {
2817         env->vxsat = 0x1;
2818         return INT32_MIN;
2819     } else {
2820         return res;
2821     }
2822 }
2823 
2824 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2825 {
2826     uint8_t round;
2827     uint64_t hi_64, lo_64;
2828     int64_t res;
2829 
2830     if (a == INT64_MIN && b == INT64_MIN) {
2831         env->vxsat = 1;
2832         return INT64_MAX;
2833     }
2834 
2835     muls64(&lo_64, &hi_64, a, b);
2836     round = get_round(vxrm, lo_64, 63);
2837     /*
2838      * Cannot overflow, as there are always
2839      * 2 sign bits after multiply.
2840      */
2841     res = (hi_64 << 1) | (lo_64 >> 63);
2842     if (round) {
2843         if (res == INT64_MAX) {
2844             env->vxsat = 1;
2845         } else {
2846             res += 1;
2847         }
2848     }
2849     return res;
2850 }
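/*
 * With the INT64_MIN * INT64_MIN case excluded above, |a * b| is below
 * 2^126, so bits 127 and 126 of the 128-bit product are both sign bits
 * and (hi_64 << 1) | (lo_64 >> 63) only drops the redundant copy; the
 * explicit INT64_MAX check then guards the rounding increment without
 * needing a wider type.
 */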
2851 
2852 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2853 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2854 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2855 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2856 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2857 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2858 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2859 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2860 
2861 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2862 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2863 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2864 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2865 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2866 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2867 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2868 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2869 
2870 /* Vector Single-Width Scaling Shift Instructions */
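/*
 * vssrl/vssra shift each element right by the low log2(SEW) bits of the
 * other operand and add the get_round() increment; the result still fits
 * in SEW bits, so unlike the narrowing clips below no saturation is done.
 */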
2871 static inline uint8_t
2872 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2873 {
2874     uint8_t round, shift = b & 0x7;
2875     uint8_t res;
2876 
2877     round = get_round(vxrm, a, shift);
2878     res = (a >> shift) + round;
2879     return res;
2880 }
2881 static inline uint16_t
2882 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2883 {
2884     uint8_t round, shift = b & 0xf;
2885 
2886     round = get_round(vxrm, a, shift);
2887     return (a >> shift) + round;
2888 }
2889 static inline uint32_t
2890 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2891 {
2892     uint8_t round, shift = b & 0x1f;
2893 
2894     round = get_round(vxrm, a, shift);
2895     return (a >> shift) + round;
2896 }
2897 static inline uint64_t
2898 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2899 {
2900     uint8_t round, shift = b & 0x3f;
2901 
2902     round = get_round(vxrm, a, shift);
2903     return (a >> shift) + round;
2904 }
2905 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2906 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2907 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2908 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2909 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2910 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2911 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2912 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2913 
2914 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2915 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2916 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2917 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2918 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2919 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2920 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2921 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2922 
2923 static inline int8_t
2924 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2925 {
2926     uint8_t round, shift = b & 0x7;
2927 
2928     round = get_round(vxrm, a, shift);
2929     return (a >> shift) + round;
2930 }
2931 static inline int16_t
2932 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2933 {
2934     uint8_t round, shift = b & 0xf;
2935 
2936     round = get_round(vxrm, a, shift);
2937     return (a >> shift) + round;
2938 }
2939 static inline int32_t
2940 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2941 {
2942     uint8_t round, shift = b & 0x1f;
2943 
2944     round = get_round(vxrm, a, shift);
2945     return (a >> shift) + round;
2946 }
2947 static inline int64_t
2948 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2949 {
2950     uint8_t round, shift = b & 0x3f;
2951 
2952     round = get_round(vxrm, a, shift);
2953     return (a >> shift) + round;
2954 }
2955 
2956 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2957 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2958 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2959 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2960 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2961 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2962 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2963 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2964 
2965 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2966 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2967 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2968 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2969 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2970 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2971 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2972 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2973 
2974 /* Vector Narrowing Fixed-Point Clip Instructions */
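/*
 * The narrowing clips take a 2*SEW-wide source element, shift it right
 * by the low log2(2*SEW) bits of the other operand, round per vxrm, and
 * saturate the result to the destination's SEW-wide signed or unsigned
 * range, setting vxsat whenever clipping occurs.
 */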
2975 static inline int8_t
2976 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2977 {
2978     uint8_t round, shift = b & 0xf;
2979     int16_t res;
2980 
2981     round = get_round(vxrm, a, shift);
2982     res = (a >> shift) + round;
2983     if (res > INT8_MAX) {
2984         env->vxsat = 0x1;
2985         return INT8_MAX;
2986     } else if (res < INT8_MIN) {
2987         env->vxsat = 0x1;
2988         return INT8_MIN;
2989     } else {
2990         return res;
2991     }
2992 }
2993 
2994 static inline int16_t
2995 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2996 {
2997     uint8_t round, shift = b & 0x1f;
2998     int32_t res;
2999 
3000     round = get_round(vxrm, a, shift);
3001     res = (a >> shift) + round;
3002     if (res > INT16_MAX) {
3003         env->vxsat = 0x1;
3004         return INT16_MAX;
3005     } else if (res < INT16_MIN) {
3006         env->vxsat = 0x1;
3007         return INT16_MIN;
3008     } else {
3009         return res;
3010     }
3011 }
3012 
3013 static inline int32_t
3014 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
3015 {
3016     uint8_t round, shift = b & 0x3f;
3017     int64_t res;
3018 
3019     round = get_round(vxrm, a, shift);
3020     res = (a >> shift) + round;
3021     if (res > INT32_MAX) {
3022         env->vxsat = 0x1;
3023         return INT32_MAX;
3024     } else if (res < INT32_MIN) {
3025         env->vxsat = 0x1;
3026         return INT32_MIN;
3027     } else {
3028         return res;
3029     }
3030 }
3031 
3032 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3033 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3034 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3035 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3036 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3037 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3038 
3039 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3040 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3041 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3042 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3043 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3044 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3045 
3046 static inline uint8_t
3047 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3048 {
3049     uint8_t round, shift = b & 0xf;
3050     uint16_t res;
3051 
3052     round = get_round(vxrm, a, shift);
3053     res = (a >> shift) + round;
3054     if (res > UINT8_MAX) {
3055         env->vxsat = 0x1;
3056         return UINT8_MAX;
3057     } else {
3058         return res;
3059     }
3060 }
3061 
3062 static inline uint16_t
3063 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3064 {
3065     uint8_t round, shift = b & 0x1f;
3066     uint32_t res;
3067 
3068     round = get_round(vxrm, a, shift);
3069     res = (a >> shift) + round;
3070     if (res > UINT16_MAX) {
3071         env->vxsat = 0x1;
3072         return UINT16_MAX;
3073     } else {
3074         return res;
3075     }
3076 }
3077 
3078 static inline uint32_t
3079 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3080 {
3081     uint8_t round, shift = b & 0x3f;
3082     uint64_t res;
3083 
3084     round = get_round(vxrm, a, shift);
3085     res = (a >> shift) + round;
3086     if (res > UINT32_MAX) {
3087         env->vxsat = 0x1;
3088         return UINT32_MAX;
3089     } else {
3090         return res;
3091     }
3092 }
3093 
3094 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3095 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3096 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3097 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3098 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3099 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3100 
3101 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3102 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3103 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3104 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3105 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3106 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3107 
3108 /*
3109  * Vector Floating-Point Arithmetic Instructions
3110  */
3111 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3112 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3113 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3114                       CPURISCVState *env)                      \
3115 {                                                              \
3116     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3117     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3118     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3119 }
3120 
3121 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3122 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3123                   void *vs2, CPURISCVState *env,          \
3124                   uint32_t desc)                          \
3125 {                                                         \
3126     uint32_t vm = vext_vm(desc);                          \
3127     uint32_t vl = env->vl;                                \
3128     uint32_t total_elems =                                \
3129         vext_get_total_elems(env, desc, ESZ);             \
3130     uint32_t vta = vext_vta(desc);                        \
3131     uint32_t vma = vext_vma(desc);                        \
3132     uint32_t i;                                           \
3133                                                           \
3134     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3135                                                           \
3136     for (i = env->vstart; i < vl; i++) {                  \
3137         if (!vm && !vext_elem_mask(v0, i)) {              \
3138             /* set masked-off elements to 1s */           \
3139             vext_set_elems_1s(vd, vma, i * ESZ,           \
3140                               (i + 1) * ESZ);             \
3141             continue;                                     \
3142         }                                                 \
3143         do_##NAME(vd, vs1, vs2, i, env);                  \
3144     }                                                     \
3145     env->vstart = 0;                                      \
3146     /* set tail elements to 1s */                         \
3147     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3148                       total_elems * ESZ);                 \
3149 }
3150 
3151 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3152 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3153 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3154 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3155 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3156 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3157 
3158 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3159 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3160                       CPURISCVState *env)                      \
3161 {                                                              \
3162     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3163     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3164 }
3165 
3166 #define GEN_VEXT_VF(NAME, ESZ)                            \
3167 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3168                   void *vs2, CPURISCVState *env,          \
3169                   uint32_t desc)                          \
3170 {                                                         \
3171     uint32_t vm = vext_vm(desc);                          \
3172     uint32_t vl = env->vl;                                \
3173     uint32_t total_elems =                                \
3174         vext_get_total_elems(env, desc, ESZ);             \
3175     uint32_t vta = vext_vta(desc);                        \
3176     uint32_t vma = vext_vma(desc);                        \
3177     uint32_t i;                                           \
3178                                                           \
3179     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3180                                                           \
3181     for (i = env->vstart; i < vl; i++) {                  \
3182         if (!vm && !vext_elem_mask(v0, i)) {              \
3183             /* set masked-off elements to 1s */           \
3184             vext_set_elems_1s(vd, vma, i * ESZ,           \
3185                               (i + 1) * ESZ);             \
3186             continue;                                     \
3187         }                                                 \
3188         do_##NAME(vd, s1, vs2, i, env);                   \
3189     }                                                     \
3190     env->vstart = 0;                                      \
3191     /* set tail elements to 1s */                         \
3192     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3193                       total_elems * ESZ);                 \
3194 }
3195 
3196 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3197 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3198 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3199 GEN_VEXT_VF(vfadd_vf_h, 2)
3200 GEN_VEXT_VF(vfadd_vf_w, 4)
3201 GEN_VEXT_VF(vfadd_vf_d, 8)
3202 
3203 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3204 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3205 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3206 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3207 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3208 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3209 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3210 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3211 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3212 GEN_VEXT_VF(vfsub_vf_h, 2)
3213 GEN_VEXT_VF(vfsub_vf_w, 4)
3214 GEN_VEXT_VF(vfsub_vf_d, 8)
3215 
3216 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3217 {
3218     return float16_sub(b, a, s);
3219 }
3220 
3221 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3222 {
3223     return float32_sub(b, a, s);
3224 }
3225 
3226 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3227 {
3228     return float64_sub(b, a, s);
3229 }
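/*
 * The *_rsub wrappers only swap the operands, letting vfrsub.vf reuse
 * the OPFVF2 plumbing while computing f[rs1] - vs2[i] per element.
 */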
3230 
3231 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3232 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3233 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3234 GEN_VEXT_VF(vfrsub_vf_h, 2)
3235 GEN_VEXT_VF(vfrsub_vf_w, 4)
3236 GEN_VEXT_VF(vfrsub_vf_d, 8)
3237 
3238 /* Vector Widening Floating-Point Add/Subtract Instructions */
3239 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3240 {
3241     return float32_add(float16_to_float32(a, true, s),
3242                        float16_to_float32(b, true, s), s);
3243 }
3244 
3245 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3246 {
3247     return float64_add(float32_to_float64(a, s),
3248                        float32_to_float64(b, s), s);
3249 
3250 }
3251 
3252 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3253 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3254 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3255 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3256 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3257 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3258 GEN_VEXT_VF(vfwadd_vf_h, 4)
3259 GEN_VEXT_VF(vfwadd_vf_w, 8)
3260 
3261 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3262 {
3263     return float32_sub(float16_to_float32(a, true, s),
3264                        float16_to_float32(b, true, s), s);
3265 }
3266 
3267 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3268 {
3269     return float64_sub(float32_to_float64(a, s),
3270                        float32_to_float64(b, s), s);
3271 
3272 }
3273 
3274 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3275 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3276 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3277 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3278 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3279 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3280 GEN_VEXT_VF(vfwsub_vf_h, 4)
3281 GEN_VEXT_VF(vfwsub_vf_w, 8)
3282 
3283 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3284 {
3285     return float32_add(a, float16_to_float32(b, true, s), s);
3286 }
3287 
3288 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3289 {
3290     return float64_add(a, float32_to_float64(b, s), s);
3291 }
3292 
3293 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3294 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3295 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3296 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3297 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3298 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3299 GEN_VEXT_VF(vfwadd_wf_h, 4)
3300 GEN_VEXT_VF(vfwadd_wf_w, 8)
3301 
3302 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3303 {
3304     return float32_sub(a, float16_to_float32(b, true, s), s);
3305 }
3306 
3307 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3308 {
3309     return float64_sub(a, float32_to_float64(b, s), s);
3310 }
3311 
3312 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3313 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3314 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3315 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3316 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3317 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3318 GEN_VEXT_VF(vfwsub_wf_h, 4)
3319 GEN_VEXT_VF(vfwsub_wf_w, 8)
3320 
3321 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3322 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3323 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3324 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3325 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3326 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3327 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3328 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3329 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3330 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3331 GEN_VEXT_VF(vfmul_vf_h, 2)
3332 GEN_VEXT_VF(vfmul_vf_w, 4)
3333 GEN_VEXT_VF(vfmul_vf_d, 8)
3334 
3335 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3336 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3337 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3338 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3339 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3340 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3341 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3342 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3343 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3344 GEN_VEXT_VF(vfdiv_vf_h, 2)
3345 GEN_VEXT_VF(vfdiv_vf_w, 4)
3346 GEN_VEXT_VF(vfdiv_vf_d, 8)
3347 
3348 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3349 {
3350     return float16_div(b, a, s);
3351 }
3352 
3353 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3354 {
3355     return float32_div(b, a, s);
3356 }
3357 
3358 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3359 {
3360     return float64_div(b, a, s);
3361 }
3362 
3363 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3364 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3365 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3366 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3367 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3368 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3369 
3370 /* Vector Widening Floating-Point Multiply */
3371 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3372 {
3373     return float32_mul(float16_to_float32(a, true, s),
3374                        float16_to_float32(b, true, s), s);
3375 }
3376 
3377 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3378 {
3379     return float64_mul(float32_to_float64(a, s),
3380                        float32_to_float64(b, s), s);
3381 
3382 }
3383 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3384 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3385 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3386 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3387 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3388 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3389 GEN_VEXT_VF(vfwmul_vf_h, 4)
3390 GEN_VEXT_VF(vfwmul_vf_w, 8)
3391 
3392 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3393 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3394 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3395                       CPURISCVState *env)                          \
3396 {                                                                  \
3397     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3398     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3399     TD d = *((TD *)vd + HD(i));                                    \
3400     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3401 }
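/*
 * OPFVV3 (and OPFVF3 below) pass (vs2, vs1/rs1, vd) to OP, so the
 * f*macc/f*msac helpers multiply the two sources and accumulate into vd,
 * while the f*madd/f*msub helpers call muladd(d, b, a) to multiply vd by
 * the vs1/rs1 operand and add vs2, matching the instruction naming.
 */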
3402 
3403 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3404 {
3405     return float16_muladd(a, b, d, 0, s);
3406 }
3407 
3408 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3409 {
3410     return float32_muladd(a, b, d, 0, s);
3411 }
3412 
3413 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3414 {
3415     return float64_muladd(a, b, d, 0, s);
3416 }
3417 
3418 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3419 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3420 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3421 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3422 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3423 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3424 
3425 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3426 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3427                       CPURISCVState *env)                         \
3428 {                                                                 \
3429     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3430     TD d = *((TD *)vd + HD(i));                                   \
3431     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3432 }
3433 
3434 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3435 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3436 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3437 GEN_VEXT_VF(vfmacc_vf_h, 2)
3438 GEN_VEXT_VF(vfmacc_vf_w, 4)
3439 GEN_VEXT_VF(vfmacc_vf_d, 8)
3440 
3441 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3442 {
3443     return float16_muladd(a, b, d, float_muladd_negate_c |
3444                                    float_muladd_negate_product, s);
3445 }
3446 
3447 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3448 {
3449     return float32_muladd(a, b, d, float_muladd_negate_c |
3450                                    float_muladd_negate_product, s);
3451 }
3452 
3453 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3454 {
3455     return float64_muladd(a, b, d, float_muladd_negate_c |
3456                                    float_muladd_negate_product, s);
3457 }
3458 
3459 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3460 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3461 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3462 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3463 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3464 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3465 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3466 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3467 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3468 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3469 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3470 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3471 
3472 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3473 {
3474     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3475 }
3476 
3477 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3478 {
3479     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3480 }
3481 
3482 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3483 {
3484     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3485 }
3486 
3487 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3488 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3489 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3490 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3491 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3492 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3493 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3494 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3495 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3496 GEN_VEXT_VF(vfmsac_vf_h, 2)
3497 GEN_VEXT_VF(vfmsac_vf_w, 4)
3498 GEN_VEXT_VF(vfmsac_vf_d, 8)
3499 
3500 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3501 {
3502     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3503 }
3504 
3505 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3506 {
3507     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3508 }
3509 
3510 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3511 {
3512     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3513 }
3514 
3515 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3516 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3517 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3518 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3519 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3520 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3521 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3522 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3523 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3524 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3525 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3526 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3527 
3528 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3529 {
3530     return float16_muladd(d, b, a, 0, s);
3531 }
3532 
3533 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3534 {
3535     return float32_muladd(d, b, a, 0, s);
3536 }
3537 
3538 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3539 {
3540     return float64_muladd(d, b, a, 0, s);
3541 }
3542 
3543 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3544 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3545 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3546 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3547 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3548 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3549 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3550 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3551 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3552 GEN_VEXT_VF(vfmadd_vf_h, 2)
3553 GEN_VEXT_VF(vfmadd_vf_w, 4)
3554 GEN_VEXT_VF(vfmadd_vf_d, 8)
3555 
3556 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3557 {
3558     return float16_muladd(d, b, a, float_muladd_negate_c |
3559                                    float_muladd_negate_product, s);
3560 }
3561 
3562 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3563 {
3564     return float32_muladd(d, b, a, float_muladd_negate_c |
3565                                    float_muladd_negate_product, s);
3566 }
3567 
3568 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3569 {
3570     return float64_muladd(d, b, a, float_muladd_negate_c |
3571                                    float_muladd_negate_product, s);
3572 }
3573 
3574 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3575 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3576 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3577 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3578 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3579 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3580 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3581 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3582 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3583 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3584 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3585 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3586 
3587 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3588 {
3589     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3590 }
3591 
3592 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3593 {
3594     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3595 }
3596 
3597 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3598 {
3599     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3600 }
3601 
3602 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3603 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3604 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3605 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3606 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3607 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3608 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3609 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3610 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3611 GEN_VEXT_VF(vfmsub_vf_h, 2)
3612 GEN_VEXT_VF(vfmsub_vf_w, 4)
3613 GEN_VEXT_VF(vfmsub_vf_d, 8)
3614 
3615 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3616 {
3617     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3618 }
3619 
3620 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3621 {
3622     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3623 }
3624 
3625 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3626 {
3627     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3628 }
3629 
3630 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3631 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3632 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3633 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3634 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3635 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3636 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3637 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3638 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3639 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3640 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3641 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3642 
3643 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3644 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3645 {
3646     return float32_muladd(float16_to_float32(a, true, s),
3647                           float16_to_float32(b, true, s), d, 0, s);
3648 }
3649 
3650 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3651 {
3652     return float64_muladd(float32_to_float64(a, s),
3653                           float32_to_float64(b, s), d, 0, s);
3654 }
3655 
3656 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3657 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3658 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3659 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3660 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3661 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3662 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3663 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3664 
3665 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3666 {
3667     return float32_muladd(bfloat16_to_float32(a, s),
3668                           bfloat16_to_float32(b, s), d, 0, s);
3669 }
3670 
3671 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3672 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3673 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3674 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3675 
3676 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3677 {
3678     return float32_muladd(float16_to_float32(a, true, s),
3679                           float16_to_float32(b, true, s), d,
3680                           float_muladd_negate_c | float_muladd_negate_product,
3681                           s);
3682 }
3683 
3684 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3685 {
3686     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3687                           d, float_muladd_negate_c |
3688                              float_muladd_negate_product, s);
3689 }
3690 
3691 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3692 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3693 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3694 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3695 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3696 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3697 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3698 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3699 
3700 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3701 {
3702     return float32_muladd(float16_to_float32(a, true, s),
3703                           float16_to_float32(b, true, s), d,
3704                           float_muladd_negate_c, s);
3705 }
3706 
3707 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3708 {
3709     return float64_muladd(float32_to_float64(a, s),
3710                           float32_to_float64(b, s), d,
3711                           float_muladd_negate_c, s);
3712 }
3713 
3714 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3715 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3716 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3717 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3718 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3719 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3720 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3721 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3722 
3723 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3724 {
3725     return float32_muladd(float16_to_float32(a, true, s),
3726                           float16_to_float32(b, true, s), d,
3727                           float_muladd_negate_product, s);
3728 }
3729 
3730 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3731 {
3732     return float64_muladd(float32_to_float64(a, s),
3733                           float32_to_float64(b, s), d,
3734                           float_muladd_negate_product, s);
3735 }
3736 
3737 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3738 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3739 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3740 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3741 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3742 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3743 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3744 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3745 
3746 /* Vector Floating-Point Square-Root Instruction */
3747 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3748 static void do_##NAME(void *vd, void *vs2, int i,      \
3749                       CPURISCVState *env)              \
3750 {                                                      \
3751     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3752     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3753 }
3754 
3755 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3756 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3757                   CPURISCVState *env, uint32_t desc)   \
3758 {                                                      \
3759     uint32_t vm = vext_vm(desc);                       \
3760     uint32_t vl = env->vl;                             \
3761     uint32_t total_elems =                             \
3762         vext_get_total_elems(env, desc, ESZ);          \
3763     uint32_t vta = vext_vta(desc);                     \
3764     uint32_t vma = vext_vma(desc);                     \
3765     uint32_t i;                                        \
3766                                                        \
3767     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3768                                                        \
3769     if (vl == 0) {                                     \
3770         return;                                        \
3771     }                                                  \
3772     for (i = env->vstart; i < vl; i++) {               \
3773         if (!vm && !vext_elem_mask(v0, i)) {           \
3774             /* set masked-off elements to 1s */        \
3775             vext_set_elems_1s(vd, vma, i * ESZ,        \
3776                               (i + 1) * ESZ);          \
3777             continue;                                  \
3778         }                                              \
3779         do_##NAME(vd, vs2, i, env);                    \
3780     }                                                  \
3781     env->vstart = 0;                                   \
3782     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3783                       total_elems * ESZ);              \
3784 }
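/*
 * OPFVV1 defines the per-element unary operation; GEN_VEXT_V_ENV wraps it
 * in the standard element loop: iterate from vstart to vl, write all-1s
 * into masked-off elements when the instruction is mask-agnostic (vma),
 * and fill the tail with 1s according to vta.
 */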
3785 
3786 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3787 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3788 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3789 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3790 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3791 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3792 
3793 /*
3794  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3795  *
3796  * Adapted from riscv-v-spec recip.c:
3797  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3798  */
3799 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3800 {
3801     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3802     uint64_t exp = extract64(f, frac_size, exp_size);
3803     uint64_t frac = extract64(f, 0, frac_size);
3804 
3805     const uint8_t lookup_table[] = {
3806         52, 51, 50, 48, 47, 46, 44, 43,
3807         42, 41, 40, 39, 38, 36, 35, 34,
3808         33, 32, 31, 30, 30, 29, 28, 27,
3809         26, 25, 24, 23, 23, 22, 21, 20,
3810         19, 19, 18, 17, 16, 16, 15, 14,
3811         14, 13, 12, 12, 11, 10, 10, 9,
3812         9, 8, 7, 7, 6, 6, 5, 4,
3813         4, 3, 3, 2, 2, 1, 1, 0,
3814         127, 125, 123, 121, 119, 118, 116, 114,
3815         113, 111, 109, 108, 106, 105, 103, 102,
3816         100, 99, 97, 96, 95, 93, 92, 91,
3817         90, 88, 87, 86, 85, 84, 83, 82,
3818         80, 79, 78, 77, 76, 75, 74, 73,
3819         72, 71, 70, 70, 69, 68, 67, 66,
3820         65, 64, 63, 63, 62, 61, 60, 59,
3821         59, 58, 57, 56, 56, 55, 54, 53
3822     };
3823     const int precision = 7;
3824 
3825     if (exp == 0 && frac != 0) { /* subnormal */
3826         /* Normalize the subnormal. */
3827         while (extract64(frac, frac_size - 1, 1) == 0) {
3828             exp--;
3829             frac <<= 1;
3830         }
3831 
3832         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3833     }
3834 
3835     int idx = ((exp & 1) << (precision - 1)) |
3836               (frac >> (frac_size - precision + 1));
3837     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3838                         (frac_size - precision);
3839     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3840 
3841     uint64_t val = 0;
3842     val = deposit64(val, 0, frac_size, out_frac);
3843     val = deposit64(val, frac_size, exp_size, out_exp);
3844     val = deposit64(val, frac_size + exp_size, 1, sign);
3845     return val;
3846 }
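/*
 * Worked example (float32): for f = 4.0f, exp = 129 and frac = 0, so
 * idx = ((129 & 1) << 6) | 0 = 64 and lookup_table[64] = 127.  The output
 * exponent is (3 * 127 - 1 - 129) / 2 = 125, giving
 * (1 + 127/128) * 2^-2 = 0.498046875, i.e. 1/sqrt(4.0) = 0.5 to within
 * the estimate's 2^-7 relative accuracy.
 */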
3847 
3848 static float16 frsqrt7_h(float16 f, float_status *s)
3849 {
3850     int exp_size = 5, frac_size = 10;
3851     bool sign = float16_is_neg(f);
3852 
3853     /*
3854      * frsqrt7(sNaN) = canonical NaN
3855      * frsqrt7(-inf) = canonical NaN
3856      * frsqrt7(-normal) = canonical NaN
3857      * frsqrt7(-subnormal) = canonical NaN
3858      */
3859     if (float16_is_signaling_nan(f, s) ||
3860         (float16_is_infinity(f) && sign) ||
3861         (float16_is_normal(f) && sign) ||
3862         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3863         s->float_exception_flags |= float_flag_invalid;
3864         return float16_default_nan(s);
3865     }
3866 
3867     /* frsqrt7(qNaN) = canonical NaN */
3868     if (float16_is_quiet_nan(f, s)) {
3869         return float16_default_nan(s);
3870     }
3871 
3872     /* frsqrt7(+-0) = +-inf */
3873     if (float16_is_zero(f)) {
3874         s->float_exception_flags |= float_flag_divbyzero;
3875         return float16_set_sign(float16_infinity, sign);
3876     }
3877 
3878     /* frsqrt7(+inf) = +0 */
3879     if (float16_is_infinity(f) && !sign) {
3880         return float16_set_sign(float16_zero, sign);
3881     }
3882 
3883     /* +normal, +subnormal */
3884     uint64_t val = frsqrt7(f, exp_size, frac_size);
3885     return make_float16(val);
3886 }
3887 
3888 static float32 frsqrt7_s(float32 f, float_status *s)
3889 {
3890     int exp_size = 8, frac_size = 23;
3891     bool sign = float32_is_neg(f);
3892 
3893     /*
3894      * frsqrt7(sNaN) = canonical NaN
3895      * frsqrt7(-inf) = canonical NaN
3896      * frsqrt7(-normal) = canonical NaN
3897      * frsqrt7(-subnormal) = canonical NaN
3898      */
3899     if (float32_is_signaling_nan(f, s) ||
3900         (float32_is_infinity(f) && sign) ||
3901         (float32_is_normal(f) && sign) ||
3902         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3903         s->float_exception_flags |= float_flag_invalid;
3904         return float32_default_nan(s);
3905     }
3906 
3907     /* frsqrt7(qNaN) = canonical NaN */
3908     if (float32_is_quiet_nan(f, s)) {
3909         return float32_default_nan(s);
3910     }
3911 
3912     /* frsqrt7(+-0) = +-inf */
3913     if (float32_is_zero(f)) {
3914         s->float_exception_flags |= float_flag_divbyzero;
3915         return float32_set_sign(float32_infinity, sign);
3916     }
3917 
3918     /* frsqrt7(+inf) = +0 */
3919     if (float32_is_infinity(f) && !sign) {
3920         return float32_set_sign(float32_zero, sign);
3921     }
3922 
3923     /* +normal, +subnormal */
3924     uint64_t val = frsqrt7(f, exp_size, frac_size);
3925     return make_float32(val);
3926 }
3927 
3928 static float64 frsqrt7_d(float64 f, float_status *s)
3929 {
3930     int exp_size = 11, frac_size = 52;
3931     bool sign = float64_is_neg(f);
3932 
3933     /*
3934      * frsqrt7(sNaN) = canonical NaN
3935      * frsqrt7(-inf) = canonical NaN
3936      * frsqrt7(-normal) = canonical NaN
3937      * frsqrt7(-subnormal) = canonical NaN
3938      */
3939     if (float64_is_signaling_nan(f, s) ||
3940         (float64_is_infinity(f) && sign) ||
3941         (float64_is_normal(f) && sign) ||
3942         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3943         s->float_exception_flags |= float_flag_invalid;
3944         return float64_default_nan(s);
3945     }
3946 
3947     /* frsqrt7(qNaN) = canonical NaN */
3948     if (float64_is_quiet_nan(f, s)) {
3949         return float64_default_nan(s);
3950     }
3951 
3952     /* frsqrt7(+-0) = +-inf */
3953     if (float64_is_zero(f)) {
3954         s->float_exception_flags |= float_flag_divbyzero;
3955         return float64_set_sign(float64_infinity, sign);
3956     }
3957 
3958     /* frsqrt7(+inf) = +0 */
3959     if (float64_is_infinity(f) && !sign) {
3960         return float64_set_sign(float64_zero, sign);
3961     }
3962 
3963     /* +normal, +subnormal */
3964     uint64_t val = frsqrt7(f, exp_size, frac_size);
3965     return make_float64(val);
3966 }
3967 
3968 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3969 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3970 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3971 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3972 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3973 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3974 
3975 /*
3976  * Vector Floating-Point Reciprocal Estimate Instruction
3977  *
3978  * Adapted from riscv-v-spec recip.c:
3979  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3980  */
3981 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3982                       float_status *s)
3983 {
3984     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3985     uint64_t exp = extract64(f, frac_size, exp_size);
3986     uint64_t frac = extract64(f, 0, frac_size);
3987 
3988     const uint8_t lookup_table[] = {
3989         127, 125, 123, 121, 119, 117, 116, 114,
3990         112, 110, 109, 107, 105, 104, 102, 100,
3991         99, 97, 96, 94, 93, 91, 90, 88,
3992         87, 85, 84, 83, 81, 80, 79, 77,
3993         76, 75, 74, 72, 71, 70, 69, 68,
3994         66, 65, 64, 63, 62, 61, 60, 59,
3995         58, 57, 56, 55, 54, 53, 52, 51,
3996         50, 49, 48, 47, 46, 45, 44, 43,
3997         42, 41, 40, 40, 39, 38, 37, 36,
3998         35, 35, 34, 33, 32, 31, 31, 30,
3999         29, 28, 28, 27, 26, 25, 25, 24,
4000         23, 23, 22, 21, 21, 20, 19, 19,
4001         18, 17, 17, 16, 15, 15, 14, 14,
4002         13, 12, 12, 11, 11, 10, 9, 9,
4003         8, 8, 7, 7, 6, 5, 5, 4,
4004         4, 3, 3, 2, 2, 1, 1, 0
4005     };
4006     const int precision = 7;
4007 
4008     if (exp == 0 && frac != 0) { /* subnormal */
4009         /* Normalize the subnormal. */
4010         while (extract64(frac, frac_size - 1, 1) == 0) {
4011             exp--;
4012             frac <<= 1;
4013         }
4014 
4015         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
4016 
4017         if (exp != 0 && exp != UINT64_MAX) {
4018             /*
4019              * Overflow to inf or max value of same sign,
4020              * depending on sign and rounding mode.
4021              */
4022             s->float_exception_flags |= (float_flag_inexact |
4023                                          float_flag_overflow);
4024 
4025             if ((s->float_rounding_mode == float_round_to_zero) ||
4026                 ((s->float_rounding_mode == float_round_down) && !sign) ||
4027                 ((s->float_rounding_mode == float_round_up) && sign)) {
4028                 /* Return the largest finite value of the same sign. */
4029                 return (sign << (exp_size + frac_size)) |
4030                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4031             } else {
4032                 /* Return +-inf. */
4033                 return (sign << (exp_size + frac_size)) |
4034                        MAKE_64BIT_MASK(frac_size, exp_size);
4035             }
4036         }
4037     }
4038 
4039     int idx = frac >> (frac_size - precision);
4040     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4041                         (frac_size - precision);
4042     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4043 
4044     if (out_exp == 0 || out_exp == UINT64_MAX) {
4045         /*
4046          * The result is subnormal, but don't raise the underflow exception,
4047          * because there's no additional loss of precision.
4048          */
4049         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4050         if (out_exp == UINT64_MAX) {
4051             out_frac >>= 1;
4052             out_exp = 0;
4053         }
4054     }
4055 
4056     uint64_t val = 0;
4057     val = deposit64(val, 0, frac_size, out_frac);
4058     val = deposit64(val, frac_size, exp_size, out_exp);
4059     val = deposit64(val, frac_size + exp_size, 1, sign);
4060     return val;
4061 }
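/*
 * The table index is the top 7 fraction bits and the output exponent is
 * the negated biased exponent: out_exp = 2 * bias - 1 - exp.
 * Worked example (float32): for f = 3.0f, exp = 128 and the fraction MSB
 * is set, so idx = 64, lookup_table[64] = 42 and out_exp = 125, giving
 * (1 + 42/128) * 2^-2 = 0.33203125, i.e. 1/3 to within 2^-7.
 */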
4062 
4063 static float16 frec7_h(float16 f, float_status *s)
4064 {
4065     int exp_size = 5, frac_size = 10;
4066     bool sign = float16_is_neg(f);
4067 
4068     /* frec7(+-inf) = +-0 */
4069     if (float16_is_infinity(f)) {
4070         return float16_set_sign(float16_zero, sign);
4071     }
4072 
4073     /* frec7(+-0) = +-inf */
4074     if (float16_is_zero(f)) {
4075         s->float_exception_flags |= float_flag_divbyzero;
4076         return float16_set_sign(float16_infinity, sign);
4077     }
4078 
4079     /* frec7(sNaN) = canonical NaN */
4080     if (float16_is_signaling_nan(f, s)) {
4081         s->float_exception_flags |= float_flag_invalid;
4082         return float16_default_nan(s);
4083     }
4084 
4085     /* frec7(qNaN) = canonical NaN */
4086     if (float16_is_quiet_nan(f, s)) {
4087         return float16_default_nan(s);
4088     }
4089 
4090     /* +-normal, +-subnormal */
4091     uint64_t val = frec7(f, exp_size, frac_size, s);
4092     return make_float16(val);
4093 }
4094 
4095 static float32 frec7_s(float32 f, float_status *s)
4096 {
4097     int exp_size = 8, frac_size = 23;
4098     bool sign = float32_is_neg(f);
4099 
4100     /* frec7(+-inf) = +-0 */
4101     if (float32_is_infinity(f)) {
4102         return float32_set_sign(float32_zero, sign);
4103     }
4104 
4105     /* frec7(+-0) = +-inf */
4106     if (float32_is_zero(f)) {
4107         s->float_exception_flags |= float_flag_divbyzero;
4108         return float32_set_sign(float32_infinity, sign);
4109     }
4110 
4111     /* frec7(sNaN) = canonical NaN */
4112     if (float32_is_signaling_nan(f, s)) {
4113         s->float_exception_flags |= float_flag_invalid;
4114         return float32_default_nan(s);
4115     }
4116 
4117     /* frec7(qNaN) = canonical NaN */
4118     if (float32_is_quiet_nan(f, s)) {
4119         return float32_default_nan(s);
4120     }
4121 
4122     /* +-normal, +-subnormal */
4123     uint64_t val = frec7(f, exp_size, frac_size, s);
4124     return make_float32(val);
4125 }
4126 
4127 static float64 frec7_d(float64 f, float_status *s)
4128 {
4129     int exp_size = 11, frac_size = 52;
4130     bool sign = float64_is_neg(f);
4131 
4132     /* frec7(+-inf) = +-0 */
4133     if (float64_is_infinity(f)) {
4134         return float64_set_sign(float64_zero, sign);
4135     }
4136 
4137     /* frec7(+-0) = +-inf */
4138     if (float64_is_zero(f)) {
4139         s->float_exception_flags |= float_flag_divbyzero;
4140         return float64_set_sign(float64_infinity, sign);
4141     }
4142 
4143     /* frec7(sNaN) = canonical NaN */
4144     if (float64_is_signaling_nan(f, s)) {
4145         s->float_exception_flags |= float_flag_invalid;
4146         return float64_default_nan(s);
4147     }
4148 
4149     /* frec7(qNaN) = canonical NaN */
4150     if (float64_is_quiet_nan(f, s)) {
4151         return float64_default_nan(s);
4152     }
4153 
4154     /* +-normal, +-subnormal */
4155     uint64_t val = frec7(f, exp_size, frac_size, s);
4156     return make_float64(val);
4157 }
4158 
4159 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4160 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4161 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4162 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4163 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4164 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4165 
4166 /* Vector Floating-Point MIN/MAX Instructions */
4167 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4168 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4169 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4170 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4171 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4172 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4173 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4174 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4175 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4176 GEN_VEXT_VF(vfmin_vf_h, 2)
4177 GEN_VEXT_VF(vfmin_vf_w, 4)
4178 GEN_VEXT_VF(vfmin_vf_d, 8)
4179 
4180 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4181 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4182 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4183 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4184 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4185 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4186 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4187 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4188 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4189 GEN_VEXT_VF(vfmax_vf_h, 2)
4190 GEN_VEXT_VF(vfmax_vf_w, 4)
4191 GEN_VEXT_VF(vfmax_vf_d, 8)
4192 
4193 /* Vector Floating-Point Sign-Injection Instructions */
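/*
 * deposit64(x, 0, n, a) keeps bits [n-1:0] from a and takes the remaining
 * high bit, the sign bit, from x.  The helpers below therefore return the
 * magnitude of a combined with sign(b) (fsgnj), sign(~b) (fsgnjn) or
 * sign(a ^ b) (fsgnjx).
 */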
4194 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4195 {
4196     return deposit64(b, 0, 15, a);
4197 }
4198 
4199 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4200 {
4201     return deposit64(b, 0, 31, a);
4202 }
4203 
4204 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4205 {
4206     return deposit64(b, 0, 63, a);
4207 }
4208 
4209 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4210 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4211 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4212 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4213 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4214 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4215 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4216 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4217 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4218 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4219 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4220 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4221 
4222 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4223 {
4224     return deposit64(~b, 0, 15, a);
4225 }
4226 
4227 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4228 {
4229     return deposit64(~b, 0, 31, a);
4230 }
4231 
4232 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4233 {
4234     return deposit64(~b, 0, 63, a);
4235 }
4236 
4237 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4238 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4239 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4240 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4241 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4242 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4243 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4244 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4245 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4246 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4247 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4248 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4249 
4250 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4251 {
4252     return deposit64(b ^ a, 0, 15, a);
4253 }
4254 
4255 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4256 {
4257     return deposit64(b ^ a, 0, 31, a);
4258 }
4259 
4260 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4261 {
4262     return deposit64(b ^ a, 0, 63, a);
4263 }
4264 
4265 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4266 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4267 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4268 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4269 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4270 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4271 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4272 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4273 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4274 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4275 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4276 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4277 
4278 /* Vector Floating-Point Compare Instructions */
4279 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4280 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4281                   CPURISCVState *env, uint32_t desc)          \
4282 {                                                             \
4283     uint32_t vm = vext_vm(desc);                              \
4284     uint32_t vl = env->vl;                                    \
4285     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4286     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4287     uint32_t vma = vext_vma(desc);                            \
4288     uint32_t i;                                               \
4289                                                               \
4290     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4291                                                               \
4292     for (i = env->vstart; i < vl; i++) {                      \
4293         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4294         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4295         if (!vm && !vext_elem_mask(v0, i)) {                  \
4296             /* set masked-off elements to 1s */               \
4297             if (vma) {                                        \
4298                 vext_set_elem_mask(vd, i, 1);                 \
4299             }                                                 \
4300             continue;                                         \
4301         }                                                     \
4302         vext_set_elem_mask(vd, i,                             \
4303                            DO_OP(s2, s1, &env->fp_status));   \
4304     }                                                         \
4305     env->vstart = 0;                                          \
4306     /*
4307      * mask destination registers are always tail-agnostic
4308      * set tail elements to 1s
4309      */                                                       \
4310     if (vta_all_1s) {                                         \
4311         for (; i < total_elems; i++) {                        \
4312             vext_set_elem_mask(vd, i, 1);                     \
4313         }                                                     \
4314     }                                                         \
4315 }
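/*
 * Compare results are written to a mask register, one bit per element.
 * Masked-off bits are written as 1s only when the instruction is
 * mask-agnostic (vma); tail bits (up to VLEN) are forced to 1s only for
 * the tail-agnostic all-1s layout (vta_all_1s).
 */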
4316 
4317 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4318 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4319 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4320 
4321 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4322 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4323                   CPURISCVState *env, uint32_t desc)                \
4324 {                                                                   \
4325     uint32_t vm = vext_vm(desc);                                    \
4326     uint32_t vl = env->vl;                                          \
4327     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4328     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4329     uint32_t vma = vext_vma(desc);                                  \
4330     uint32_t i;                                                     \
4331                                                                     \
4332     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4333                                                                     \
4334     for (i = env->vstart; i < vl; i++) {                            \
4335         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4336         if (!vm && !vext_elem_mask(v0, i)) {                        \
4337             /* set masked-off elements to 1s */                     \
4338             if (vma) {                                              \
4339                 vext_set_elem_mask(vd, i, 1);                       \
4340             }                                                       \
4341             continue;                                               \
4342         }                                                           \
4343         vext_set_elem_mask(vd, i,                                   \
4344                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4345     }                                                               \
4346     env->vstart = 0;                                                \
4347     /*
4348      * mask destination registers are always tail-agnostic
4349      * set tail elements to 1s
4350      */                                                             \
4351     if (vta_all_1s) {                                               \
4352         for (; i < total_elems; i++) {                              \
4353             vext_set_elem_mask(vd, i, 1);                           \
4354         }                                                           \
4355     }                                                               \
4356 }
4357 
4358 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4359 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4360 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4361 
4362 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4363 {
4364     FloatRelation compare = float16_compare_quiet(a, b, s);
4365     return compare != float_relation_equal;
4366 }
4367 
4368 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4369 {
4370     FloatRelation compare = float32_compare_quiet(a, b, s);
4371     return compare != float_relation_equal;
4372 }
4373 
4374 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4375 {
4376     FloatRelation compare = float64_compare_quiet(a, b, s);
4377     return compare != float_relation_equal;
4378 }
4379 
4380 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4381 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4382 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4383 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4384 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4385 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4386 
4387 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4388 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4389 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4390 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4391 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4392 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4393 
4394 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4395 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4396 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4397 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4398 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4399 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
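/*
 * NaN handling: vmfeq/vmfne use the quiet comparisons (*_eq_quiet,
 * *_compare_quiet), which only raise invalid for signaling NaNs, whereas
 * vmflt/vmfle above and vmfgt/vmfge below use the signaling variants
 * (*_lt, *_le, *_compare) that raise invalid for any NaN operand, as
 * IEEE 754 requires for ordered comparisons.
 */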
4400 
4401 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4402 {
4403     FloatRelation compare = float16_compare(a, b, s);
4404     return compare == float_relation_greater;
4405 }
4406 
4407 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4408 {
4409     FloatRelation compare = float32_compare(a, b, s);
4410     return compare == float_relation_greater;
4411 }
4412 
4413 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4414 {
4415     FloatRelation compare = float64_compare(a, b, s);
4416     return compare == float_relation_greater;
4417 }
4418 
4419 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4420 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4421 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4422 
4423 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4424 {
4425     FloatRelation compare = float16_compare(a, b, s);
4426     return compare == float_relation_greater ||
4427            compare == float_relation_equal;
4428 }
4429 
4430 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4431 {
4432     FloatRelation compare = float32_compare(a, b, s);
4433     return compare == float_relation_greater ||
4434            compare == float_relation_equal;
4435 }
4436 
4437 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4438 {
4439     FloatRelation compare = float64_compare(a, b, s);
4440     return compare == float_relation_greater ||
4441            compare == float_relation_equal;
4442 }
4443 
4444 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4445 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4446 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4447 
4448 /* Vector Floating-Point Classify Instruction */
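/*
 * fclass result bits (same encoding as the scalar fclass instruction):
 *   bit 0: -inf         bit 5: +subnormal
 *   bit 1: -normal      bit 6: +normal
 *   bit 2: -subnormal   bit 7: +inf
 *   bit 3: -0           bit 8: signaling NaN
 *   bit 4: +0           bit 9: quiet NaN
 */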
4449 target_ulong fclass_h(uint64_t frs1)
4450 {
4451     float16 f = frs1;
4452     bool sign = float16_is_neg(f);
4453 
4454     if (float16_is_infinity(f)) {
4455         return sign ? 1 << 0 : 1 << 7;
4456     } else if (float16_is_zero(f)) {
4457         return sign ? 1 << 3 : 1 << 4;
4458     } else if (float16_is_zero_or_denormal(f)) {
4459         return sign ? 1 << 2 : 1 << 5;
4460     } else if (float16_is_any_nan(f)) {
4461         float_status s = { }; /* for snan_bit_is_one */
4462         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4463     } else {
4464         return sign ? 1 << 1 : 1 << 6;
4465     }
4466 }
4467 
4468 target_ulong fclass_s(uint64_t frs1)
4469 {
4470     float32 f = frs1;
4471     bool sign = float32_is_neg(f);
4472 
4473     if (float32_is_infinity(f)) {
4474         return sign ? 1 << 0 : 1 << 7;
4475     } else if (float32_is_zero(f)) {
4476         return sign ? 1 << 3 : 1 << 4;
4477     } else if (float32_is_zero_or_denormal(f)) {
4478         return sign ? 1 << 2 : 1 << 5;
4479     } else if (float32_is_any_nan(f)) {
4480         float_status s = { }; /* for snan_bit_is_one */
4481         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4482     } else {
4483         return sign ? 1 << 1 : 1 << 6;
4484     }
4485 }
4486 
4487 target_ulong fclass_d(uint64_t frs1)
4488 {
4489     float64 f = frs1;
4490     bool sign = float64_is_neg(f);
4491 
4492     if (float64_is_infinity(f)) {
4493         return sign ? 1 << 0 : 1 << 7;
4494     } else if (float64_is_zero(f)) {
4495         return sign ? 1 << 3 : 1 << 4;
4496     } else if (float64_is_zero_or_denormal(f)) {
4497         return sign ? 1 << 2 : 1 << 5;
4498     } else if (float64_is_any_nan(f)) {
4499         float_status s = { }; /* for snan_bit_is_one */
4500         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4501     } else {
4502         return sign ? 1 << 1 : 1 << 6;
4503     }
4504 }
4505 
4506 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4507 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4508 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4509 GEN_VEXT_V(vfclass_v_h, 2)
4510 GEN_VEXT_V(vfclass_v_w, 4)
4511 GEN_VEXT_V(vfclass_v_d, 8)
4512 
4513 /* Vector Floating-Point Merge Instruction */
4514 
4515 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4516 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4517                   CPURISCVState *env, uint32_t desc)          \
4518 {                                                             \
4519     uint32_t vm = vext_vm(desc);                              \
4520     uint32_t vl = env->vl;                                    \
4521     uint32_t esz = sizeof(ETYPE);                             \
4522     uint32_t total_elems =                                    \
4523         vext_get_total_elems(env, desc, esz);                 \
4524     uint32_t vta = vext_vta(desc);                            \
4525     uint32_t i;                                               \
4526                                                               \
4527     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4528                                                               \
4529     for (i = env->vstart; i < vl; i++) {                      \
4530         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4531         *((ETYPE *)vd + H(i)) =                               \
4532             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4533     }                                                         \
4534     env->vstart = 0;                                          \
4535     /* set tail elements to 1s */                             \
4536     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4537 }
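/*
 * vfmerge.vfm semantics: vd[i] = v0.mask[i] ? f[rs1] : vs2[i].  The loop
 * above keeps the old vs2 element for masked-off positions and writes the
 * scalar s1 for active ones; with vm set, every element receives s1.
 */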
4538 
4539 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4540 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4541 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4542 
4543 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4544 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4545 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4546 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4547 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4548 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4549 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4550 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4551 
4552 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4553 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4554 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4555 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4556 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4557 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4558 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4559 
4560 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4561 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4562 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4563 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4564 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4565 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4566 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4567 
4568 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4569 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4570 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4571 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4572 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4573 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4574 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4575 
4576 /* Widening Floating-Point/Integer Type-Convert Instructions */
4577 /* (TD, T2, TX2) */
4578 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4579 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4580 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4581 /*
4582  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4583  */
4584 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4585 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4586 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4587 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4588 
4589 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4590 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4591 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4592 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4593 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4594 
4595 /*
4596  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4597  */
4598 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4599 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4600 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4601 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4602 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4603 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4604 
4605 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4606 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4607 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4608 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4609 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4610 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4611 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4612 
4613 /*
4614  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4615  */
4616 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4617 {
4618     return float16_to_float32(a, true, s);
4619 }
4620 
4621 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4622 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4623 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4624 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4625 
4626 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4627 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4628 
4629 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4630 /* (TD, T2, TX2) */
4631 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4632 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4633 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4634 /* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4635 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4636 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4637 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4638 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4639 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4640 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4641 
4642 /* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4643 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4644 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4645 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4646 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4647 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4648 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4649 
4650 /*
4651  * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4652  */
4653 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4654 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4655 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4656 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4657 
4658 /* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4659 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4660 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4661 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4662 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4663 
4664 /* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4665 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4666 {
4667     return float32_to_float16(a, true, s);
4668 }
4669 
4670 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4671 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4672 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4673 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4674 
4675 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4676 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4677 
4678 /*
4679  * Vector Reduction Operations
4680  */
4681 /* Vector Single-Width Integer Reduction Instructions */
4682 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4683 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4684                   void *vs2, CPURISCVState *env,          \
4685                   uint32_t desc)                          \
4686 {                                                         \
4687     uint32_t vm = vext_vm(desc);                          \
4688     uint32_t vl = env->vl;                                \
4689     uint32_t esz = sizeof(TD);                            \
4690     uint32_t vlenb = simd_maxsz(desc);                    \
4691     uint32_t vta = vext_vta(desc);                        \
4692     uint32_t i;                                           \
4693     TD s1 =  *((TD *)vs1 + HD(0));                        \
4694                                                           \
4695     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4696                                                           \
4697     for (i = env->vstart; i < vl; i++) {                  \
4698         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4699         if (!vm && !vext_elem_mask(v0, i)) {              \
4700             continue;                                     \
4701         }                                                 \
4702         s1 = OP(s1, (TD)s2);                              \
4703     }                                                     \
4704     if (vl > 0) {                                         \
4705         *((TD *)vd + HD(0)) = s1;                         \
4706     }                                                     \
4707     env->vstart = 0;                                      \
4708     /* set tail elements to 1s */                         \
4709     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4710 }
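/*
 * Reduction pattern: the accumulator starts from vs1[0], folds in each
 * active element of vs2 in element order, and the scalar result is written
 * to vd[0] only when vl > 0 (with vl == 0 the destination is untouched).
 * Elements of vd past element 0 are the tail and are set to 1s when the
 * operation is tail-agnostic.
 */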
4711 
4712 /* vd[0] = sum(vs1[0], vs2[*]) */
4713 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4714 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4715 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4716 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4717 
4718 /* vd[0] = maxu(vs1[0], vs2[*]) */
4719 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4720 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4721 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4722 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4723 
4724 /* vd[0] = max(vs1[0], vs2[*]) */
4725 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4726 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4727 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4728 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4729 
4730 /* vd[0] = minu(vs1[0], vs2[*]) */
4731 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4732 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4733 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4734 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4735 
4736 /* vd[0] = min(vs1[0], vs2[*]) */
4737 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4738 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4739 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4740 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4741 
4742 /* vd[0] = and(vs1[0], vs2[*]) */
4743 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4744 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4745 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4746 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4747 
4748 /* vd[0] = or(vs1[0], vs2[*]) */
4749 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4750 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4751 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4752 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4753 
4754 /* vd[0] = xor(vs1[0], vs2[*]) */
4755 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4756 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4757 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4758 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4759 
4760 /* Vector Widening Integer Reduction Instructions */
4761 /* signed sum reduction into double-width accumulator */
4762 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4763 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4764 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4765 
4766 /* Unsigned sum reduction into double-width accumulator */
4767 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4768 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4769 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4770 
4771 /* Vector Single-Width Floating-Point Reduction Instructions */
4772 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4773 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4774                   void *vs2, CPURISCVState *env,           \
4775                   uint32_t desc)                           \
4776 {                                                          \
4777     uint32_t vm = vext_vm(desc);                           \
4778     uint32_t vl = env->vl;                                 \
4779     uint32_t esz = sizeof(TD);                             \
4780     uint32_t vlenb = simd_maxsz(desc);                     \
4781     uint32_t vta = vext_vta(desc);                         \
4782     uint32_t i;                                            \
4783     TD s1 =  *((TD *)vs1 + HD(0));                         \
4784                                                            \
4785     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4786                                                            \
4787     for (i = env->vstart; i < vl; i++) {                   \
4788         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4789         if (!vm && !vext_elem_mask(v0, i)) {               \
4790             continue;                                      \
4791         }                                                  \
4792         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4793     }                                                      \
4794     if (vl > 0) {                                          \
4795         *((TD *)vd + HD(0)) = s1;                          \
4796     }                                                      \
4797     env->vstart = 0;                                       \
4798     /* set tail elements to 1s */                          \
4799     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4800 }
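/*
 * Same structure as GEN_VEXT_RED, but OP also takes the float_status so
 * the rounding mode and exception flags are honoured.  The ordered and
 * unordered sums below share this helper: accumulating strictly in element
 * order is one of the orderings the unordered form permits.
 */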
4801 
4802 /* Unordered sum */
4803 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4804 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4805 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4806 
4807 /* Ordered sum */
4808 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4809 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4810 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4811 
4812 /* Maximum value */
4813 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4814               float16_maximum_number)
4815 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4816               float32_maximum_number)
4817 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4818               float64_maximum_number)
4819 
4820 /* Minimum value */
4821 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4822               float16_minimum_number)
4823 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4824               float32_minimum_number)
4825 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4826               float64_minimum_number)
4827 
4828 /* Vector Widening Floating-Point Add Instructions */
4829 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4830 {
4831     return float32_add(a, float16_to_float32(b, true, s), s);
4832 }
4833 
4834 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4835 {
4836     return float64_add(a, float32_to_float64(b, s), s);
4837 }
4838 
4839 /* Vector Widening Floating-Point Reduction Instructions */
4840 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4841 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4842 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4843 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4844 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4845 
4846 /*
4847  * Vector Mask Operations
4848  */
4849 /* Vector Mask-Register Logical Instructions */
4850 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4852                   void *vs2, CPURISCVState *env,          \
4853                   uint32_t desc)                          \
4854 {                                                         \
4855     uint32_t vl = env->vl;                                \
4856     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4857     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4858     uint32_t i;                                           \
4859     int a, b;                                             \
4860                                                           \
4861     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4862                                                           \
4863     for (i = env->vstart; i < vl; i++) {                  \
4864         a = vext_elem_mask(vs1, i);                       \
4865         b = vext_elem_mask(vs2, i);                       \
4866         vext_set_elem_mask(vd, i, OP(b, a));              \
4867     }                                                     \
4868     env->vstart = 0;                                      \
4869     /*
4870      * mask destination registers are always tail-agnostic
4871      * set tail elements to 1s
4872      */                                                   \
4873     if (vta_all_1s) {                                     \
4874         for (; i < total_elems; i++) {                    \
4875             vext_set_elem_mask(vd, i, 1);                 \
4876         }                                                 \
4877     }                                                     \
4878 }
4879 
4880 #define DO_NAND(N, M)  (!(N & M))
4881 #define DO_ANDNOT(N, M)  (N & !M)
4882 #define DO_NOR(N, M)  (!(N | M))
4883 #define DO_ORNOT(N, M)  (N | !M)
4884 #define DO_XNOR(N, M)  (!(N ^ M))
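/*
 * The operands are single mask bits (0 or 1) as returned by
 * vext_elem_mask(), so logical '!' is a valid complement here and every
 * macro again yields 0 or 1 for vext_set_elem_mask().
 */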
4885 
4886 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4887 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4888 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4889 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4890 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4891 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4892 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4893 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4894 
4895 /* Vector count population in mask (vcpop.m) */
4896 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4897                              uint32_t desc)
4898 {
4899     target_ulong cnt = 0;
4900     uint32_t vm = vext_vm(desc);
4901     uint32_t vl = env->vl;
4902     int i;
4903 
4904     for (i = env->vstart; i < vl; i++) {
4905         if (vm || vext_elem_mask(v0, i)) {
4906             if (vext_elem_mask(vs2, i)) {
4907                 cnt++;
4908             }
4909         }
4910     }
4911     env->vstart = 0;
4912     return cnt;
4913 }
4914 
4915 /* vfirst find-first-set mask bit */
4916 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4917                               uint32_t desc)
4918 {
4919     uint32_t vm = vext_vm(desc);
4920     uint32_t vl = env->vl;
4921     int i;
4922 
4923     for (i = env->vstart; i < vl; i++) {
4924         if (vm || vext_elem_mask(v0, i)) {
4925             if (vext_elem_mask(vs2, i)) {
4926                 return i;
4927             }
4928         }
4929     }
4930     env->vstart = 0;
4931     return -1LL;
4932 }
4933 
4934 enum set_mask_type {
4935     ONLY_FIRST = 1,
4936     INCLUDE_FIRST,
4937     BEFORE_FIRST,
4938 };
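/*
 * vmsetm() implements the three "set mask" variants:
 *   BEFORE_FIRST  -> vmsbf.m: 1s strictly before the first set bit
 *   INCLUDE_FIRST -> vmsif.m: 1s up to and including the first set bit
 *   ONLY_FIRST    -> vmsof.m: a single 1 at the first set bit
 * Active elements after the first set bit are always written as 0.
 */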
4939 
4940 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4941                    uint32_t desc, enum set_mask_type type)
4942 {
4943     uint32_t vm = vext_vm(desc);
4944     uint32_t vl = env->vl;
4945     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4946     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4947     uint32_t vma = vext_vma(desc);
4948     int i;
4949     bool first_mask_bit = false;
4950 
4951     VSTART_CHECK_EARLY_EXIT(env, vl);
4952 
4953     for (i = env->vstart; i < vl; i++) {
4954         if (!vm && !vext_elem_mask(v0, i)) {
4955             /* set masked-off elements to 1s */
4956             if (vma) {
4957                 vext_set_elem_mask(vd, i, 1);
4958             }
4959             continue;
4960         }
4961         /* write a zero to all following active elements */
4962         if (first_mask_bit) {
4963             vext_set_elem_mask(vd, i, 0);
4964             continue;
4965         }
4966         if (vext_elem_mask(vs2, i)) {
4967             first_mask_bit = true;
4968             if (type == BEFORE_FIRST) {
4969                 vext_set_elem_mask(vd, i, 0);
4970             } else {
4971                 vext_set_elem_mask(vd, i, 1);
4972             }
4973         } else {
4974             if (type == ONLY_FIRST) {
4975                 vext_set_elem_mask(vd, i, 0);
4976             } else {
4977                 vext_set_elem_mask(vd, i, 1);
4978             }
4979         }
4980     }
4981     env->vstart = 0;
4982     /*
4983      * mask destination registers are always tail-agnostic
4984      * set tail elements to 1s
4985      */
4986     if (vta_all_1s) {
4987         for (; i < total_elems; i++) {
4988             vext_set_elem_mask(vd, i, 1);
4989         }
4990     }
4991 }
4992 
4993 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4994                      uint32_t desc)
4995 {
4996     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4997 }
4998 
4999 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
5000                      uint32_t desc)
5001 {
5002     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
5003 }
5004 
5005 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
5006                      uint32_t desc)
5007 {
5008     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
5009 }
5010 
5011 /* Vector Iota Instruction */
5012 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
5013 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
5014                   uint32_t desc)                                          \
5015 {                                                                         \
5016     uint32_t vm = vext_vm(desc);                                          \
5017     uint32_t vl = env->vl;                                                \
5018     uint32_t esz = sizeof(ETYPE);                                         \
5019     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5020     uint32_t vta = vext_vta(desc);                                        \
5021     uint32_t vma = vext_vma(desc);                                        \
5022     uint32_t sum = 0;                                                     \
5023     int i;                                                                \
5024                                                                           \
5025     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5026                                                                           \
5027     for (i = env->vstart; i < vl; i++) {                                  \
5028         if (!vm && !vext_elem_mask(v0, i)) {                              \
5029             /* set masked-off elements to 1s */                           \
5030             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5031             continue;                                                     \
5032         }                                                                 \
5033         *((ETYPE *)vd + H(i)) = sum;                                      \
5034         if (vext_elem_mask(vs2, i)) {                                     \
5035             sum++;                                                        \
5036         }                                                                 \
5037     }                                                                     \
5038     env->vstart = 0;                                                      \
5039     /* set tail elements to 1s */                                         \
5040     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5041 }
5042 
5043 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5044 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5045 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5046 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
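
/*
 * Illustrative scalar model of viota.m: element i of the destination is the
 * number of set mask bits among elements 0..i-1 of vs2 (an exclusive prefix
 * sum), i.e. the running "sum" kept by the macro above.  Ad hoc sketch only,
 * with one mask bit per byte for simplicity; ignores vl masking, vstart and
 * tail handling.
 */
static inline void example_viota(uint32_t *dst, const uint8_t *mask_bits,
                                 uint32_t n)
{
    uint32_t sum = 0;
    uint32_t i;

    for (i = 0; i < n; i++) {
        dst[i] = sum;            /* count of set bits strictly before i */
        sum += mask_bits[i] & 1; /* include element i for later elements */
    }
}

/* e.g. mask bits {1, 0, 1, 1, 0} -> dst {0, 1, 1, 2, 3} */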
5047 
5048 /* Vector Element Index Instruction */
5049 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5050 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5051 {                                                                         \
5052     uint32_t vm = vext_vm(desc);                                          \
5053     uint32_t vl = env->vl;                                                \
5054     uint32_t esz = sizeof(ETYPE);                                         \
5055     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5056     uint32_t vta = vext_vta(desc);                                        \
5057     uint32_t vma = vext_vma(desc);                                        \
5058     int i;                                                                \
5059                                                                           \
5060     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5061                                                                           \
5062     for (i = env->vstart; i < vl; i++) {                                  \
5063         if (!vm && !vext_elem_mask(v0, i)) {                              \
5064             /* set masked-off elements to 1s */                           \
5065             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5066             continue;                                                     \
5067         }                                                                 \
5068         *((ETYPE *)vd + H(i)) = i;                                        \
5069     }                                                                     \
5070     env->vstart = 0;                                                      \
5071     /* set tail elements to 1s */                                         \
5072     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5073 }
5074 
5075 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5076 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5077 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5078 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
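
/*
 * vid.v simply writes each element's own index: with vl = 4 and no masking,
 * vd becomes {0, 1, 2, 3} regardless of any source operand.
 */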
5079 
5080 /*
5081  * Vector Permutation Instructions
5082  */
5083 
5084 /* Vector Slide Instructions */
5085 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5086 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5087                   CPURISCVState *env, uint32_t desc)                      \
5088 {                                                                         \
5089     uint32_t vm = vext_vm(desc);                                          \
5090     uint32_t vl = env->vl;                                                \
5091     uint32_t esz = sizeof(ETYPE);                                         \
5092     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5093     uint32_t vta = vext_vta(desc);                                        \
5094     uint32_t vma = vext_vma(desc);                                        \
5095     target_ulong offset = s1, i_min, i;                                   \
5096                                                                           \
5097     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5098                                                                           \
5099     i_min = MAX(env->vstart, offset);                                     \
5100     for (i = i_min; i < vl; i++) {                                        \
5101         if (!vm && !vext_elem_mask(v0, i)) {                              \
5102             /* set masked-off elements to 1s */                           \
5103             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5104             continue;                                                     \
5105         }                                                                 \
5106         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5107     }                                                                     \
5108     env->vstart = 0;                                                      \
5109     /* set tail elements to 1s */                                         \
5110     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5111 }
5112 
5113 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5114 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5115 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5116 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5117 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
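
/*
 * Illustrative scalar model of vslideup.vx: destination elements below the
 * offset keep their previous value, and element i >= offset receives
 * vs2[i - offset].  Ad hoc sketch only (example_* is not a QEMU helper);
 * ignores the v0 mask, vstart and tail handling.
 */
static inline void example_vslideup(uint32_t *dst, const uint32_t *src,
                                    uint32_t offset, uint32_t vl)
{
    uint32_t i;

    for (i = offset; i < vl; i++) {
        dst[i] = src[i - offset];
    }
}

/* e.g. src {10, 20, 30, 40}, offset 2, vl 4 -> dst {.., .., 10, 20} */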
5118 
5119 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5120 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5121                   CPURISCVState *env, uint32_t desc)                      \
5122 {                                                                         \
5123     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5124     uint32_t vm = vext_vm(desc);                                          \
5125     uint32_t vl = env->vl;                                                \
5126     uint32_t esz = sizeof(ETYPE);                                         \
5127     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5128     uint32_t vta = vext_vta(desc);                                        \
5129     uint32_t vma = vext_vma(desc);                                        \
5130     target_ulong i_max, i_min, i;                                         \
5131                                                                           \
5132     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5133                                                                           \
5134     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5135     i_max = MAX(i_min, env->vstart);                                      \
5136     for (i = env->vstart; i < i_max; ++i) {                               \
5137         if (!vm && !vext_elem_mask(v0, i)) {                              \
5138             /* set masked-off elements to 1s */                           \
5139             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5140             continue;                                                     \
5141         }                                                                 \
5142         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5143     }                                                                     \
5144                                                                           \
5145     for (i = i_max; i < vl; ++i) {                                        \
5146         if (!vm && !vext_elem_mask(v0, i)) {                              \
5147             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5148             continue;                                                     \
5149         }                                                                 \
5150         *((ETYPE *)vd + H(i)) = 0;                                        \
5151     }                                                                     \
5152                                                                           \
5153     env->vstart = 0;                                                      \
5154     /* set tail elements to 1s */                                         \
5155     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5156 }
5157 
5158 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5159 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5160 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5161 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5162 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
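
/*
 * Illustrative scalar model of vslidedown.vx, matching the two loops above:
 * dst[i] = src[i + offset] while the source index stays below vlmax, and 0
 * once it runs past the end of the source group.  Ad hoc sketch only;
 * ignores the v0 mask, vstart and tail handling.
 */
static inline void example_vslidedown(uint32_t *dst, const uint32_t *src,
                                      uint32_t offset, uint32_t vl,
                                      uint32_t vlmax)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        dst[i] = (offset < vlmax && i < vlmax - offset) ? src[i + offset] : 0;
    }
}

/* e.g. src {10, 20, 30, 40}, offset 2, vl = vlmax = 4 -> dst {30, 40, 0, 0} */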
5163 
5164 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5165 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5166                                  void *vs2, CPURISCVState *env,             \
5167                                  uint32_t desc)                             \
5168 {                                                                           \
5169     typedef uint##BITWIDTH##_t ETYPE;                                       \
5170     uint32_t vm = vext_vm(desc);                                            \
5171     uint32_t vl = env->vl;                                                  \
5172     uint32_t esz = sizeof(ETYPE);                                           \
5173     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5174     uint32_t vta = vext_vta(desc);                                          \
5175     uint32_t vma = vext_vma(desc);                                          \
5176     uint32_t i;                                                             \
5177                                                                             \
5178     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5179                                                                             \
5180     for (i = env->vstart; i < vl; i++) {                                    \
5181         if (!vm && !vext_elem_mask(v0, i)) {                                \
5182             /* set masked-off elements to 1s */                             \
5183             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5184             continue;                                                       \
5185         }                                                                   \
5186         if (i == 0) {                                                       \
5187             *((ETYPE *)vd + H(i)) = s1;                                     \
5188         } else {                                                            \
5189             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5190         }                                                                   \
5191     }                                                                       \
5192     env->vstart = 0;                                                        \
5193     /* set tail elements to 1s */                                           \
5194     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5195 }
5196 
5197 GEN_VEXT_VSLIDE1UP(8,  H1)
5198 GEN_VEXT_VSLIDE1UP(16, H2)
5199 GEN_VEXT_VSLIDE1UP(32, H4)
5200 GEN_VEXT_VSLIDE1UP(64, H8)
5201 
5202 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5203 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5204                   CPURISCVState *env, uint32_t desc)              \
5205 {                                                                 \
5206     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5207 }
5208 
5209 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5210 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5211 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5212 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5213 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
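
/*
 * Worked example for vslide1up.vx: with src = {10, 20, 30, 40}, x[rs1] = 99
 * and vl = 4, the destination becomes {99, 10, 20, 30} (masking and tail
 * handling ignored).
 */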
5214 
5215 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5216 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5217                                    void *vs2, CPURISCVState *env,             \
5218                                    uint32_t desc)                             \
5219 {                                                                             \
5220     typedef uint##BITWIDTH##_t ETYPE;                                         \
5221     uint32_t vm = vext_vm(desc);                                              \
5222     uint32_t vl = env->vl;                                                    \
5223     uint32_t esz = sizeof(ETYPE);                                             \
5224     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5225     uint32_t vta = vext_vta(desc);                                            \
5226     uint32_t vma = vext_vma(desc);                                            \
5227     uint32_t i;                                                               \
5228                                                                               \
5229     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5230                                                                               \
5231     for (i = env->vstart; i < vl; i++) {                                      \
5232         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5233             /* set masked-off elements to 1s */                               \
5234             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5235             continue;                                                         \
5236         }                                                                     \
5237         if (i == vl - 1) {                                                    \
5238             *((ETYPE *)vd + H(i)) = s1;                                       \
5239         } else {                                                              \
5240             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5241         }                                                                     \
5242     }                                                                         \
5243     env->vstart = 0;                                                          \
5244     /* set tail elements to 1s */                                             \
5245     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5246 }
5247 
5248 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5249 GEN_VEXT_VSLIDE1DOWN(16, H2)
5250 GEN_VEXT_VSLIDE1DOWN(32, H4)
5251 GEN_VEXT_VSLIDE1DOWN(64, H8)
5252 
5253 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5254 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5255                   CPURISCVState *env, uint32_t desc)              \
5256 {                                                                 \
5257     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5258 }
5259 
5260 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5261 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5262 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5263 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5264 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
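
/*
 * Worked example for vslide1down.vx: with src = {10, 20, 30, 40}, x[rs1] = 99
 * and vl = 4, the destination becomes {20, 30, 40, 99} (masking and tail
 * handling ignored).
 */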
5265 
5266 /* Vector Floating-Point Slide Instructions */
5267 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5268 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5269                   CPURISCVState *env, uint32_t desc)          \
5270 {                                                             \
5271     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5272 }
5273 
5274 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5275 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5276 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5277 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5278 
5279 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5280 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5281                   CPURISCVState *env, uint32_t desc)          \
5282 {                                                             \
5283     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5284 }
5285 
5286 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5287 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5288 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5289 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5290 
5291 /* Vector Register Gather Instruction */
5292 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5293 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5294                   CPURISCVState *env, uint32_t desc)                      \
5295 {                                                                         \
5296     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5297     uint32_t vm = vext_vm(desc);                                          \
5298     uint32_t vl = env->vl;                                                \
5299     uint32_t esz = sizeof(TS2);                                           \
5300     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5301     uint32_t vta = vext_vta(desc);                                        \
5302     uint32_t vma = vext_vma(desc);                                        \
5303     uint64_t index;                                                       \
5304     uint32_t i;                                                           \
5305                                                                           \
5306     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5307                                                                           \
5308     for (i = env->vstart; i < vl; i++) {                                  \
5309         if (!vm && !vext_elem_mask(v0, i)) {                              \
5310             /* set masked-off elements to 1s */                           \
5311             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5312             continue;                                                     \
5313         }                                                                 \
5314         index = *((TS1 *)vs1 + HS1(i));                                   \
5315         if (index >= vlmax) {                                             \
5316             *((TS2 *)vd + HS2(i)) = 0;                                    \
5317         } else {                                                          \
5318             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5319         }                                                                 \
5320     }                                                                     \
5321     env->vstart = 0;                                                      \
5322     /* set tail elements to 1s */                                         \
5323     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5324 }
5325 
5326 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5327 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5328 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5329 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5330 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5331 
5332 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5333 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5334 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5335 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
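
/*
 * Illustrative scalar model of vrgather.vv: each destination element selects
 * a source element by the index held in vs1, and any index at or beyond
 * vlmax yields 0.  Ad hoc sketch only (example_* is not a QEMU helper);
 * ignores the v0 mask, vstart and tail handling.
 */
static inline void example_vrgather(uint32_t *dst, const uint32_t *idx,
                                    const uint32_t *src, uint32_t vl,
                                    uint32_t vlmax)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        dst[i] = (idx[i] < vlmax) ? src[idx[i]] : 0;
    }
}

/* e.g. idx {3, 0, 0, 7}, vlmax 4 -> dst {src[3], src[0], src[0], 0} */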
5336 
5337 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5338 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5339                   CPURISCVState *env, uint32_t desc)                      \
5340 {                                                                         \
5341     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5342     uint32_t vm = vext_vm(desc);                                          \
5343     uint32_t vl = env->vl;                                                \
5344     uint32_t esz = sizeof(ETYPE);                                         \
5345     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5346     uint32_t vta = vext_vta(desc);                                        \
5347     uint32_t vma = vext_vma(desc);                                        \
5348     uint64_t index = s1;                                                  \
5349     uint32_t i;                                                           \
5350                                                                           \
5351     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5352                                                                           \
5353     for (i = env->vstart; i < vl; i++) {                                  \
5354         if (!vm && !vext_elem_mask(v0, i)) {                              \
5355             /* set masked-off elements to 1s */                           \
5356             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5357             continue;                                                     \
5358         }                                                                 \
5359         if (index >= vlmax) {                                             \
5360             *((ETYPE *)vd + H(i)) = 0;                                    \
5361         } else {                                                          \
5362             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5363         }                                                                 \
5364     }                                                                     \
5365     env->vstart = 0;                                                      \
5366     /* set tail elements to 1s */                                         \
5367     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5368 }
5369 
5370 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5371 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5372 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5373 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5374 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5375 
5376 /* Vector Compress Instruction */
5377 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5378 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5379                   CPURISCVState *env, uint32_t desc)                      \
5380 {                                                                         \
5381     uint32_t vl = env->vl;                                                \
5382     uint32_t esz = sizeof(ETYPE);                                         \
5383     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5384     uint32_t vta = vext_vta(desc);                                        \
5385     uint32_t num = 0, i;                                                  \
5386                                                                           \
5387     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5388                                                                           \
5389     for (i = env->vstart; i < vl; i++) {                                  \
5390         if (!vext_elem_mask(vs1, i)) {                                    \
5391             continue;                                                     \
5392         }                                                                 \
5393         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5394         num++;                                                            \
5395     }                                                                     \
5396     env->vstart = 0;                                                      \
5397     /* set tail elements to 1s */                                         \
5398     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5399 }
5400 
5401 /* Compress into vd the elements of vs2 whose vs1 mask bit is set */
5402 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5403 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5404 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5405 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
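
/*
 * Illustrative scalar model of vcompress.vm: the elements of vs2 whose vs1
 * mask bit is set are packed contiguously from the bottom of vd, and the
 * return value is how many were written.  Ad hoc sketch only, with one mask
 * bit per byte for simplicity; ignores vstart and tail handling.
 */
static inline uint32_t example_vcompress(uint32_t *dst, const uint32_t *src,
                                         const uint8_t *mask_bits, uint32_t vl)
{
    uint32_t num = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (mask_bits[i] & 1) {
            dst[num++] = src[i];
        }
    }
    return num;
}

/* e.g. mask {0, 1, 1, 0, 1} over src {a, b, c, d, e} -> dst {b, c, e}, num 3 */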
5406 
5407 /* Vector Whole Register Move */
5408 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5409 {
5410     /* EEW = SEW */
5411     uint32_t maxsz = simd_maxsz(desc);
5412     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5413     uint32_t startb = env->vstart * sewb;
5414     uint32_t i = startb;
5415 
5416     if (startb >= maxsz) {
5417         env->vstart = 0;
5418         return;
5419     }
5420 
5421     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5422         uint32_t j = ROUND_UP(i, 8);
5423         memcpy((uint8_t *)vd + H1(j - 1),
5424                (uint8_t *)vs2 + H1(j - 1),
5425                j - i);
5426         i = j;
5427     }
5428 
5429     memcpy((uint8_t *)vd + H1(i),
5430            (uint8_t *)vs2 + H1(i),
5431            maxsz - i);
5432 
5433     env->vstart = 0;
5434 }
5435 
5436 /* Vector Integer Extension */
5437 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5438 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5439                   CPURISCVState *env, uint32_t desc)             \
5440 {                                                                \
5441     uint32_t vl = env->vl;                                       \
5442     uint32_t vm = vext_vm(desc);                                 \
5443     uint32_t esz = sizeof(ETYPE);                                \
5444     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5445     uint32_t vta = vext_vta(desc);                               \
5446     uint32_t vma = vext_vma(desc);                               \
5447     uint32_t i;                                                  \
5448                                                                  \
5449     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5450                                                                  \
5451     for (i = env->vstart; i < vl; i++) {                         \
5452         if (!vm && !vext_elem_mask(v0, i)) {                     \
5453             /* set masked-off elements to 1s */                  \
5454             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5455             continue;                                            \
5456         }                                                        \
5457         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5458     }                                                            \
5459     env->vstart = 0;                                             \
5460     /* set tail elements to 1s */                                \
5461     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5462 }
5463 
5464 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5465 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5466 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5467 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5468 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5469 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5470 
5471 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5472 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5473 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5474 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5475 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5476 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
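
/*
 * Worked example for the integer extension helpers: widening the byte 0x80
 * with vzext.vf2 gives 0x0080 (zero extension), while vsext.vf2 gives 0xff80
 * (sign extension), since the element copy in the macro above is an ordinary
 * C assignment between the source and destination element types.
 */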
5477