xref: /qemu/target/riscv/vector_helper.c (revision 2af4a82ab2cce3412ffc92cd4c96bd870e33bc8e)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "accel/tcg/cpu-ldst.h"
25 #include "accel/tcg/probe.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "exec/tlb-flags.h"
29 #include "exec/target_page.h"
30 #include "exec/tswap.h"
31 #include "fpu/softfloat.h"
32 #include "tcg/tcg-gvec-desc.h"
33 #include "internals.h"
34 #include "vector_internals.h"
35 #include <math.h>
36 
37 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
38                             target_ulong s2)
39 {
40     int vlmax, vl;
41     RISCVCPU *cpu = env_archcpu(env);
42     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
43     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
44     uint16_t sew = 8 << vsew;
45     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
46     int xlen = riscv_cpu_xlen(env);
47     bool vill = (s2 >> (xlen - 1)) & 0x1;
48     target_ulong reserved = s2 &
49                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
50                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
51     uint16_t vlen = cpu->cfg.vlenb << 3;
52     int8_t lmul;
53 
54     if (vlmul & 4) {
55         /*
56          * Fractional LMUL, check:
57          *
58          * VLEN * LMUL >= SEW
59          * VLEN >> (8 - lmul) >= sew
60          * (vlenb << 3) >> (8 - lmul) >= sew
61          */
62         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
63             vill = true;
64         }
65     }
66 
67     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
68         /* only set vill bit. */
69         env->vill = 1;
70         env->vtype = 0;
71         env->vl = 0;
72         env->vstart = 0;
73         return 0;
74     }
75 
76     /* lmul encoded as in DisasContext::lmul */
77     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
78     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
79     if (s1 <= vlmax) {
80         vl = s1;
81     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
82         vl = (s1 + 1) >> 1;
83     } else {
84         vl = vlmax;
85     }
86     env->vl = vl;
87     env->vtype = s2;
88     env->vstart = 0;
89     env->vill = 0;
90     return vl;
91 }
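/*
 * Worked example (derived from the code above, for illustration only):
 * with VLEN = 128 (vlenb = 16), SEW = 32 (vsew = 2) and LMUL = 2 (lmul = 1),
 * vext_get_vlmax() yields VLMAX = VLEN / SEW * LMUL = 8.  A requested AVL of
 * s1 = 10 then gives vl = 8, or vl = 5 when rvv_vl_half_avl is enabled,
 * since vlmax < 10 < 2 * vlmax and (10 + 1) >> 1 == 5.
 */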
92 
93 /*
94  * Get the maximum number of elements that can be operated on.
95  *
96  * log2_esz: log2 of element size in bytes.
97  */
98 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
99 {
100     /*
101      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
102      * so vlen in bytes (vlenb) is encoded as maxsz.
103      */
104     uint32_t vlenb = simd_maxsz(desc);
105 
106     /* Return VLMAX */
107     int scale = vext_lmul(desc) - log2_esz;
108     return scale < 0 ? vlenb >> -scale : vlenb << scale;
109 }
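/*
 * Illustration: with vlenb = 16 (VLEN = 128), LMUL = 1 (vext_lmul() == 0)
 * and 32-bit elements (log2_esz = 2), scale = -2 and VLMAX = 16 >> 2 = 4.
 */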
110 
111 /*
112  * This function checks watchpoints before the real load operation.
113  *
114  * In system mode, the TLB API probe_access is enough for the watchpoint check.
115  * In user mode, there is currently no watchpoint support.
116  *
117  * It will trigger an exception if there is no mapping in the TLB
118  * and the page table walk can't fill the TLB entry. The guest
119  * software can then return here after processing the exception, or never return.
120  *
121  * This function can also be used when direct access to probe_access_flags is
122  * needed in order to access the flags. If a pointer to a flags operand is
123  * provided, the function calls probe_access_flags instead, using nonfault
124  * and updating host and flags.
125  */
126 static void probe_pages(CPURISCVState *env, target_ulong addr, target_ulong len,
127                         uintptr_t ra, MMUAccessType access_type, int mmu_index,
128                         void **host, int *flags, bool nonfault)
129 {
130     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
131     target_ulong curlen = MIN(pagelen, len);
132 
133     if (flags != NULL) {
134         *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
135                                     access_type, mmu_index, nonfault, host, ra);
136     } else {
137         probe_access(env, adjust_addr(env, addr), curlen, access_type,
138                      mmu_index, ra);
139     }
140 
141     if (len > curlen) {
142         addr += curlen;
143         curlen = len - curlen;
144         if (flags != NULL) {
145             *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
146                                         access_type, mmu_index, nonfault,
147                                         host, ra);
148         } else {
149             probe_access(env, adjust_addr(env, addr), curlen, access_type,
150                          mmu_index, ra);
151         }
152     }
153 }
154 
155 
156 static inline void vext_set_elem_mask(void *v0, int index,
157                                       uint8_t value)
158 {
159     int idx = index / 64;
160     int pos = index % 64;
161     uint64_t old = ((uint64_t *)v0)[idx];
162     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
163 }
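/*
 * Mask layout note: mask bit i lives at bit (i % 64) of the (i / 64)-th
 * uint64_t of the mask register, so deposit64() updates exactly one bit.
 */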
164 
165 /* element operations for load and store */
166 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
167                                    uint32_t idx, void *vd, uintptr_t retaddr);
168 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
169 
170 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
171 static inline QEMU_ALWAYS_INLINE                            \
172 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
173                 uint32_t idx, void *vd, uintptr_t retaddr)  \
174 {                                                           \
175     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
176     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
177 }                                                           \
178                                                             \
179 static inline QEMU_ALWAYS_INLINE                            \
180 void NAME##_host(void *vd, uint32_t idx, void *host)        \
181 {                                                           \
182     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
183     *cur = (ETYPE)LDSUF##_p(host);                          \
184 }
185 
186 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
187 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
188 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
189 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
190 
191 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
192 static inline QEMU_ALWAYS_INLINE                            \
193 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
194                 uint32_t idx, void *vd, uintptr_t retaddr)  \
195 {                                                           \
196     ETYPE data = *((ETYPE *)vd + H(idx));                   \
197     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
198 }                                                           \
199                                                             \
200 static inline QEMU_ALWAYS_INLINE                            \
201 void NAME##_host(void *vd, uint32_t idx, void *host)        \
202 {                                                           \
203     ETYPE data = *((ETYPE *)vd + H(idx));                   \
204     STSUF##_p(host, data);                                  \
205 }
206 
207 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
208 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
209 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
210 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
211 
212 static inline QEMU_ALWAYS_INLINE void
213 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
214                        void *vd, uint32_t evl, target_ulong addr,
215                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
216                        bool is_load)
217 {
218     uint32_t i;
219     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
220         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
221     }
222 }
223 
224 static inline QEMU_ALWAYS_INLINE void
225 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
226                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
227                         uint32_t esz, bool is_load)
228 {
229 #if HOST_BIG_ENDIAN
230     for (; reg_start < evl; reg_start++, host += esz) {
231         ldst_host(vd, reg_start, host);
232     }
233 #else
234     if (esz == 1) {
235         uint32_t byte_offset = reg_start * esz;
236         uint32_t size = (evl - reg_start) * esz;
237 
238         if (is_load) {
239             memcpy(vd + byte_offset, host, size);
240         } else {
241             memcpy(host, vd + byte_offset, size);
242         }
243     } else {
244         for (; reg_start < evl; reg_start++, host += esz) {
245             ldst_host(vd, reg_start, host);
246         }
247     }
248 #endif
249 }
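/*
 * Note on the fast path above: for byte-sized elements on a little-endian
 * host the in-register layout matches guest memory order, so the whole run
 * can be copied with a single memcpy() instead of a per-element loop.
 */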
250 
251 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
252                                    uint32_t desc, uint32_t nf,
253                                    uint32_t esz, uint32_t max_elems)
254 {
255     uint32_t vta = vext_vta(desc);
256     int k;
257 
258     if (vta == 0) {
259         return;
260     }
261 
262     for (k = 0; k < nf; ++k) {
263         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
264                           (k * max_elems + max_elems) * esz);
265     }
266 }
267 
268 /*
269  * stride: access vector elements from strided memory
270  */
271 static void
272 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
273                  CPURISCVState *env, uint32_t desc, uint32_t vm,
274                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
275                  uintptr_t ra)
276 {
277     uint32_t i, k;
278     uint32_t nf = vext_nf(desc);
279     uint32_t max_elems = vext_max_elems(desc, log2_esz);
280     uint32_t esz = 1 << log2_esz;
281     uint32_t vma = vext_vma(desc);
282 
283     VSTART_CHECK_EARLY_EXIT(env, env->vl);
284 
285     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
286         k = 0;
287         while (k < nf) {
288             if (!vm && !vext_elem_mask(v0, i)) {
289                 /* set masked-off elements to 1s */
290                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
291                                   (i + k * max_elems + 1) * esz);
292                 k++;
293                 continue;
294             }
295             target_ulong addr = base + stride * i + (k << log2_esz);
296             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
297             k++;
298         }
299     }
300     env->vstart = 0;
301 
302     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
303 }
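/*
 * Addressing example: element i of field k is accessed at
 * base + stride * i + (k << log2_esz), e.g. a two-field 32-bit strided
 * load with stride 8 touches base + 8*i and base + 8*i + 4 for each
 * active element i.
 */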
304 
305 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
306 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
307                   target_ulong stride, CPURISCVState *env,              \
308                   uint32_t desc)                                        \
309 {                                                                       \
310     uint32_t vm = vext_vm(desc);                                        \
311     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
312                      ctzl(sizeof(ETYPE)), GETPC());                     \
313 }
314 
315 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
316 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
317 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
318 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
319 
320 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
321 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
322                   target_ulong stride, CPURISCVState *env,              \
323                   uint32_t desc)                                        \
324 {                                                                       \
325     uint32_t vm = vext_vm(desc);                                        \
326     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
327                      ctzl(sizeof(ETYPE)), GETPC());                     \
328 }
329 
330 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
331 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
332 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
333 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
334 
335 /*
336  * unit-stride: access elements stored contiguously in memory
337  */
338 
339 /* unmasked unit-stride load and store operation */
340 static inline QEMU_ALWAYS_INLINE void
341 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
342                   uint32_t elems, uint32_t nf, uint32_t max_elems,
343                   uint32_t log2_esz, bool is_load, int mmu_index,
344                   vext_ldst_elem_fn_tlb *ldst_tlb,
345                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
346 {
347     void *host;
348     int i, k, flags;
349     uint32_t esz = 1 << log2_esz;
350     uint32_t size = (elems * nf) << log2_esz;
351     uint32_t evl = env->vstart + elems;
352     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
353 
354     /* Check page permission/pmp/watchpoint/etc. */
355     probe_pages(env, addr, size, ra, access_type, mmu_index, &host, &flags,
356                 true);
357 
358     if (flags == 0) {
359         if (nf == 1) {
360             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
361                                       host, esz, is_load);
362         } else {
363             for (i = env->vstart; i < evl; ++i) {
364                 k = 0;
365                 while (k < nf) {
366                     ldst_host(vd, i + k * max_elems, host);
367                     host += esz;
368                     k++;
369                 }
370             }
371         }
372         env->vstart += elems;
373     } else {
374         if (nf == 1) {
375             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
376                                    ra, esz, is_load);
377         } else {
378             /* load bytes from guest memory */
379             for (i = env->vstart; i < evl; env->vstart = ++i) {
380                 k = 0;
381                 while (k < nf) {
382                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
383                              vd, ra);
384                     addr += esz;
385                     k++;
386                 }
387             }
388         }
389     }
390 }
391 
392 static inline QEMU_ALWAYS_INLINE void
393 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
394              vext_ldst_elem_fn_tlb *ldst_tlb,
395              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
396              uint32_t evl, uintptr_t ra, bool is_load)
397 {
398     uint32_t k;
399     target_ulong page_split, elems, addr;
400     uint32_t nf = vext_nf(desc);
401     uint32_t max_elems = vext_max_elems(desc, log2_esz);
402     uint32_t esz = 1 << log2_esz;
403     uint32_t msize = nf * esz;
404     int mmu_index = riscv_env_mmu_index(env, false);
405 
406     VSTART_CHECK_EARLY_EXIT(env, evl);
407 
408 #if defined(CONFIG_USER_ONLY)
409     /*
410      * For data sizes <= 6 bytes we get better performance by simply calling
411      * vext_continuous_ldst_tlb
412      */
413     if (nf == 1 && (evl << log2_esz) <= 6) {
414         addr = base + (env->vstart << log2_esz);
415         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
416                                  esz, is_load);
417 
418         env->vstart = 0;
419         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
420         return;
421     }
422 #endif
423 
424     /* Calculate the page range of first page */
425     addr = base + ((env->vstart * nf) << log2_esz);
426     page_split = -(addr | TARGET_PAGE_MASK);
427     /* Get number of elements */
428     elems = page_split / msize;
429     if (unlikely(env->vstart + elems >= evl)) {
430         elems = evl - env->vstart;
431     }
432 
433     /* Load/store elements in the first page */
434     if (likely(elems)) {
435         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
436                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
437     }
438 
439     /* Load/store elements in the second page */
440     if (unlikely(env->vstart < evl)) {
441         /* Cross page element */
442         if (unlikely(page_split % msize)) {
443             for (k = 0; k < nf; k++) {
444                 addr = base + ((env->vstart * nf + k) << log2_esz);
445                 ldst_tlb(env, adjust_addr(env, addr),
446                         env->vstart + k * max_elems, vd, ra);
447             }
448             env->vstart++;
449         }
450 
451         addr = base + ((env->vstart * nf) << log2_esz);
452         /* Get number of elements of second page */
453         elems = evl - env->vstart;
454 
455         /* Load/store elements in the second page */
456         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
457                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
458     }
459 
460     env->vstart = 0;
461     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
462 }
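/*
 * Sketch of the page-split logic above: the first probe covers the
 * page_split / (nf * esz) whole segments that fit in the current page; a
 * segment straddling the page boundary (page_split % msize != 0) is done
 * element by element through the TLB path, and the remaining elements are
 * handled as a second contiguous run on the next page.
 */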
463 
464 /*
465  * Masked unit-stride load and store operations are handled as a special
466  * case of strided access, with stride = NF * sizeof(ETYPE).
467  */
468 
469 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
470 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
471                          CPURISCVState *env, uint32_t desc)         \
472 {                                                                   \
473     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
474     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
475                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
476 }                                                                   \
477                                                                     \
478 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
479                   CPURISCVState *env, uint32_t desc)                \
480 {                                                                   \
481     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
482                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
483 }
484 
485 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
486 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
487 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
488 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
489 
490 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
491 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
492                          CPURISCVState *env, uint32_t desc)              \
493 {                                                                        \
494     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
495     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
496                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
497 }                                                                        \
498                                                                          \
499 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
500                   CPURISCVState *env, uint32_t desc)                     \
501 {                                                                        \
502     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
503                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
504 }
505 
506 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
507 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
508 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
509 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
510 
511 /*
512  * unit stride mask load and store, EEW = 1
513  */
514 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
515                     CPURISCVState *env, uint32_t desc)
516 {
517     /* evl = ceil(vl/8) */
518     uint8_t evl = (env->vl + 7) >> 3;
519     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
520                  0, evl, GETPC(), true);
521 }
522 
523 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
524                     CPURISCVState *env, uint32_t desc)
525 {
526     /* evl = ceil(vl/8) */
527     uint8_t evl = (env->vl + 7) >> 3;
528     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
529                  0, evl, GETPC(), false);
530 }
531 
532 /*
533  * index: access vector elements from indexed memory
534  */
535 typedef target_ulong vext_get_index_addr(target_ulong base,
536         uint32_t idx, void *vs2);
537 
538 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
539 static target_ulong NAME(target_ulong base,            \
540                          uint32_t idx, void *vs2)      \
541 {                                                      \
542     return (base + *((ETYPE *)vs2 + H(idx)));          \
543 }
544 
545 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
546 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
547 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
548 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
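/*
 * Example: with a 16-bit index register (idx_h), the address of element i
 * is base + (uint16_t)vs2[i].  The index EEW is independent of the data
 * EEW, as the GEN_VEXT_LD_INDEX/GEN_VEXT_ST_INDEX instantiations below show.
 */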
549 
550 static inline void
551 vext_ldst_index(void *vd, void *v0, target_ulong base,
552                 void *vs2, CPURISCVState *env, uint32_t desc,
553                 vext_get_index_addr get_index_addr,
554                 vext_ldst_elem_fn_tlb *ldst_elem,
555                 uint32_t log2_esz, uintptr_t ra)
556 {
557     uint32_t i, k;
558     uint32_t nf = vext_nf(desc);
559     uint32_t vm = vext_vm(desc);
560     uint32_t max_elems = vext_max_elems(desc, log2_esz);
561     uint32_t esz = 1 << log2_esz;
562     uint32_t vma = vext_vma(desc);
563 
564     VSTART_CHECK_EARLY_EXIT(env, env->vl);
565 
566     /* load bytes from guest memory */
567     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
568         k = 0;
569         while (k < nf) {
570             if (!vm && !vext_elem_mask(v0, i)) {
571                 /* set masked-off elements to 1s */
572                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
573                                   (i + k * max_elems + 1) * esz);
574                 k++;
575                 continue;
576             }
577             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
578             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
579             k++;
580         }
581     }
582     env->vstart = 0;
583 
584     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
585 }
586 
587 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
588 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
589                   void *vs2, CPURISCVState *env, uint32_t desc)            \
590 {                                                                          \
591     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
592                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
593 }
594 
595 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
596 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
597 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
598 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
599 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
600 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
601 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
602 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
603 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
604 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
605 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
606 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
607 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
608 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
609 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
610 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
611 
612 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
613 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
614                   void *vs2, CPURISCVState *env, uint32_t desc)  \
615 {                                                                \
616     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
617                     STORE_FN, ctzl(sizeof(ETYPE)),               \
618                     GETPC());                                    \
619 }
620 
621 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
622 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
623 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
624 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
625 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
626 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
627 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
628 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
629 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
630 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
631 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
632 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
633 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
634 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
635 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
636 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
637 
638 /*
639  * unit-stride fault-only-first load instructions
640  */
641 static inline void
642 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
643           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
644           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
645 {
646     uint32_t i, k, vl = 0;
647     uint32_t nf = vext_nf(desc);
648     uint32_t vm = vext_vm(desc);
649     uint32_t max_elems = vext_max_elems(desc, log2_esz);
650     uint32_t esz = 1 << log2_esz;
651     uint32_t msize = nf * esz;
652     uint32_t vma = vext_vma(desc);
653     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
654     int mmu_index = riscv_env_mmu_index(env, false);
655     int flags, probe_flags;
656     void *host;
657 
658     VSTART_CHECK_EARLY_EXIT(env, env->vl);
659 
660     addr = base + ((env->vstart * nf) << log2_esz);
661     page_split = -(addr | TARGET_PAGE_MASK);
662     /* Get number of elements */
663     elems = page_split / msize;
664     if (unlikely(env->vstart + elems >= env->vl)) {
665         elems = env->vl - env->vstart;
666     }
667 
668     /* Check page permission/pmp/watchpoint/etc. */
669     probe_pages(env, addr, elems * msize, ra, MMU_DATA_LOAD, mmu_index, &host,
670                 &flags, true);
671 
672     /* If we are crossing a page check also the second page. */
673     if (env->vl > elems) {
674         addr_probe = addr + (elems << log2_esz);
675         probe_pages(env, addr_probe, elems * msize, ra, MMU_DATA_LOAD,
676                     mmu_index, &host, &probe_flags, true);
677         flags |= probe_flags;
678     }
679 
680     if (flags & ~TLB_WATCHPOINT) {
681         /* probe every access */
682         for (i = env->vstart; i < env->vl; i++) {
683             if (!vm && !vext_elem_mask(v0, i)) {
684                 continue;
685             }
686             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
687             if (i == 0) {
688                 /* Allow fault on first element. */
689                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD,
690                             mmu_index, &host, NULL, false);
691             } else {
692                 remain = nf << log2_esz;
693                 while (remain > 0) {
694                     offset = -(addr_i | TARGET_PAGE_MASK);
695 
696                     /* Probe nonfault on subsequent elements. */
697                     probe_pages(env, addr_i, offset, 0, MMU_DATA_LOAD,
698                                 mmu_index, &host, &flags, true);
699 
700                     /*
701                      * Stop if invalid (unmapped) or mmio (transaction may
702                      * fail). Do not stop if watchpoint, as the spec says that
703                      * first-fault should continue to access the same
704                      * elements regardless of any watchpoint.
705                      */
706                     if (flags & ~TLB_WATCHPOINT) {
707                         vl = i;
708                         goto ProbeSuccess;
709                     }
710                     if (remain <= offset) {
711                         break;
712                     }
713                     remain -= offset;
714                     addr_i = adjust_addr(env, addr_i + offset);
715                 }
716             }
717         }
718     }
719 ProbeSuccess:
720     /* load bytes from guest memory */
721     if (vl != 0) {
722         env->vl = vl;
723     }
724 
725     if (env->vstart < env->vl) {
726         if (vm) {
727             /* Load/store elements in the first page */
728             if (likely(elems)) {
729                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
730                                   log2_esz, true, mmu_index, ldst_tlb,
731                                   ldst_host, ra);
732             }
733 
734             /* Load/store elements in the second page */
735             if (unlikely(env->vstart < env->vl)) {
736                 /* Cross page element */
737                 if (unlikely(page_split % msize)) {
738                     for (k = 0; k < nf; k++) {
739                         addr = base + ((env->vstart * nf + k) << log2_esz);
740                         ldst_tlb(env, adjust_addr(env, addr),
741                                  env->vstart + k * max_elems, vd, ra);
742                     }
743                     env->vstart++;
744                 }
745 
746                 addr = base + ((env->vstart * nf) << log2_esz);
747                 /* Get number of elements of second page */
748                 elems = env->vl - env->vstart;
749 
750                 /* Load/store elements in the second page */
751                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
752                                   log2_esz, true, mmu_index, ldst_tlb,
753                                   ldst_host, ra);
754             }
755         } else {
756             for (i = env->vstart; i < env->vl; i++) {
757                 k = 0;
758                 while (k < nf) {
759                     if (!vext_elem_mask(v0, i)) {
760                         /* set masked-off elements to 1s */
761                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
762                                           (i + k * max_elems + 1) * esz);
763                         k++;
764                         continue;
765                     }
766                     addr = base + ((i * nf + k) << log2_esz);
767                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
768                              vd, ra);
769                     k++;
770                 }
771             }
772         }
773     }
774     env->vstart = 0;
775 
776     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
777 }
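/*
 * Fault-only-first semantics implemented above: only element 0 is allowed
 * to take a trap; if a later element would fault (or hit MMIO), vl is
 * truncated to the number of elements that completed and no exception is
 * raised, which is why the non-first probes use nonfault = true.
 */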
778 
779 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
780 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
781                   CPURISCVState *env, uint32_t desc)            \
782 {                                                               \
783     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
784               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
785 }
786 
787 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
788 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
789 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
790 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
791 
792 #define DO_SWAP(N, M) (M)
793 #define DO_AND(N, M)  (N & M)
794 #define DO_XOR(N, M)  (N ^ M)
795 #define DO_OR(N, M)   (N | M)
796 #define DO_ADD(N, M)  (N + M)
797 
798 /* Signed min/max */
799 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
800 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
801 
802 /*
803  * load and store whole register instructions
804  */
805 static inline QEMU_ALWAYS_INLINE void
806 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
807                 vext_ldst_elem_fn_tlb *ldst_tlb,
808                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
809                 uintptr_t ra, bool is_load)
810 {
811     target_ulong page_split, elems, addr;
812     uint32_t nf = vext_nf(desc);
813     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
814     uint32_t max_elems = vlenb >> log2_esz;
815     uint32_t evl = nf * max_elems;
816     uint32_t esz = 1 << log2_esz;
817     int mmu_index = riscv_env_mmu_index(env, false);
818 
819     /* Calculate the page range of first page */
820     addr = base + (env->vstart << log2_esz);
821     page_split = -(addr | TARGET_PAGE_MASK);
822     /* Get number of elements */
823     elems = page_split / esz;
824     if (unlikely(env->vstart + elems >= evl)) {
825         elems = evl - env->vstart;
826     }
827 
828     /* Load/store elements in the first page */
829     if (likely(elems)) {
830         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
831                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
832     }
833 
834     /* Load/store elements in the second page */
835     if (unlikely(env->vstart < evl)) {
836         /* Cross page element */
837         if (unlikely(page_split % esz)) {
838             addr = base + (env->vstart << log2_esz);
839             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
840             env->vstart++;
841         }
842 
843         addr = base + (env->vstart << log2_esz);
844         /* Get number of elements of second page */
845         elems = evl - env->vstart;
846 
847         /* Load/store elements in the second page */
848         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
849                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
850     }
851 
852     env->vstart = 0;
853 }
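/*
 * For whole-register accesses evl = nf * (vlenb >> log2_esz), i.e. NF
 * complete vector registers are transferred regardless of vl, with only
 * vstart (e.g. after a resumed trap) reducing the amount of work.
 */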
854 
855 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
856 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
857                   uint32_t desc)                                    \
858 {                                                                   \
859     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
860                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
861 }
862 
863 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
864 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
865 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
866 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
867 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
868 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
869 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
870 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
871 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
872 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
873 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
874 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
875 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
876 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
877 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
878 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
879 
880 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
881 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
882                   uint32_t desc)                                        \
883 {                                                                       \
884     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
885                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
886 }
887 
888 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
889 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
890 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
891 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
892 
893 /*
894  * Vector Integer Arithmetic Instructions
895  */
896 
897 /* (TD, T1, T2, TX1, TX2) */
898 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
899 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
900 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
901 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
902 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
903 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
904 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
905 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
906 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
907 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
908 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
909 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
910 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
911 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
912 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
913 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
914 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
915 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
916 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
917 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
918 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
919 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
920 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
921 
922 #define DO_SUB(N, M) (N - M)
923 #define DO_RSUB(N, M) (M - N)
924 
925 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
926 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
927 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
928 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
929 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
930 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
931 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
932 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
933 
934 GEN_VEXT_VV(vadd_vv_b, 1)
935 GEN_VEXT_VV(vadd_vv_h, 2)
936 GEN_VEXT_VV(vadd_vv_w, 4)
937 GEN_VEXT_VV(vadd_vv_d, 8)
938 GEN_VEXT_VV(vsub_vv_b, 1)
939 GEN_VEXT_VV(vsub_vv_h, 2)
940 GEN_VEXT_VV(vsub_vv_w, 4)
941 GEN_VEXT_VV(vsub_vv_d, 8)
942 
943 
944 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
945 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
946 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
947 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
948 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
949 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
950 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
951 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
952 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
953 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
954 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
955 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
956 
957 GEN_VEXT_VX(vadd_vx_b, 1)
958 GEN_VEXT_VX(vadd_vx_h, 2)
959 GEN_VEXT_VX(vadd_vx_w, 4)
960 GEN_VEXT_VX(vadd_vx_d, 8)
961 GEN_VEXT_VX(vsub_vx_b, 1)
962 GEN_VEXT_VX(vsub_vx_h, 2)
963 GEN_VEXT_VX(vsub_vx_w, 4)
964 GEN_VEXT_VX(vsub_vx_d, 8)
965 GEN_VEXT_VX(vrsub_vx_b, 1)
966 GEN_VEXT_VX(vrsub_vx_h, 2)
967 GEN_VEXT_VX(vrsub_vx_w, 4)
968 GEN_VEXT_VX(vrsub_vx_d, 8)
969 
970 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
971 {
972     intptr_t oprsz = simd_oprsz(desc);
973     intptr_t i;
974 
975     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
976         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
977     }
978 }
979 
980 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
981 {
982     intptr_t oprsz = simd_oprsz(desc);
983     intptr_t i;
984 
985     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
986         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
987     }
988 }
989 
990 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
991 {
992     intptr_t oprsz = simd_oprsz(desc);
993     intptr_t i;
994 
995     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
996         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
997     }
998 }
999 
1000 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
1001 {
1002     intptr_t oprsz = simd_oprsz(desc);
1003     intptr_t i;
1004 
1005     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1006         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
1007     }
1008 }
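/*
 * The vec_rsubsN helpers above compute each destination element as
 * (scalar - element), with the scalar truncated to the element width;
 * they serve as the out-of-line fallback for the vrsub.vx gvec expansion.
 */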
1009 
1010 /* Vector Widening Integer Add/Subtract */
1011 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
1012 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
1013 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
1014 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
1015 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
1016 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
1017 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1018 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1019 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1020 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1021 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1022 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1023 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1024 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1025 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1026 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1027 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1028 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1029 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1030 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1031 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1032 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1033 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1034 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1035 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1036 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1037 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1038 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1039 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1040 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1041 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1042 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1043 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1044 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1045 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1046 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1047 GEN_VEXT_VV(vwaddu_vv_b, 2)
1048 GEN_VEXT_VV(vwaddu_vv_h, 4)
1049 GEN_VEXT_VV(vwaddu_vv_w, 8)
1050 GEN_VEXT_VV(vwsubu_vv_b, 2)
1051 GEN_VEXT_VV(vwsubu_vv_h, 4)
1052 GEN_VEXT_VV(vwsubu_vv_w, 8)
1053 GEN_VEXT_VV(vwadd_vv_b, 2)
1054 GEN_VEXT_VV(vwadd_vv_h, 4)
1055 GEN_VEXT_VV(vwadd_vv_w, 8)
1056 GEN_VEXT_VV(vwsub_vv_b, 2)
1057 GEN_VEXT_VV(vwsub_vv_h, 4)
1058 GEN_VEXT_VV(vwsub_vv_w, 8)
1059 GEN_VEXT_VV(vwaddu_wv_b, 2)
1060 GEN_VEXT_VV(vwaddu_wv_h, 4)
1061 GEN_VEXT_VV(vwaddu_wv_w, 8)
1062 GEN_VEXT_VV(vwsubu_wv_b, 2)
1063 GEN_VEXT_VV(vwsubu_wv_h, 4)
1064 GEN_VEXT_VV(vwsubu_wv_w, 8)
1065 GEN_VEXT_VV(vwadd_wv_b, 2)
1066 GEN_VEXT_VV(vwadd_wv_h, 4)
1067 GEN_VEXT_VV(vwadd_wv_w, 8)
1068 GEN_VEXT_VV(vwsub_wv_b, 2)
1069 GEN_VEXT_VV(vwsub_wv_h, 4)
1070 GEN_VEXT_VV(vwsub_wv_w, 8)
1071 
1072 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1073 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1074 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1075 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1076 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1077 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1078 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1079 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1080 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1081 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1082 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1083 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1084 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1085 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1086 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1087 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1088 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1089 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1090 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1091 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1092 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1093 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1094 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1095 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1096 GEN_VEXT_VX(vwaddu_vx_b, 2)
1097 GEN_VEXT_VX(vwaddu_vx_h, 4)
1098 GEN_VEXT_VX(vwaddu_vx_w, 8)
1099 GEN_VEXT_VX(vwsubu_vx_b, 2)
1100 GEN_VEXT_VX(vwsubu_vx_h, 4)
1101 GEN_VEXT_VX(vwsubu_vx_w, 8)
1102 GEN_VEXT_VX(vwadd_vx_b, 2)
1103 GEN_VEXT_VX(vwadd_vx_h, 4)
1104 GEN_VEXT_VX(vwadd_vx_w, 8)
1105 GEN_VEXT_VX(vwsub_vx_b, 2)
1106 GEN_VEXT_VX(vwsub_vx_h, 4)
1107 GEN_VEXT_VX(vwsub_vx_w, 8)
1108 GEN_VEXT_VX(vwaddu_wx_b, 2)
1109 GEN_VEXT_VX(vwaddu_wx_h, 4)
1110 GEN_VEXT_VX(vwaddu_wx_w, 8)
1111 GEN_VEXT_VX(vwsubu_wx_b, 2)
1112 GEN_VEXT_VX(vwsubu_wx_h, 4)
1113 GEN_VEXT_VX(vwsubu_wx_w, 8)
1114 GEN_VEXT_VX(vwadd_wx_b, 2)
1115 GEN_VEXT_VX(vwadd_wx_h, 4)
1116 GEN_VEXT_VX(vwadd_wx_w, 8)
1117 GEN_VEXT_VX(vwsub_wx_b, 2)
1118 GEN_VEXT_VX(vwsub_wx_h, 4)
1119 GEN_VEXT_VX(vwsub_wx_w, 8)
1120 
1121 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1122 #define DO_VADC(N, M, C) (N + M + C)
1123 #define DO_VSBC(N, M, C) (N - M - C)
1124 
1125 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1126 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1127                   CPURISCVState *env, uint32_t desc)          \
1128 {                                                             \
1129     uint32_t vl = env->vl;                                    \
1130     uint32_t esz = sizeof(ETYPE);                             \
1131     uint32_t total_elems =                                    \
1132         vext_get_total_elems(env, desc, esz);                 \
1133     uint32_t vta = vext_vta(desc);                            \
1134     uint32_t i;                                               \
1135                                                               \
1136     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1137                                                               \
1138     for (i = env->vstart; i < vl; i++) {                      \
1139         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1140         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1141         ETYPE carry = vext_elem_mask(v0, i);                  \
1142                                                               \
1143         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1144     }                                                         \
1145     env->vstart = 0;                                          \
1146     /* set tail elements to 1s */                             \
1147     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1148 }
1149 
1150 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1151 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1152 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1153 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1154 
1155 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1156 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1157 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1158 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1159 
1160 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1161 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1162                   CPURISCVState *env, uint32_t desc)                     \
1163 {                                                                        \
1164     uint32_t vl = env->vl;                                               \
1165     uint32_t esz = sizeof(ETYPE);                                        \
1166     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1167     uint32_t vta = vext_vta(desc);                                       \
1168     uint32_t i;                                                          \
1169                                                                          \
1170     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1171                                                                          \
1172     for (i = env->vstart; i < vl; i++) {                                 \
1173         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1174         ETYPE carry = vext_elem_mask(v0, i);                             \
1175                                                                          \
1176         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1177     }                                                                    \
1178     env->vstart = 0;                                                     \
1179     /* set tail elements to 1s */                                        \
1180     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1181 }
1182 
1183 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1184 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1185 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1186 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1187 
1188 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1189 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1190 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1191 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1192 
1193 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1194                           (__typeof(N))(N + M) < N)
1195 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
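/*
 * Rationale for the macros above: an unsigned add N + M carries out iff the
 * truncated sum is smaller than either operand, hence (N + M) < N; with a
 * carry-in of 1 the borderline case N + M + 1 == N (i.e. M == ~0) also
 * carries, hence the <= comparison.  DO_MSBC mirrors this for borrow.
 */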
1196 
1197 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1198 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1199                   CPURISCVState *env, uint32_t desc)          \
1200 {                                                             \
1201     uint32_t vl = env->vl;                                    \
1202     uint32_t vm = vext_vm(desc);                              \
1203     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1204     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1205     uint32_t i;                                               \
1206                                                               \
1207     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1208                                                               \
1209     for (i = env->vstart; i < vl; i++) {                      \
1210         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1211         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1212         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1213         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1214     }                                                         \
1215     env->vstart = 0;                                          \
1216     /*
1217      * mask destination registers are always tail-agnostic
1218      * set tail elements to 1s
1219      */                                                       \
1220     if (vta_all_1s) {                                         \
1221         for (; i < total_elems; i++) {                        \
1222             vext_set_elem_mask(vd, i, 1);                     \
1223         }                                                     \
1224     }                                                         \
1225 }
1226 
1227 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1228 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1229 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1230 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1231 
1232 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1233 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1234 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1235 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1236 
1237 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1238 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1239                   void *vs2, CPURISCVState *env, uint32_t desc) \
1240 {                                                               \
1241     uint32_t vl = env->vl;                                      \
1242     uint32_t vm = vext_vm(desc);                                \
1243     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1244     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1245     uint32_t i;                                                 \
1246                                                                 \
1247     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1248                                                                 \
1249     for (i = env->vstart; i < vl; i++) {                        \
1250         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1251         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1252         vext_set_elem_mask(vd, i,                               \
1253                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1254     }                                                           \
1255     env->vstart = 0;                                            \
1256     /*
1257      * mask destination registers are always tail-agnostic
1258      * set tail elements to 1s
1259      */                                                         \
1260     if (vta_all_1s) {                                           \
1261         for (; i < total_elems; i++) {                          \
1262             vext_set_elem_mask(vd, i, 1);                       \
1263         }                                                       \
1264     }                                                           \
1265 }
1266 
1267 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1268 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1269 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1270 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1271 
1272 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1273 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1274 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1275 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1276 
1277 /* Vector Bitwise Logical Instructions */
1278 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1279 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1280 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1281 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1282 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1283 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1284 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1285 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1286 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1287 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1288 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1289 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1290 GEN_VEXT_VV(vand_vv_b, 1)
1291 GEN_VEXT_VV(vand_vv_h, 2)
1292 GEN_VEXT_VV(vand_vv_w, 4)
1293 GEN_VEXT_VV(vand_vv_d, 8)
1294 GEN_VEXT_VV(vor_vv_b, 1)
1295 GEN_VEXT_VV(vor_vv_h, 2)
1296 GEN_VEXT_VV(vor_vv_w, 4)
1297 GEN_VEXT_VV(vor_vv_d, 8)
1298 GEN_VEXT_VV(vxor_vv_b, 1)
1299 GEN_VEXT_VV(vxor_vv_h, 2)
1300 GEN_VEXT_VV(vxor_vv_w, 4)
1301 GEN_VEXT_VV(vxor_vv_d, 8)
1302 
1303 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1304 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1305 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1306 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1307 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1308 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1309 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1310 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1311 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1312 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1313 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1314 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1315 GEN_VEXT_VX(vand_vx_b, 1)
1316 GEN_VEXT_VX(vand_vx_h, 2)
1317 GEN_VEXT_VX(vand_vx_w, 4)
1318 GEN_VEXT_VX(vand_vx_d, 8)
1319 GEN_VEXT_VX(vor_vx_b, 1)
1320 GEN_VEXT_VX(vor_vx_h, 2)
1321 GEN_VEXT_VX(vor_vx_w, 4)
1322 GEN_VEXT_VX(vor_vx_d, 8)
1323 GEN_VEXT_VX(vxor_vx_b, 1)
1324 GEN_VEXT_VX(vxor_vx_h, 2)
1325 GEN_VEXT_VX(vxor_vx_w, 4)
1326 GEN_VEXT_VX(vxor_vx_d, 8)
1327 
1328 /* Vector Single-Width Bit Shift Instructions */
1329 #define DO_SLL(N, M)  (N << (M))
1330 #define DO_SRL(N, M)  (N >> (M))
1331 
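/*
 * For illustration only (not one of the generated helpers): the shift
 * helpers below mask the shift amount to log2(SEW) bits, so shifting a
 * byte element by 9 behaves like shifting it by 1.
 */
static inline uint8_t example_vsll_elem_b(uint8_t s2, uint8_t s1)
{
    return DO_SLL(s2, s1 & 0x7);    /* s2 = 1, s1 = 9  ->  1 << 1 == 2 */
}
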
1332 /* generate the helpers for shift instructions with two vector operands */
1333 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1334 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1335                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1336 {                                                                         \
1337     uint32_t vm = vext_vm(desc);                                          \
1338     uint32_t vl = env->vl;                                                \
1339     uint32_t esz = sizeof(TS1);                                           \
1340     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1341     uint32_t vta = vext_vta(desc);                                        \
1342     uint32_t vma = vext_vma(desc);                                        \
1343     uint32_t i;                                                           \
1344                                                                           \
1345     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1346                                                                           \
1347     for (i = env->vstart; i < vl; i++) {                                  \
1348         if (!vm && !vext_elem_mask(v0, i)) {                              \
1349             /* set masked-off elements to 1s */                           \
1350             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1351             continue;                                                     \
1352         }                                                                 \
1353         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1354         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1355         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1356     }                                                                     \
1357     env->vstart = 0;                                                      \
1358     /* set tail elements to 1s */                                         \
1359     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1360 }
1361 
1362 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1363 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1364 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1365 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1366 
1367 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1368 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1369 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1370 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1371 
1372 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1373 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1374 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1375 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1376 
1377 /*
1378  * generate the helpers for shift instructions with one vector and one scalar
1379  */
1380 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1381 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1382                   void *vs2, CPURISCVState *env,            \
1383                   uint32_t desc)                            \
1384 {                                                           \
1385     uint32_t vm = vext_vm(desc);                            \
1386     uint32_t vl = env->vl;                                  \
1387     uint32_t esz = sizeof(TD);                              \
1388     uint32_t total_elems =                                  \
1389         vext_get_total_elems(env, desc, esz);               \
1390     uint32_t vta = vext_vta(desc);                          \
1391     uint32_t vma = vext_vma(desc);                          \
1392     uint32_t i;                                             \
1393                                                             \
1394     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1395                                                             \
1396     for (i = env->vstart; i < vl; i++) {                    \
1397         if (!vm && !vext_elem_mask(v0, i)) {                \
1398             /* set masked-off elements to 1s */             \
1399             vext_set_elems_1s(vd, vma, i * esz,             \
1400                               (i + 1) * esz);               \
1401             continue;                                       \
1402         }                                                   \
1403         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1404         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1405     }                                                       \
1406     env->vstart = 0;                                        \
1407     /* set tail elements to 1s */                           \
1408     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1409 }
1410 
1411 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1412 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1413 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1414 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1415 
1416 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1417 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1418 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1419 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1420 
1421 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1422 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1423 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1424 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1425 
1426 /* Vector Narrowing Integer Right Shift Instructions */
1427 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1428 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1429 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1430 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1431 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1432 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1433 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1434 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1435 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1436 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1437 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1438 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
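
/*
 * For illustration only (not one of the generated helpers): the
 * narrowing variants above reuse the same macros with a double-width
 * source element, mask the shift amount to log2(2*SEW) bits, and let
 * the store into the narrow destination type discard the upper half.
 */
static inline uint8_t example_vnsrl_elem_b(uint16_t s2, uint8_t s1)
{
    return DO_SRL(s2, s1 & 0xf);    /* s2 = 0x1234, s1 = 8  ->  0x12 */
}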
1439 
1440 /* Vector Integer Comparison Instructions */
1441 #define DO_MSEQ(N, M) (N == M)
1442 #define DO_MSNE(N, M) (N != M)
1443 #define DO_MSLT(N, M) (N < M)
1444 #define DO_MSLE(N, M) (N <= M)
1445 #define DO_MSGT(N, M) (N > M)
1446 
1447 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1448 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1449                   CPURISCVState *env, uint32_t desc)          \
1450 {                                                             \
1451     uint32_t vm = vext_vm(desc);                              \
1452     uint32_t vl = env->vl;                                    \
1453     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1454     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1455     uint32_t vma = vext_vma(desc);                            \
1456     uint32_t i;                                               \
1457                                                               \
1458     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1459                                                               \
1460     for (i = env->vstart; i < vl; i++) {                      \
1461         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1462         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1463         if (!vm && !vext_elem_mask(v0, i)) {                  \
1464             /* set masked-off elements to 1s */               \
1465             if (vma) {                                        \
1466                 vext_set_elem_mask(vd, i, 1);                 \
1467             }                                                 \
1468             continue;                                         \
1469         }                                                     \
1470         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1471     }                                                         \
1472     env->vstart = 0;                                          \
1473     /*
1474      * mask destination registers are always tail-agnostic
1475      * set tail elements to 1s
1476      */                                                       \
1477     if (vta_all_1s) {                                         \
1478         for (; i < total_elems; i++) {                        \
1479             vext_set_elem_mask(vd, i, 1);                     \
1480         }                                                     \
1481     }                                                         \
1482 }
1483 
1484 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1485 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1486 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1487 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1488 
1489 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1490 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1491 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1492 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1493 
1494 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1495 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1496 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1497 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1498 
1499 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1500 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1501 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1502 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1503 
1504 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1505 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1506 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1507 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1508 
1509 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1510 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1511 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1512 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1513 
1514 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1515 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1516                   CPURISCVState *env, uint32_t desc)                \
1517 {                                                                   \
1518     uint32_t vm = vext_vm(desc);                                    \
1519     uint32_t vl = env->vl;                                          \
1520     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1521     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1522     uint32_t vma = vext_vma(desc);                                  \
1523     uint32_t i;                                                     \
1524                                                                     \
1525     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1526                                                                     \
1527     for (i = env->vstart; i < vl; i++) {                            \
1528         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1529         if (!vm && !vext_elem_mask(v0, i)) {                        \
1530             /* set masked-off elements to 1s */                     \
1531             if (vma) {                                              \
1532                 vext_set_elem_mask(vd, i, 1);                       \
1533             }                                                       \
1534             continue;                                               \
1535         }                                                           \
1536         vext_set_elem_mask(vd, i,                                   \
1537                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1538     }                                                               \
1539     env->vstart = 0;                                                \
1540     /*
1541      * mask destination registers are always tail-agnostic
1542      * set tail elements to 1s
1543      */                                                             \
1544     if (vta_all_1s) {                                               \
1545         for (; i < total_elems; i++) {                              \
1546             vext_set_elem_mask(vd, i, 1);                           \
1547         }                                                           \
1548     }                                                               \
1549 }
1550 
1551 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1552 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1553 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1554 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1555 
1556 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1557 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1558 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1559 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1560 
1561 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1562 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1563 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1564 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1565 
1566 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1567 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1568 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1569 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1570 
1571 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1572 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1573 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1574 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1575 
1576 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1577 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1578 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1579 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1580 
1581 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1582 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1583 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1584 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1585 
1586 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1587 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1588 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1589 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1590 
1591 /* Vector Integer Min/Max Instructions */
1592 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1593 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1594 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1595 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1596 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1597 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1598 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1599 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1600 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1601 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1602 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1603 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1604 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1605 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1606 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1607 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1608 GEN_VEXT_VV(vminu_vv_b, 1)
1609 GEN_VEXT_VV(vminu_vv_h, 2)
1610 GEN_VEXT_VV(vminu_vv_w, 4)
1611 GEN_VEXT_VV(vminu_vv_d, 8)
1612 GEN_VEXT_VV(vmin_vv_b, 1)
1613 GEN_VEXT_VV(vmin_vv_h, 2)
1614 GEN_VEXT_VV(vmin_vv_w, 4)
1615 GEN_VEXT_VV(vmin_vv_d, 8)
1616 GEN_VEXT_VV(vmaxu_vv_b, 1)
1617 GEN_VEXT_VV(vmaxu_vv_h, 2)
1618 GEN_VEXT_VV(vmaxu_vv_w, 4)
1619 GEN_VEXT_VV(vmaxu_vv_d, 8)
1620 GEN_VEXT_VV(vmax_vv_b, 1)
1621 GEN_VEXT_VV(vmax_vv_h, 2)
1622 GEN_VEXT_VV(vmax_vv_w, 4)
1623 GEN_VEXT_VV(vmax_vv_d, 8)
1624 
1625 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1626 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1627 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1628 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1629 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1630 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1631 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1632 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1633 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1634 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1635 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1636 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1637 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1638 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1639 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1640 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1641 GEN_VEXT_VX(vminu_vx_b, 1)
1642 GEN_VEXT_VX(vminu_vx_h, 2)
1643 GEN_VEXT_VX(vminu_vx_w, 4)
1644 GEN_VEXT_VX(vminu_vx_d, 8)
1645 GEN_VEXT_VX(vmin_vx_b, 1)
1646 GEN_VEXT_VX(vmin_vx_h, 2)
1647 GEN_VEXT_VX(vmin_vx_w, 4)
1648 GEN_VEXT_VX(vmin_vx_d, 8)
1649 GEN_VEXT_VX(vmaxu_vx_b, 1)
1650 GEN_VEXT_VX(vmaxu_vx_h, 2)
1651 GEN_VEXT_VX(vmaxu_vx_w, 4)
1652 GEN_VEXT_VX(vmaxu_vx_d, 8)
1653 GEN_VEXT_VX(vmax_vx_b, 1)
1654 GEN_VEXT_VX(vmax_vx_h, 2)
1655 GEN_VEXT_VX(vmax_vx_w, 4)
1656 GEN_VEXT_VX(vmax_vx_d, 8)
1657 
1658 /* Vector Single-Width Integer Multiply Instructions */
1659 #define DO_MUL(N, M) (N * M)
1660 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1661 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1662 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1663 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1664 GEN_VEXT_VV(vmul_vv_b, 1)
1665 GEN_VEXT_VV(vmul_vv_h, 2)
1666 GEN_VEXT_VV(vmul_vv_w, 4)
1667 GEN_VEXT_VV(vmul_vv_d, 8)
1668 
1669 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1670 {
1671     return (int16_t)s2 * (int16_t)s1 >> 8;
1672 }
1673 
1674 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1675 {
1676     return (int32_t)s2 * (int32_t)s1 >> 16;
1677 }
1678 
1679 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1680 {
1681     return (int64_t)s2 * (int64_t)s1 >> 32;
1682 }
1683 
1684 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1685 {
1686     uint64_t hi_64, lo_64;
1687 
1688     muls64(&lo_64, &hi_64, s1, s2);
1689     return hi_64;
1690 }
1691 
1692 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1693 {
1694     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1695 }
1696 
1697 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1698 {
1699     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1700 }
1701 
1702 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1703 {
1704     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1705 }
1706 
1707 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1708 {
1709     uint64_t hi_64, lo_64;
1710 
1711     mulu64(&lo_64, &hi_64, s2, s1);
1712     return hi_64;
1713 }
1714 
1715 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1716 {
1717     return (int16_t)s2 * (uint16_t)s1 >> 8;
1718 }
1719 
1720 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1721 {
1722     return (int32_t)s2 * (uint32_t)s1 >> 16;
1723 }
1724 
1725 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1726 {
1727     return (int64_t)s2 * (uint64_t)s1 >> 32;
1728 }
1729 
1730 /*
1731  * Let  A = signed operand,
1732  *      B = unsigned operand
1733  *      P = mulu64(A, B), unsigned product
1734  *
1735  * LET  X = 2 ** 64  - A, 2's complement of A
1736  *      SP = signed product
1737  * THEN
1738  *      IF A < 0
1739  *          SP = -X * B
1740  *             = -(2 ** 64 - A) * B
1741  *             = A * B - 2 ** 64 * B
1742  *             = P - 2 ** 64 * B
1743  *      ELSE
1744  *          SP = P
1745  * THEN
1746  *      HI_P -= (A < 0 ? B : 0)
1747  */
1748 
1749 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1750 {
1751     uint64_t hi_64, lo_64;
1752 
1753     mulu64(&lo_64, &hi_64, s2, s1);
1754 
1755     hi_64 -= s2 < 0 ? s1 : 0;
1756     return hi_64;
1757 }
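
/*
 * Worked check of the correction above, for illustration only: with
 * s2 = -1 and s1 = 2, mulu64() multiplies 0xffffffffffffffff by 2 and
 * returns hi_64 = 1; subtracting s1 gives -1, the true high half of
 * the 128-bit signed-by-unsigned product -2.  Assuming a compiler
 * that provides __int128 (a GCC/Clang extension), a direct reference
 * computation is:
 */
static inline int64_t example_mulhsu_d_ref(int64_t s2, uint64_t s1)
{
    return (__int128)s2 * s1 >> 64;
}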
1758 
1759 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1760 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1761 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1762 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1763 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1764 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1765 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1766 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1767 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1768 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1769 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1770 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1771 GEN_VEXT_VV(vmulh_vv_b, 1)
1772 GEN_VEXT_VV(vmulh_vv_h, 2)
1773 GEN_VEXT_VV(vmulh_vv_w, 4)
1774 GEN_VEXT_VV(vmulh_vv_d, 8)
1775 GEN_VEXT_VV(vmulhu_vv_b, 1)
1776 GEN_VEXT_VV(vmulhu_vv_h, 2)
1777 GEN_VEXT_VV(vmulhu_vv_w, 4)
1778 GEN_VEXT_VV(vmulhu_vv_d, 8)
1779 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1780 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1781 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1782 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1783 
1784 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1785 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1786 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1787 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1788 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1789 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1790 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1791 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1792 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1793 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1794 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1795 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1796 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1797 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1798 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1799 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1800 GEN_VEXT_VX(vmul_vx_b, 1)
1801 GEN_VEXT_VX(vmul_vx_h, 2)
1802 GEN_VEXT_VX(vmul_vx_w, 4)
1803 GEN_VEXT_VX(vmul_vx_d, 8)
1804 GEN_VEXT_VX(vmulh_vx_b, 1)
1805 GEN_VEXT_VX(vmulh_vx_h, 2)
1806 GEN_VEXT_VX(vmulh_vx_w, 4)
1807 GEN_VEXT_VX(vmulh_vx_d, 8)
1808 GEN_VEXT_VX(vmulhu_vx_b, 1)
1809 GEN_VEXT_VX(vmulhu_vx_h, 2)
1810 GEN_VEXT_VX(vmulhu_vx_w, 4)
1811 GEN_VEXT_VX(vmulhu_vx_d, 8)
1812 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1813 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1814 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1815 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1816 
1817 /* Vector Integer Divide Instructions */
1818 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1819 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1820 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1821         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1822 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1823         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
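
/*
 * For illustration only: the macros above encode the RISC-V division
 * corner cases instead of trapping.  Division by zero yields all ones
 * for vdiv[u] and the unchanged dividend for vrem[u]; signed overflow
 * (most negative value divided by -1) yields the most negative value
 * for vdiv and 0 for vrem.  A sketch for one byte element (the int8_t
 * return truncates the promoted result, as the helpers' element store
 * does):
 */
static inline int8_t example_vdiv_elem_b(int8_t n, int8_t m)
{
    return DO_DIV(n, m);    /* e.g. -128 / 0 -> -1, -128 / -1 -> -128 */
}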
1824 
1825 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1826 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1827 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1828 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1829 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1830 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1831 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1832 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1833 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1834 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1835 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1836 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1837 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1838 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1839 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1840 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1841 GEN_VEXT_VV(vdivu_vv_b, 1)
1842 GEN_VEXT_VV(vdivu_vv_h, 2)
1843 GEN_VEXT_VV(vdivu_vv_w, 4)
1844 GEN_VEXT_VV(vdivu_vv_d, 8)
1845 GEN_VEXT_VV(vdiv_vv_b, 1)
1846 GEN_VEXT_VV(vdiv_vv_h, 2)
1847 GEN_VEXT_VV(vdiv_vv_w, 4)
1848 GEN_VEXT_VV(vdiv_vv_d, 8)
1849 GEN_VEXT_VV(vremu_vv_b, 1)
1850 GEN_VEXT_VV(vremu_vv_h, 2)
1851 GEN_VEXT_VV(vremu_vv_w, 4)
1852 GEN_VEXT_VV(vremu_vv_d, 8)
1853 GEN_VEXT_VV(vrem_vv_b, 1)
1854 GEN_VEXT_VV(vrem_vv_h, 2)
1855 GEN_VEXT_VV(vrem_vv_w, 4)
1856 GEN_VEXT_VV(vrem_vv_d, 8)
1857 
1858 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1859 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1860 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1861 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1862 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1863 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1864 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1865 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1866 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1867 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1868 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1869 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1870 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1871 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1872 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1873 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1874 GEN_VEXT_VX(vdivu_vx_b, 1)
1875 GEN_VEXT_VX(vdivu_vx_h, 2)
1876 GEN_VEXT_VX(vdivu_vx_w, 4)
1877 GEN_VEXT_VX(vdivu_vx_d, 8)
1878 GEN_VEXT_VX(vdiv_vx_b, 1)
1879 GEN_VEXT_VX(vdiv_vx_h, 2)
1880 GEN_VEXT_VX(vdiv_vx_w, 4)
1881 GEN_VEXT_VX(vdiv_vx_d, 8)
1882 GEN_VEXT_VX(vremu_vx_b, 1)
1883 GEN_VEXT_VX(vremu_vx_h, 2)
1884 GEN_VEXT_VX(vremu_vx_w, 4)
1885 GEN_VEXT_VX(vremu_vx_d, 8)
1886 GEN_VEXT_VX(vrem_vx_b, 1)
1887 GEN_VEXT_VX(vrem_vx_h, 2)
1888 GEN_VEXT_VX(vrem_vx_w, 4)
1889 GEN_VEXT_VX(vrem_vx_d, 8)
1890 
1891 /* Vector Widening Integer Multiply Instructions */
1892 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1893 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1894 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1895 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1896 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1897 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1898 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1899 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1900 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1901 GEN_VEXT_VV(vwmul_vv_b, 2)
1902 GEN_VEXT_VV(vwmul_vv_h, 4)
1903 GEN_VEXT_VV(vwmul_vv_w, 8)
1904 GEN_VEXT_VV(vwmulu_vv_b, 2)
1905 GEN_VEXT_VV(vwmulu_vv_h, 4)
1906 GEN_VEXT_VV(vwmulu_vv_w, 8)
1907 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1908 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1909 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1910 
1911 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1912 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1913 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1914 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1915 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1916 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1917 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1918 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1919 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1920 GEN_VEXT_VX(vwmul_vx_b, 2)
1921 GEN_VEXT_VX(vwmul_vx_h, 4)
1922 GEN_VEXT_VX(vwmul_vx_w, 8)
1923 GEN_VEXT_VX(vwmulu_vx_b, 2)
1924 GEN_VEXT_VX(vwmulu_vx_h, 4)
1925 GEN_VEXT_VX(vwmulu_vx_w, 8)
1926 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1927 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1928 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1929 
1930 /* Vector Single-Width Integer Multiply-Add Instructions */
1931 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1932 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1933 {                                                                  \
1934     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1935     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1936     TD d = *((TD *)vd + HD(i));                                    \
1937     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1938 }
1939 
1940 #define DO_MACC(N, M, D) (M * N + D)
1941 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1942 #define DO_MADD(N, M, D) (M * D + N)
1943 #define DO_NMSUB(N, M, D) (-(M * D) + N)
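
/*
 * For reference, with OP(s2, s1, d) invoked as above the four macros
 * differ only in which operand acts as the accumulator:
 *
 *     vmacc:   vd[i] =  (vs1[i] * vs2[i]) + vd[i]      (DO_MACC)
 *     vnmsac:  vd[i] = -(vs1[i] * vs2[i]) + vd[i]      (DO_NMSAC)
 *     vmadd:   vd[i] =  (vs1[i] * vd[i])  + vs2[i]     (DO_MADD)
 *     vnmsub:  vd[i] = -(vs1[i] * vd[i])  + vs2[i]     (DO_NMSUB)
 */
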
1944 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1945 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1946 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1947 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1948 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1949 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1950 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1951 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1952 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1953 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1954 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1955 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1956 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1957 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1958 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1959 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1960 GEN_VEXT_VV(vmacc_vv_b, 1)
1961 GEN_VEXT_VV(vmacc_vv_h, 2)
1962 GEN_VEXT_VV(vmacc_vv_w, 4)
1963 GEN_VEXT_VV(vmacc_vv_d, 8)
1964 GEN_VEXT_VV(vnmsac_vv_b, 1)
1965 GEN_VEXT_VV(vnmsac_vv_h, 2)
1966 GEN_VEXT_VV(vnmsac_vv_w, 4)
1967 GEN_VEXT_VV(vnmsac_vv_d, 8)
1968 GEN_VEXT_VV(vmadd_vv_b, 1)
1969 GEN_VEXT_VV(vmadd_vv_h, 2)
1970 GEN_VEXT_VV(vmadd_vv_w, 4)
1971 GEN_VEXT_VV(vmadd_vv_d, 8)
1972 GEN_VEXT_VV(vnmsub_vv_b, 1)
1973 GEN_VEXT_VV(vnmsub_vv_h, 2)
1974 GEN_VEXT_VV(vnmsub_vv_w, 4)
1975 GEN_VEXT_VV(vnmsub_vv_d, 8)
1976 
1977 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1978 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1979 {                                                                   \
1980     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1981     TD d = *((TD *)vd + HD(i));                                     \
1982     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1983 }
1984 
1985 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1986 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1987 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1988 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1989 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1990 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1991 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1992 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1993 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1994 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1995 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1996 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1997 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1998 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1999 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
2000 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
2001 GEN_VEXT_VX(vmacc_vx_b, 1)
2002 GEN_VEXT_VX(vmacc_vx_h, 2)
2003 GEN_VEXT_VX(vmacc_vx_w, 4)
2004 GEN_VEXT_VX(vmacc_vx_d, 8)
2005 GEN_VEXT_VX(vnmsac_vx_b, 1)
2006 GEN_VEXT_VX(vnmsac_vx_h, 2)
2007 GEN_VEXT_VX(vnmsac_vx_w, 4)
2008 GEN_VEXT_VX(vnmsac_vx_d, 8)
2009 GEN_VEXT_VX(vmadd_vx_b, 1)
2010 GEN_VEXT_VX(vmadd_vx_h, 2)
2011 GEN_VEXT_VX(vmadd_vx_w, 4)
2012 GEN_VEXT_VX(vmadd_vx_d, 8)
2013 GEN_VEXT_VX(vnmsub_vx_b, 1)
2014 GEN_VEXT_VX(vnmsub_vx_h, 2)
2015 GEN_VEXT_VX(vnmsub_vx_w, 4)
2016 GEN_VEXT_VX(vnmsub_vx_d, 8)
2017 
2018 /* Vector Widening Integer Multiply-Add Instructions */
2019 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2020 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2021 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2022 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2023 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2024 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2025 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2026 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2027 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2028 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2029 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2030 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2031 GEN_VEXT_VV(vwmacc_vv_b, 2)
2032 GEN_VEXT_VV(vwmacc_vv_h, 4)
2033 GEN_VEXT_VV(vwmacc_vv_w, 8)
2034 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2035 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2036 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2037 
2038 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2039 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2040 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2041 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2042 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2043 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2044 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2045 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2046 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2047 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2048 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2049 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2050 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2051 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2052 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2053 GEN_VEXT_VX(vwmacc_vx_b, 2)
2054 GEN_VEXT_VX(vwmacc_vx_h, 4)
2055 GEN_VEXT_VX(vwmacc_vx_w, 8)
2056 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2057 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2058 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2059 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2060 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2061 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2062 
2063 /* Vector Integer Merge and Move Instructions */
2064 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2065 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2066                   uint32_t desc)                                     \
2067 {                                                                    \
2068     uint32_t vl = env->vl;                                           \
2069     uint32_t esz = sizeof(ETYPE);                                    \
2070     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2071     uint32_t vta = vext_vta(desc);                                   \
2072     uint32_t i;                                                      \
2073                                                                      \
2074     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2075                                                                      \
2076     for (i = env->vstart; i < vl; i++) {                             \
2077         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2078         *((ETYPE *)vd + H(i)) = s1;                                  \
2079     }                                                                \
2080     env->vstart = 0;                                                 \
2081     /* set tail elements to 1s */                                    \
2082     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2083 }
2084 
2085 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2086 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2087 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2088 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2089 
2090 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2091 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2092                   uint32_t desc)                                     \
2093 {                                                                    \
2094     uint32_t vl = env->vl;                                           \
2095     uint32_t esz = sizeof(ETYPE);                                    \
2096     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2097     uint32_t vta = vext_vta(desc);                                   \
2098     uint32_t i;                                                      \
2099                                                                      \
2100     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2101                                                                      \
2102     for (i = env->vstart; i < vl; i++) {                             \
2103         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2104     }                                                                \
2105     env->vstart = 0;                                                 \
2106     /* set tail elements to 1s */                                    \
2107     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2108 }
2109 
2110 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2111 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2112 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2113 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2114 
2115 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2116 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2117                   CPURISCVState *env, uint32_t desc)                 \
2118 {                                                                    \
2119     uint32_t vl = env->vl;                                           \
2120     uint32_t esz = sizeof(ETYPE);                                    \
2121     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2122     uint32_t vta = vext_vta(desc);                                   \
2123     uint32_t i;                                                      \
2124                                                                      \
2125     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2126                                                                      \
2127     for (i = env->vstart; i < vl; i++) {                             \
2128         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2129         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2130     }                                                                \
2131     env->vstart = 0;                                                 \
2132     /* set tail elements to 1s */                                    \
2133     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2134 }
2135 
2136 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2137 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2138 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2139 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2140 
2141 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2142 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2143                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2144 {                                                                    \
2145     uint32_t vl = env->vl;                                           \
2146     uint32_t esz = sizeof(ETYPE);                                    \
2147     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2148     uint32_t vta = vext_vta(desc);                                   \
2149     uint32_t i;                                                      \
2150                                                                      \
2151     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2152                                                                      \
2153     for (i = env->vstart; i < vl; i++) {                             \
2154         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2155         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2156                    (ETYPE)(target_long)s1);                          \
2157         *((ETYPE *)vd + H(i)) = d;                                   \
2158     }                                                                \
2159     env->vstart = 0;                                                 \
2160     /* set tail elements to 1s */                                    \
2161     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2162 }
2163 
2164 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2165 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2166 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2167 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2168 
2169 /*
2170  * Vector Fixed-Point Arithmetic Instructions
2171  */
2172 
2173 /* Vector Single-Width Saturating Add and Subtract */
2174 
2175 /*
2176  * Fixed-point instructions use a rounding mode and can saturate, so
2177  * define the common macros for fixed point here.
2178  */
2179 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2180                           CPURISCVState *env, int vxrm);
2181 
2182 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2183 static inline void                                                  \
2184 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2185           CPURISCVState *env, int vxrm)                             \
2186 {                                                                   \
2187     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2188     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2189     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2190 }
2191 
2192 static inline void
2193 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2194              CPURISCVState *env,
2195              uint32_t vl, uint32_t vm, int vxrm,
2196              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2197 {
2198     for (uint32_t i = env->vstart; i < vl; i++) {
2199         if (!vm && !vext_elem_mask(v0, i)) {
2200             /* set masked-off elements to 1s */
2201             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2202             continue;
2203         }
2204         fn(vd, vs1, vs2, i, env, vxrm);
2205     }
2206     env->vstart = 0;
2207 }
2208 
2209 static inline void
2210 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2211              CPURISCVState *env,
2212              uint32_t desc,
2213              opivv2_rm_fn *fn, uint32_t esz)
2214 {
2215     uint32_t vm = vext_vm(desc);
2216     uint32_t vl = env->vl;
2217     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2218     uint32_t vta = vext_vta(desc);
2219     uint32_t vma = vext_vma(desc);
2220 
2221     VSTART_CHECK_EARLY_EXIT(env, vl);
2222 
2223     switch (env->vxrm) {
2224     case 0: /* rnu */
2225         vext_vv_rm_1(vd, v0, vs1, vs2,
2226                      env, vl, vm, 0, fn, vma, esz);
2227         break;
2228     case 1: /* rne */
2229         vext_vv_rm_1(vd, v0, vs1, vs2,
2230                      env, vl, vm, 1, fn, vma, esz);
2231         break;
2232     case 2: /* rdn */
2233         vext_vv_rm_1(vd, v0, vs1, vs2,
2234                      env, vl, vm, 2, fn, vma, esz);
2235         break;
2236     default: /* rod */
2237         vext_vv_rm_1(vd, v0, vs1, vs2,
2238                      env, vl, vm, 3, fn, vma, esz);
2239         break;
2240     }
2241     /* set tail elements to 1s */
2242     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2243 }
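
/*
 * For illustration only: vxrm selects how the bits discarded by a
 * fixed-point right shift are rounded.  Taking 10 >> 2 (2.5) as an
 * example:
 *
 *     rnu (0): round to nearest, ties up         -> 3
 *     rne (1): round to nearest, ties to even    -> 2
 *     rdn (2): round down (truncate)             -> 2
 *     rod (3): round to odd (jam lost bits in)   -> 3
 */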
2244 
2245 /* generate helpers for fixed point instructions with OPIVV format */
2246 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2247 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2248                   CPURISCVState *env, uint32_t desc)            \
2249 {                                                               \
2250     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2251                  do_##NAME, ESZ);                               \
2252 }
2253 
2254 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2255                              uint8_t b)
2256 {
2257     uint8_t res = a + b;
2258     if (res < a) {
2259         res = UINT8_MAX;
2260         env->vxsat = 0x1;
2261     }
2262     return res;
2263 }
2264 
2265 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2266                                uint16_t b)
2267 {
2268     uint16_t res = a + b;
2269     if (res < a) {
2270         res = UINT16_MAX;
2271         env->vxsat = 0x1;
2272     }
2273     return res;
2274 }
2275 
2276 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2277                                uint32_t b)
2278 {
2279     uint32_t res = a + b;
2280     if (res < a) {
2281         res = UINT32_MAX;
2282         env->vxsat = 0x1;
2283     }
2284     return res;
2285 }
2286 
2287 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2288                                uint64_t b)
2289 {
2290     uint64_t res = a + b;
2291     if (res < a) {
2292         res = UINT64_MAX;
2293         env->vxsat = 0x1;
2294     }
2295     return res;
2296 }
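
/*
 * For illustration only (not one of the helpers above): unsigned
 * saturating add detects wrap-around with "res < a".  A minimal sketch
 * of the same check without the CPU state:
 */
static inline uint8_t example_saddu8_clamp(uint8_t a, uint8_t b, bool *sat)
{
    uint8_t res = a + b;    /* wraps modulo 256 */
    if (res < a) {          /* e.g. 250 + 10 wraps to 4, and 4 < 250 */
        res = UINT8_MAX;
        *sat = true;        /* the real helpers set the sticky vxsat bit */
    }
    return res;
}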
2297 
2298 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2299 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2300 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2301 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2302 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2303 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2304 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2305 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2306 
2307 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2308                           CPURISCVState *env, int vxrm);
2309 
2310 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2311 static inline void                                                  \
2312 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2313           CPURISCVState *env, int vxrm)                             \
2314 {                                                                   \
2315     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2316     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2317 }
2318 
2319 static inline void
2320 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2321              CPURISCVState *env,
2322              uint32_t vl, uint32_t vm, int vxrm,
2323              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2324 {
2325     for (uint32_t i = env->vstart; i < vl; i++) {
2326         if (!vm && !vext_elem_mask(v0, i)) {
2327             /* set masked-off elements to 1s */
2328             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2329             continue;
2330         }
2331         fn(vd, s1, vs2, i, env, vxrm);
2332     }
2333     env->vstart = 0;
2334 }
2335 
2336 static inline void
2337 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2338              CPURISCVState *env,
2339              uint32_t desc,
2340              opivx2_rm_fn *fn, uint32_t esz)
2341 {
2342     uint32_t vm = vext_vm(desc);
2343     uint32_t vl = env->vl;
2344     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2345     uint32_t vta = vext_vta(desc);
2346     uint32_t vma = vext_vma(desc);
2347 
2348     VSTART_CHECK_EARLY_EXIT(env, vl);
2349 
2350     switch (env->vxrm) {
2351     case 0: /* rnu */
2352         vext_vx_rm_1(vd, v0, s1, vs2,
2353                      env, vl, vm, 0, fn, vma, esz);
2354         break;
2355     case 1: /* rne */
2356         vext_vx_rm_1(vd, v0, s1, vs2,
2357                      env, vl, vm, 1, fn, vma, esz);
2358         break;
2359     case 2: /* rdn */
2360         vext_vx_rm_1(vd, v0, s1, vs2,
2361                      env, vl, vm, 2, fn, vma, esz);
2362         break;
2363     default: /* rod */
2364         vext_vx_rm_1(vd, v0, s1, vs2,
2365                      env, vl, vm, 3, fn, vma, esz);
2366         break;
2367     }
2368     /* set tail elements to 1s */
2369     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2370 }
2371 
2372 /* generate helpers for fixed point instructions with OPIVX format */
2373 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2374 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2375                   void *vs2, CPURISCVState *env,          \
2376                   uint32_t desc)                          \
2377 {                                                         \
2378     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2379                  do_##NAME, ESZ);                         \
2380 }
2381 
2382 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2383 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2384 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2385 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2386 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2387 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2388 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2389 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2390 
2391 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2392 {
2393     int8_t res = a + b;
2394     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2395         res = a > 0 ? INT8_MAX : INT8_MIN;
2396         env->vxsat = 0x1;
2397     }
2398     return res;
2399 }
2400 
2401 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2402                              int16_t b)
2403 {
2404     int16_t res = a + b;
2405     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2406         res = a > 0 ? INT16_MAX : INT16_MIN;
2407         env->vxsat = 0x1;
2408     }
2409     return res;
2410 }
2411 
2412 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2413                              int32_t b)
2414 {
2415     int32_t res = a + b;
2416     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2417         res = a > 0 ? INT32_MAX : INT32_MIN;
2418         env->vxsat = 0x1;
2419     }
2420     return res;
2421 }
2422 
2423 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2424                              int64_t b)
2425 {
2426     int64_t res = a + b;
2427     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2428         res = a > 0 ? INT64_MAX : INT64_MIN;
2429         env->vxsat = 0x1;
2430     }
2431     return res;
2432 }
2433 
2434 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2435 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2436 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2437 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2438 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2439 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2440 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2441 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2442 
2443 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2444 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2445 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2446 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2447 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2448 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2449 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2450 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2451 
2452 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2453                              uint8_t b)
2454 {
2455     uint8_t res = a - b;
2456     if (res > a) {
2457         res = 0;
2458         env->vxsat = 0x1;
2459     }
2460     return res;
2461 }
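
/*
 * Illustrative example: with a = 5 and b = 9 the unsigned subtraction
 * wraps to res = 252, so the res > a test detects the borrow, the result
 * is clamped to 0 and vxsat is set, which is the saturating behaviour
 * required for vssubu.
 */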
2462 
2463 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2464                                uint16_t b)
2465 {
2466     uint16_t res = a - b;
2467     if (res > a) {
2468         res = 0;
2469         env->vxsat = 0x1;
2470     }
2471     return res;
2472 }
2473 
2474 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2475                                uint32_t b)
2476 {
2477     uint32_t res = a - b;
2478     if (res > a) {
2479         res = 0;
2480         env->vxsat = 0x1;
2481     }
2482     return res;
2483 }
2484 
2485 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2486                                uint64_t b)
2487 {
2488     uint64_t res = a - b;
2489     if (res > a) {
2490         res = 0;
2491         env->vxsat = 0x1;
2492     }
2493     return res;
2494 }
2495 
2496 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2497 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2498 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2499 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2500 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2501 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2502 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2503 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2504 
2505 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2506 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2507 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2508 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2509 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2510 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2511 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2512 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2513 
2514 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2515 {
2516     int8_t res = a - b;
2517     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2518         res = a >= 0 ? INT8_MAX : INT8_MIN;
2519         env->vxsat = 0x1;
2520     }
2521     return res;
2522 }
2523 
2524 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2525                              int16_t b)
2526 {
2527     int16_t res = a - b;
2528     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2529         res = a >= 0 ? INT16_MAX : INT16_MIN;
2530         env->vxsat = 0x1;
2531     }
2532     return res;
2533 }
2534 
2535 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2536                              int32_t b)
2537 {
2538     int32_t res = a - b;
2539     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2540         res = a >= 0 ? INT32_MAX : INT32_MIN;
2541         env->vxsat = 0x1;
2542     }
2543     return res;
2544 }
2545 
2546 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2547                              int64_t b)
2548 {
2549     int64_t res = a - b;
2550     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2551         res = a >= 0 ? INT64_MAX : INT64_MIN;
2552         env->vxsat = 0x1;
2553     }
2554     return res;
2555 }
2556 
2557 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2558 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2559 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2560 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2561 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2562 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2563 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2564 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2565 
2566 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2567 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2568 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2569 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2570 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2571 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2572 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2573 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2574 
2575 /* Vector Single-Width Averaging Add and Subtract */
2576 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2577 {
2578     uint8_t d = extract64(v, shift, 1);
2579     uint8_t d1;
2580     uint64_t D1, D2;
2581 
2582     if (shift == 0 || shift > 64) {
2583         return 0;
2584     }
2585 
2586     d1 = extract64(v, shift - 1, 1);
2587     D1 = extract64(v, 0, shift);
2588     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2589         return d1;
2590     } else if (vxrm == 1) { /* round-to-nearest-even */
2591         if (shift > 1) {
2592             D2 = extract64(v, 0, shift - 1);
2593             return d1 & ((D2 != 0) | d);
2594         } else {
2595             return d1 & d;
2596         }
2597     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2598         return !d & (D1 != 0);
2599     }
2600     return 0; /* round-down (truncate) */
2601 }
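
/*
 * Worked example (illustrative): for v = 10 (0b1010) and shift = 2 the
 * exact quotient is 2.5 and the discarded bits are D1 = 0b10 (d = 0,
 * d1 = 1, D2 = 0), so:
 *   rnu (vxrm == 0): round = d1 = 1                   -> (10 >> 2) + 1 = 3
 *   rne (vxrm == 1): round = d1 & ((D2 != 0) | d) = 0 -> (10 >> 2) + 0 = 2
 *   rdn (vxrm == 2): round = 0                        -> (10 >> 2) + 0 = 2
 *   rod (vxrm == 3): round = !d & (D1 != 0) = 1       -> (10 >> 2) + 1 = 3
 */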
2602 
2603 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2604                              int32_t b)
2605 {
2606     int64_t res = (int64_t)a + b;
2607     uint8_t round = get_round(vxrm, res, 1);
2608 
2609     return (res >> 1) + round;
2610 }
2611 
2612 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2613                              int64_t b)
2614 {
2615     int64_t res = a + b;
2616     uint8_t round = get_round(vxrm, res, 1);
2617     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2618 
2619     /* With signed overflow, bit 64 is inverse of bit 63. */
2620     return ((res >> 1) ^ over) + round;
2621 }
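
/*
 * Illustrative check of the fix-up above: with a = b = INT64_MAX the
 * 64-bit sum wraps to res = -2 and 'over' becomes INT64_MIN.  The
 * arithmetic shift gives res >> 1 == -1, and XOR-ing with 'over'
 * restores the true bit 63, so the helper returns INT64_MAX, the exact
 * average of the operands (the rounding increment is 0 for every vxrm
 * in this case).
 */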
2622 
2623 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2624 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2625 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2626 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2627 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2628 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2629 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2630 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2631 
2632 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2633 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2634 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2635 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2636 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2637 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2638 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2639 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2640 
2641 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2642                                uint32_t a, uint32_t b)
2643 {
2644     uint64_t res = (uint64_t)a + b;
2645     uint8_t round = get_round(vxrm, res, 1);
2646 
2647     return (res >> 1) + round;
2648 }
2649 
2650 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2651                                uint64_t a, uint64_t b)
2652 {
2653     uint64_t res = a + b;
2654     uint8_t round = get_round(vxrm, res, 1);
2655     uint64_t over = (uint64_t)(res < a) << 63;
2656 
2657     return ((res >> 1) | over) + round;
2658 }
2659 
2660 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2661 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2662 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2663 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2664 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2665 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2666 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2667 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2668 
2669 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2670 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2671 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2672 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2673 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2674 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2675 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2676 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2677 
2678 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2679                              int32_t b)
2680 {
2681     int64_t res = (int64_t)a - b;
2682     uint8_t round = get_round(vxrm, res, 1);
2683 
2684     return (res >> 1) + round;
2685 }
2686 
2687 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2688                              int64_t b)
2689 {
2690     int64_t res = (int64_t)a - b;
2691     uint8_t round = get_round(vxrm, res, 1);
2692     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2693 
2694     /* With signed overflow, bit 64 is inverse of bit 63. */
2695     return ((res >> 1) ^ over) + round;
2696 }
2697 
2698 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2699 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2700 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2701 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2702 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2703 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2704 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2705 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2706 
2707 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2708 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2709 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2710 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2711 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2712 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2713 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2714 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2715 
2716 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2717                                uint32_t a, uint32_t b)
2718 {
2719     int64_t res = (int64_t)a - b;
2720     uint8_t round = get_round(vxrm, res, 1);
2721 
2722     return (res >> 1) + round;
2723 }
2724 
2725 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2726                                uint64_t a, uint64_t b)
2727 {
2728     uint64_t res = (uint64_t)a - b;
2729     uint8_t round = get_round(vxrm, res, 1);
2730     uint64_t over = (uint64_t)(res > a) << 63;
2731 
2732     return ((res >> 1) | over) + round;
2733 }
2734 
2735 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2736 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2737 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2738 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2739 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2740 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2741 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2742 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2743 
2744 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2745 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2746 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2747 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2748 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2749 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2750 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2751 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2752 
2753 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2754 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2755 {
2756     uint8_t round;
2757     int16_t res;
2758 
2759     res = (int16_t)a * (int16_t)b;
2760     round = get_round(vxrm, res, 7);
2761     res = (res >> 7) + round;
2762 
2763     if (res > INT8_MAX) {
2764         env->vxsat = 0x1;
2765         return INT8_MAX;
2766     } else if (res < INT8_MIN) {
2767         env->vxsat = 0x1;
2768         return INT8_MIN;
2769     } else {
2770         return res;
2771     }
2772 }
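
/*
 * Illustrative example: vsmul8 is a Q7 fixed-point multiply, and the one
 * product that cannot be represented after the >> 7 is a = b = INT8_MIN:
 * (-128) * (-128) = 16384 and 16384 >> 7 = 128, which exceeds INT8_MAX,
 * so the helper saturates to 127 and sets vxsat.  The 64-bit variant
 * below special-cases INT64_MIN * INT64_MIN up front because that is the
 * one product whose scaled result does not fit in int64_t.
 */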
2773 
2774 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2775 {
2776     uint8_t round;
2777     int32_t res;
2778 
2779     res = (int32_t)a * (int32_t)b;
2780     round = get_round(vxrm, res, 15);
2781     res = (res >> 15) + round;
2782 
2783     if (res > INT16_MAX) {
2784         env->vxsat = 0x1;
2785         return INT16_MAX;
2786     } else if (res < INT16_MIN) {
2787         env->vxsat = 0x1;
2788         return INT16_MIN;
2789     } else {
2790         return res;
2791     }
2792 }
2793 
2794 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2795 {
2796     uint8_t round;
2797     int64_t res;
2798 
2799     res = (int64_t)a * (int64_t)b;
2800     round = get_round(vxrm, res, 31);
2801     res = (res >> 31) + round;
2802 
2803     if (res > INT32_MAX) {
2804         env->vxsat = 0x1;
2805         return INT32_MAX;
2806     } else if (res < INT32_MIN) {
2807         env->vxsat = 0x1;
2808         return INT32_MIN;
2809     } else {
2810         return res;
2811     }
2812 }
2813 
2814 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2815 {
2816     uint8_t round;
2817     uint64_t hi_64, lo_64;
2818     int64_t res;
2819 
2820     if (a == INT64_MIN && b == INT64_MIN) {
2821         env->vxsat = 1;
2822         return INT64_MAX;
2823     }
2824 
2825     muls64(&lo_64, &hi_64, a, b);
2826     round = get_round(vxrm, lo_64, 63);
2827     /*
2828      * Cannot overflow, as there are always
2829      * 2 sign bits after multiply.
2830      */
2831     res = (hi_64 << 1) | (lo_64 >> 63);
2832     if (round) {
2833         if (res == INT64_MAX) {
2834             env->vxsat = 1;
2835         } else {
2836             res += 1;
2837         }
2838     }
2839     return res;
2840 }
2841 
2842 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2843 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2844 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2845 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2846 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2847 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2848 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2849 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2850 
2851 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2852 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2853 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2854 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2855 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2856 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2857 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2858 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2859 
2860 /* Vector Single-Width Scaling Shift Instructions */
2861 static inline uint8_t
2862 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2863 {
2864     uint8_t round, shift = b & 0x7;
2865     uint8_t res;
2866 
2867     round = get_round(vxrm, a, shift);
2868     res = (a >> shift) + round;
2869     return res;
2870 }
2871 static inline uint16_t
2872 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2873 {
2874     uint8_t round, shift = b & 0xf;
2875 
2876     round = get_round(vxrm, a, shift);
2877     return (a >> shift) + round;
2878 }
2879 static inline uint32_t
2880 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2881 {
2882     uint8_t round, shift = b & 0x1f;
2883 
2884     round = get_round(vxrm, a, shift);
2885     return (a >> shift) + round;
2886 }
2887 static inline uint64_t
2888 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2889 {
2890     uint8_t round, shift = b & 0x3f;
2891 
2892     round = get_round(vxrm, a, shift);
2893     return (a >> shift) + round;
2894 }
2895 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2896 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2897 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2898 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2899 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2900 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2901 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2902 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2903 
2904 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2905 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2906 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2907 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2908 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2909 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2910 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2911 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2912 
2913 static inline int8_t
2914 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2915 {
2916     uint8_t round, shift = b & 0x7;
2917 
2918     round = get_round(vxrm, a, shift);
2919     return (a >> shift) + round;
2920 }
2921 static inline int16_t
2922 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2923 {
2924     uint8_t round, shift = b & 0xf;
2925 
2926     round = get_round(vxrm, a, shift);
2927     return (a >> shift) + round;
2928 }
2929 static inline int32_t
2930 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2931 {
2932     uint8_t round, shift = b & 0x1f;
2933 
2934     round = get_round(vxrm, a, shift);
2935     return (a >> shift) + round;
2936 }
2937 static inline int64_t
2938 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2939 {
2940     uint8_t round, shift = b & 0x3f;
2941 
2942     round = get_round(vxrm, a, shift);
2943     return (a >> shift) + round;
2944 }
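
/*
 * Illustrative example: vssra8 with a = -7 and shift = 1 feeds the
 * two's-complement pattern of 'a' to get_round(), so under rnu the
 * rounding bit is 1 and the result is (-7 >> 1) + 1 = -3 (that is,
 * -3.5 with half an LSB added before truncation), while rdn simply
 * truncates to -4.
 */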
2945 
2946 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2947 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2948 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2949 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2950 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2951 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2952 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2953 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2954 
2955 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2956 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2957 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2958 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2959 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2960 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2961 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2962 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2963 
2964 /* Vector Narrowing Fixed-Point Clip Instructions */
2965 static inline int8_t
2966 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2967 {
2968     uint8_t round, shift = b & 0xf;
2969     int16_t res;
2970 
2971     round = get_round(vxrm, a, shift);
2972     res = (a >> shift) + round;
2973     if (res > INT8_MAX) {
2974         env->vxsat = 0x1;
2975         return INT8_MAX;
2976     } else if (res < INT8_MIN) {
2977         env->vxsat = 0x1;
2978         return INT8_MIN;
2979     } else {
2980         return res;
2981     }
2982 }
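
/*
 * Illustrative example: vnclip8 narrows a 16-bit source, so a = 300 with
 * a shift amount of 0 stays 300 and is clamped to INT8_MAX = 127 with
 * vxsat set, whereas a = 300 with shift = 2 yields 75, which fits in the
 * destination and is returned as is.
 */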
2983 
2984 static inline int16_t
2985 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2986 {
2987     uint8_t round, shift = b & 0x1f;
2988     int32_t res;
2989 
2990     round = get_round(vxrm, a, shift);
2991     res = (a >> shift) + round;
2992     if (res > INT16_MAX) {
2993         env->vxsat = 0x1;
2994         return INT16_MAX;
2995     } else if (res < INT16_MIN) {
2996         env->vxsat = 0x1;
2997         return INT16_MIN;
2998     } else {
2999         return res;
3000     }
3001 }
3002 
3003 static inline int32_t
3004 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
3005 {
3006     uint8_t round, shift = b & 0x3f;
3007     int64_t res;
3008 
3009     round = get_round(vxrm, a, shift);
3010     res = (a >> shift) + round;
3011     if (res > INT32_MAX) {
3012         env->vxsat = 0x1;
3013         return INT32_MAX;
3014     } else if (res < INT32_MIN) {
3015         env->vxsat = 0x1;
3016         return INT32_MIN;
3017     } else {
3018         return res;
3019     }
3020 }
3021 
3022 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3023 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3024 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3025 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3026 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3027 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3028 
3029 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3030 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3031 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3032 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3033 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3034 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3035 
3036 static inline uint8_t
3037 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3038 {
3039     uint8_t round, shift = b & 0xf;
3040     uint16_t res;
3041 
3042     round = get_round(vxrm, a, shift);
3043     res = (a >> shift) + round;
3044     if (res > UINT8_MAX) {
3045         env->vxsat = 0x1;
3046         return UINT8_MAX;
3047     } else {
3048         return res;
3049     }
3050 }
3051 
3052 static inline uint16_t
3053 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3054 {
3055     uint8_t round, shift = b & 0x1f;
3056     uint32_t res;
3057 
3058     round = get_round(vxrm, a, shift);
3059     res = (a >> shift) + round;
3060     if (res > UINT16_MAX) {
3061         env->vxsat = 0x1;
3062         return UINT16_MAX;
3063     } else {
3064         return res;
3065     }
3066 }
3067 
3068 static inline uint32_t
3069 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3070 {
3071     uint8_t round, shift = b & 0x3f;
3072     uint64_t res;
3073 
3074     round = get_round(vxrm, a, shift);
3075     res = (a >> shift) + round;
3076     if (res > UINT32_MAX) {
3077         env->vxsat = 0x1;
3078         return UINT32_MAX;
3079     } else {
3080         return res;
3081     }
3082 }
3083 
3084 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3085 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3086 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3087 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3088 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3089 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3090 
3091 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3092 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3093 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3094 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3095 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3096 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3097 
3098 /*
3099  * Vector Floating-Point Arithmetic Instructions
3100  */
3101 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3102 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3103 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3104                       CPURISCVState *env)                      \
3105 {                                                              \
3106     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3107     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3108     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3109 }
3110 
3111 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3112 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3113                   void *vs2, CPURISCVState *env,          \
3114                   uint32_t desc)                          \
3115 {                                                         \
3116     uint32_t vm = vext_vm(desc);                          \
3117     uint32_t vl = env->vl;                                \
3118     uint32_t total_elems =                                \
3119         vext_get_total_elems(env, desc, ESZ);             \
3120     uint32_t vta = vext_vta(desc);                        \
3121     uint32_t vma = vext_vma(desc);                        \
3122     uint32_t i;                                           \
3123                                                           \
3124     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3125                                                           \
3126     for (i = env->vstart; i < vl; i++) {                  \
3127         if (!vm && !vext_elem_mask(v0, i)) {              \
3128             /* set masked-off elements to 1s */           \
3129             vext_set_elems_1s(vd, vma, i * ESZ,           \
3130                               (i + 1) * ESZ);             \
3131             continue;                                     \
3132         }                                                 \
3133         do_##NAME(vd, vs1, vs2, i, env);                  \
3134     }                                                     \
3135     env->vstart = 0;                                      \
3136     /* set tail elements to 1s */                         \
3137     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3138                       total_elems * ESZ);                 \
3139 }
3140 
3141 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3142 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3143 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3144 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3145 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3146 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
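
/*
 * Illustrative sketch (hand expansion, not from the original source):
 * RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add) above
 * produces roughly the per-element callback
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * and GEN_VEXT_VV_ENV(vfadd_vv_h, 2) wraps it in the common masked
 * element loop with tail handling.
 */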
3147 
3148 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3149 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3150                       CPURISCVState *env)                      \
3151 {                                                              \
3152     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3153     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3154 }
3155 
3156 #define GEN_VEXT_VF(NAME, ESZ)                            \
3157 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3158                   void *vs2, CPURISCVState *env,          \
3159                   uint32_t desc)                          \
3160 {                                                         \
3161     uint32_t vm = vext_vm(desc);                          \
3162     uint32_t vl = env->vl;                                \
3163     uint32_t total_elems =                                \
3164         vext_get_total_elems(env, desc, ESZ);             \
3165     uint32_t vta = vext_vta(desc);                        \
3166     uint32_t vma = vext_vma(desc);                        \
3167     uint32_t i;                                           \
3168                                                           \
3169     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3170                                                           \
3171     for (i = env->vstart; i < vl; i++) {                  \
3172         if (!vm && !vext_elem_mask(v0, i)) {              \
3173             /* set masked-off elements to 1s */           \
3174             vext_set_elems_1s(vd, vma, i * ESZ,           \
3175                               (i + 1) * ESZ);             \
3176             continue;                                     \
3177         }                                                 \
3178         do_##NAME(vd, s1, vs2, i, env);                   \
3179     }                                                     \
3180     env->vstart = 0;                                      \
3181     /* set tail elements to 1s */                         \
3182     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3183                       total_elems * ESZ);                 \
3184 }
3185 
3186 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3187 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3188 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3189 GEN_VEXT_VF(vfadd_vf_h, 2)
3190 GEN_VEXT_VF(vfadd_vf_w, 4)
3191 GEN_VEXT_VF(vfadd_vf_d, 8)
3192 
3193 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3194 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3195 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3196 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3197 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3198 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3199 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3200 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3201 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3202 GEN_VEXT_VF(vfsub_vf_h, 2)
3203 GEN_VEXT_VF(vfsub_vf_w, 4)
3204 GEN_VEXT_VF(vfsub_vf_d, 8)
3205 
3206 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3207 {
3208     return float16_sub(b, a, s);
3209 }
3210 
3211 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3212 {
3213     return float32_sub(b, a, s);
3214 }
3215 
3216 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3217 {
3218     return float64_sub(b, a, s);
3219 }
3220 
3221 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3222 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3223 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3224 GEN_VEXT_VF(vfrsub_vf_h, 2)
3225 GEN_VEXT_VF(vfrsub_vf_w, 4)
3226 GEN_VEXT_VF(vfrsub_vf_d, 8)
3227 
3228 /* Vector Widening Floating-Point Add/Subtract Instructions */
3229 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3230 {
3231     return float32_add(float16_to_float32(a, true, s),
3232                        float16_to_float32(b, true, s), s);
3233 }
3234 
3235 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3236 {
3237     return float64_add(float32_to_float64(a, s),
3238                        float32_to_float64(b, s), s);
3239 
3240 }
3241 
3242 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3243 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3244 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3245 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3246 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3247 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3248 GEN_VEXT_VF(vfwadd_vf_h, 4)
3249 GEN_VEXT_VF(vfwadd_vf_w, 8)
3250 
3251 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3252 {
3253     return float32_sub(float16_to_float32(a, true, s),
3254                        float16_to_float32(b, true, s), s);
3255 }
3256 
3257 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3258 {
3259     return float64_sub(float32_to_float64(a, s),
3260                        float32_to_float64(b, s), s);
3261 
3262 }
3263 
3264 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3265 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3266 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3267 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3268 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3269 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3270 GEN_VEXT_VF(vfwsub_vf_h, 4)
3271 GEN_VEXT_VF(vfwsub_vf_w, 8)
3272 
3273 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3274 {
3275     return float32_add(a, float16_to_float32(b, true, s), s);
3276 }
3277 
3278 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3279 {
3280     return float64_add(a, float32_to_float64(b, s), s);
3281 }
3282 
3283 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3284 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3285 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3286 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3287 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3288 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3289 GEN_VEXT_VF(vfwadd_wf_h, 4)
3290 GEN_VEXT_VF(vfwadd_wf_w, 8)
3291 
3292 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3293 {
3294     return float32_sub(a, float16_to_float32(b, true, s), s);
3295 }
3296 
3297 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3298 {
3299     return float64_sub(a, float32_to_float64(b, s), s);
3300 }
3301 
3302 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3303 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3304 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3305 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3306 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3307 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3308 GEN_VEXT_VF(vfwsub_wf_h, 4)
3309 GEN_VEXT_VF(vfwsub_wf_w, 8)
3310 
3311 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3312 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3313 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3314 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3315 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3316 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3317 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3318 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3319 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3320 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3321 GEN_VEXT_VF(vfmul_vf_h, 2)
3322 GEN_VEXT_VF(vfmul_vf_w, 4)
3323 GEN_VEXT_VF(vfmul_vf_d, 8)
3324 
3325 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3326 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3327 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3328 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3329 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3330 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3331 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3332 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3333 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3334 GEN_VEXT_VF(vfdiv_vf_h, 2)
3335 GEN_VEXT_VF(vfdiv_vf_w, 4)
3336 GEN_VEXT_VF(vfdiv_vf_d, 8)
3337 
3338 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3339 {
3340     return float16_div(b, a, s);
3341 }
3342 
3343 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3344 {
3345     return float32_div(b, a, s);
3346 }
3347 
3348 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3349 {
3350     return float64_div(b, a, s);
3351 }
3352 
3353 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3354 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3355 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3356 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3357 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3358 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3359 
3360 /* Vector Widening Floating-Point Multiply */
3361 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3362 {
3363     return float32_mul(float16_to_float32(a, true, s),
3364                        float16_to_float32(b, true, s), s);
3365 }
3366 
3367 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3368 {
3369     return float64_mul(float32_to_float64(a, s),
3370                        float32_to_float64(b, s), s);
3371 
3372 }
3373 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3374 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3375 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3376 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3377 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3378 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3379 GEN_VEXT_VF(vfwmul_vf_h, 4)
3380 GEN_VEXT_VF(vfwmul_vf_w, 8)
3381 
3382 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3383 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3384 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3385                       CPURISCVState *env)                          \
3386 {                                                                  \
3387     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3388     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3389     TD d = *((TD *)vd + HD(i));                                    \
3390     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3391 }
3392 
3393 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3394 {
3395     return float16_muladd(a, b, d, 0, s);
3396 }
3397 
3398 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3399 {
3400     return float32_muladd(a, b, d, 0, s);
3401 }
3402 
3403 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3404 {
3405     return float64_muladd(a, b, d, 0, s);
3406 }
3407 
3408 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3409 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3410 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3411 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3412 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3413 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3414 
3415 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3416 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3417                       CPURISCVState *env)                         \
3418 {                                                                 \
3419     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3420     TD d = *((TD *)vd + HD(i));                                   \
3421     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3422 }
3423 
3424 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3425 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3426 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3427 GEN_VEXT_VF(vfmacc_vf_h, 2)
3428 GEN_VEXT_VF(vfmacc_vf_w, 4)
3429 GEN_VEXT_VF(vfmacc_vf_d, 8)
3430 
3431 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3432 {
3433     return float16_muladd(a, b, d, float_muladd_negate_c |
3434                                    float_muladd_negate_product, s);
3435 }
3436 
3437 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3438 {
3439     return float32_muladd(a, b, d, float_muladd_negate_c |
3440                                    float_muladd_negate_product, s);
3441 }
3442 
3443 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3444 {
3445     return float64_muladd(a, b, d, float_muladd_negate_c |
3446                                    float_muladd_negate_product, s);
3447 }
3448 
3449 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3450 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3451 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3452 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3453 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3454 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3455 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3456 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3457 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3458 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3459 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3460 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3461 
3462 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3463 {
3464     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3465 }
3466 
3467 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3468 {
3469     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3470 }
3471 
3472 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3473 {
3474     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3475 }
3476 
3477 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3478 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3479 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3480 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3481 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3482 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3483 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3484 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3485 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3486 GEN_VEXT_VF(vfmsac_vf_h, 2)
3487 GEN_VEXT_VF(vfmsac_vf_w, 4)
3488 GEN_VEXT_VF(vfmsac_vf_d, 8)
3489 
3490 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3491 {
3492     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3493 }
3494 
3495 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3496 {
3497     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3498 }
3499 
3500 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3501 {
3502     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3503 }
3504 
3505 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3506 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3507 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3508 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3509 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3510 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3511 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3512 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3513 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3514 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3515 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3516 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3517 
3518 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3519 {
3520     return float16_muladd(d, b, a, 0, s);
3521 }
3522 
3523 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3524 {
3525     return float32_muladd(d, b, a, 0, s);
3526 }
3527 
3528 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3529 {
3530     return float64_muladd(d, b, a, 0, s);
3531 }
3532 
3533 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3534 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3535 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3536 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3537 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3538 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3539 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3540 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3541 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3542 GEN_VEXT_VF(vfmadd_vf_h, 2)
3543 GEN_VEXT_VF(vfmadd_vf_w, 4)
3544 GEN_VEXT_VF(vfmadd_vf_d, 8)
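
/*
 * Note on operand order (illustrative): with the OP(s2, s1, d) calling
 * convention used by OPFVV3/OPFVF3, the scalar helpers above give
 *   fmacc16(s2, s1, d) = float16_muladd(s2, s1, d, 0, s)
 *                        -> vd = vs1 * vs2 + vd
 *   fmadd16(s2, s1, d) = float16_muladd(d, s1, s2, 0, s)
 *                        -> vd = vs1 * vd + vs2
 * which matches the vfmacc/vfmadd definitions in the vector specification.
 */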
3545 
3546 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3547 {
3548     return float16_muladd(d, b, a, float_muladd_negate_c |
3549                                    float_muladd_negate_product, s);
3550 }
3551 
3552 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3553 {
3554     return float32_muladd(d, b, a, float_muladd_negate_c |
3555                                    float_muladd_negate_product, s);
3556 }
3557 
3558 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3559 {
3560     return float64_muladd(d, b, a, float_muladd_negate_c |
3561                                    float_muladd_negate_product, s);
3562 }
3563 
3564 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3565 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3566 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3567 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3568 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3569 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3570 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3571 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3572 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3573 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3574 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3575 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3576 
3577 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3578 {
3579     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3580 }
3581 
3582 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3583 {
3584     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3585 }
3586 
3587 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3588 {
3589     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3590 }
3591 
3592 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3593 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3594 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3595 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3596 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3597 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3598 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3599 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3600 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3601 GEN_VEXT_VF(vfmsub_vf_h, 2)
3602 GEN_VEXT_VF(vfmsub_vf_w, 4)
3603 GEN_VEXT_VF(vfmsub_vf_d, 8)
3604 
3605 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3606 {
3607     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3608 }
3609 
3610 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3611 {
3612     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3613 }
3614 
3615 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3616 {
3617     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3618 }
3619 
3620 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3621 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3622 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3623 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3624 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3625 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3626 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3627 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3628 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3629 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3630 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3631 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3632 
3633 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3634 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3635 {
3636     return float32_muladd(float16_to_float32(a, true, s),
3637                           float16_to_float32(b, true, s), d, 0, s);
3638 }
3639 
3640 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3641 {
3642     return float64_muladd(float32_to_float64(a, s),
3643                           float32_to_float64(b, s), d, 0, s);
3644 }
3645 
3646 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3647 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3648 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3649 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3650 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3651 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3652 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3653 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3654 
3655 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3656 {
3657     return float32_muladd(bfloat16_to_float32(a, s),
3658                           bfloat16_to_float32(b, s), d, 0, s);
3659 }
3660 
3661 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3662 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3663 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3664 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3665 
3666 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3667 {
3668     return float32_muladd(float16_to_float32(a, true, s),
3669                           float16_to_float32(b, true, s), d,
3670                           float_muladd_negate_c | float_muladd_negate_product,
3671                           s);
3672 }
3673 
3674 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3675 {
3676     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3677                           d, float_muladd_negate_c |
3678                              float_muladd_negate_product, s);
3679 }
3680 
3681 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3682 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3683 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3684 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3685 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3686 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3687 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3688 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3689 
3690 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3691 {
3692     return float32_muladd(float16_to_float32(a, true, s),
3693                           float16_to_float32(b, true, s), d,
3694                           float_muladd_negate_c, s);
3695 }
3696 
3697 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3698 {
3699     return float64_muladd(float32_to_float64(a, s),
3700                           float32_to_float64(b, s), d,
3701                           float_muladd_negate_c, s);
3702 }
3703 
3704 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3705 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3706 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3707 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3708 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3709 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3710 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3711 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3712 
3713 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3714 {
3715     return float32_muladd(float16_to_float32(a, true, s),
3716                           float16_to_float32(b, true, s), d,
3717                           float_muladd_negate_product, s);
3718 }
3719 
3720 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3721 {
3722     return float64_muladd(float32_to_float64(a, s),
3723                           float32_to_float64(b, s), d,
3724                           float_muladd_negate_product, s);
3725 }
3726 
3727 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3728 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3729 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3730 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3731 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3732 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3733 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3734 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3735 
3736 /* Vector Floating-Point Square-Root Instruction */
3737 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3738 static void do_##NAME(void *vd, void *vs2, int i,      \
3739                       CPURISCVState *env)              \
3740 {                                                      \
3741     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3742     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3743 }
3744 
3745 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3746 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3747                   CPURISCVState *env, uint32_t desc)   \
3748 {                                                      \
3749     uint32_t vm = vext_vm(desc);                       \
3750     uint32_t vl = env->vl;                             \
3751     uint32_t total_elems =                             \
3752         vext_get_total_elems(env, desc, ESZ);          \
3753     uint32_t vta = vext_vta(desc);                     \
3754     uint32_t vma = vext_vma(desc);                     \
3755     uint32_t i;                                        \
3756                                                        \
3757     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3758                                                        \
3759     if (vl == 0) {                                     \
3760         return;                                        \
3761     }                                                  \
3762     for (i = env->vstart; i < vl; i++) {               \
3763         if (!vm && !vext_elem_mask(v0, i)) {           \
3764             /* set masked-off elements to 1s */        \
3765             vext_set_elems_1s(vd, vma, i * ESZ,        \
3766                               (i + 1) * ESZ);          \
3767             continue;                                  \
3768         }                                              \
3769         do_##NAME(vd, vs2, i, env);                    \
3770     }                                                  \
3771     env->vstart = 0;                                   \
3772     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3773                       total_elems * ESZ);              \
3774 }
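
/*
 * OPFVV1 defines the per-element body do_<NAME>() of a unary FP operation,
 * and GEN_VEXT_V_ENV wraps it in the usual vector loop: elements below
 * env->vstart are skipped, masked-off elements follow the mask-agnostic
 * policy (vext_set_elems_1s with vma), and tail elements past vl follow the
 * tail-agnostic policy (vext_set_elems_1s with vta).
 */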
3775 
3776 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3777 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3778 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3779 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3780 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3781 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3782 
3783 /*
3784  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3785  *
3786  * Adapted from riscv-v-spec recip.c:
3787  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3788  */
3789 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3790 {
3791     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3792     uint64_t exp = extract64(f, frac_size, exp_size);
3793     uint64_t frac = extract64(f, 0, frac_size);
3794 
3795     const uint8_t lookup_table[] = {
3796         52, 51, 50, 48, 47, 46, 44, 43,
3797         42, 41, 40, 39, 38, 36, 35, 34,
3798         33, 32, 31, 30, 30, 29, 28, 27,
3799         26, 25, 24, 23, 23, 22, 21, 20,
3800         19, 19, 18, 17, 16, 16, 15, 14,
3801         14, 13, 12, 12, 11, 10, 10, 9,
3802         9, 8, 7, 7, 6, 6, 5, 4,
3803         4, 3, 3, 2, 2, 1, 1, 0,
3804         127, 125, 123, 121, 119, 118, 116, 114,
3805         113, 111, 109, 108, 106, 105, 103, 102,
3806         100, 99, 97, 96, 95, 93, 92, 91,
3807         90, 88, 87, 86, 85, 84, 83, 82,
3808         80, 79, 78, 77, 76, 75, 74, 73,
3809         72, 71, 70, 70, 69, 68, 67, 66,
3810         65, 64, 63, 63, 62, 61, 60, 59,
3811         59, 58, 57, 56, 56, 55, 54, 53
3812     };
3813     const int precision = 7;
3814 
3815     if (exp == 0 && frac != 0) { /* subnormal */
3816         /* Normalize the subnormal. */
3817         while (extract64(frac, frac_size - 1, 1) == 0) {
3818             exp--;
3819             frac <<= 1;
3820         }
3821 
3822         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3823     }
3824 
3825     int idx = ((exp & 1) << (precision - 1)) |
3826               (frac >> (frac_size - precision + 1));
3827     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3828                         (frac_size - precision);
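    /*
     * The estimate's exponent below is floor((3 * B - 1 - exp) / 2), where
     * B = MAKE_64BIT_MASK(0, exp_size - 1) is the exponent bias and ~exp is
     * -exp - 1 in two's complement.  For example, a float32 input of 1.0
     * (B = 127, exp = 127, frac = 0) gives idx = 64, out_frac = 127 << 16
     * and out_exp = 126, i.e. an estimate of 0.99609375.
     */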
3829     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3830 
3831     uint64_t val = 0;
3832     val = deposit64(val, 0, frac_size, out_frac);
3833     val = deposit64(val, frac_size, exp_size, out_exp);
3834     val = deposit64(val, frac_size + exp_size, 1, sign);
3835     return val;
3836 }
3837 
3838 static float16 frsqrt7_h(float16 f, float_status *s)
3839 {
3840     int exp_size = 5, frac_size = 10;
3841     bool sign = float16_is_neg(f);
3842 
3843     /*
3844      * frsqrt7(sNaN) = canonical NaN
3845      * frsqrt7(-inf) = canonical NaN
3846      * frsqrt7(-normal) = canonical NaN
3847      * frsqrt7(-subnormal) = canonical NaN
3848      */
3849     if (float16_is_signaling_nan(f, s) ||
3850         (float16_is_infinity(f) && sign) ||
3851         (float16_is_normal(f) && sign) ||
3852         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3853         s->float_exception_flags |= float_flag_invalid;
3854         return float16_default_nan(s);
3855     }
3856 
3857     /* frsqrt7(qNaN) = canonical NaN */
3858     if (float16_is_quiet_nan(f, s)) {
3859         return float16_default_nan(s);
3860     }
3861 
3862     /* frsqrt7(+-0) = +-inf */
3863     if (float16_is_zero(f)) {
3864         s->float_exception_flags |= float_flag_divbyzero;
3865         return float16_set_sign(float16_infinity, sign);
3866     }
3867 
3868     /* frsqrt7(+inf) = +0 */
3869     if (float16_is_infinity(f) && !sign) {
3870         return float16_set_sign(float16_zero, sign);
3871     }
3872 
3873     /* +normal, +subnormal */
3874     uint64_t val = frsqrt7(f, exp_size, frac_size);
3875     return make_float16(val);
3876 }
3877 
3878 static float32 frsqrt7_s(float32 f, float_status *s)
3879 {
3880     int exp_size = 8, frac_size = 23;
3881     bool sign = float32_is_neg(f);
3882 
3883     /*
3884      * frsqrt7(sNaN) = canonical NaN
3885      * frsqrt7(-inf) = canonical NaN
3886      * frsqrt7(-normal) = canonical NaN
3887      * frsqrt7(-subnormal) = canonical NaN
3888      */
3889     if (float32_is_signaling_nan(f, s) ||
3890         (float32_is_infinity(f) && sign) ||
3891         (float32_is_normal(f) && sign) ||
3892         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3893         s->float_exception_flags |= float_flag_invalid;
3894         return float32_default_nan(s);
3895     }
3896 
3897     /* frsqrt7(qNaN) = canonical NaN */
3898     if (float32_is_quiet_nan(f, s)) {
3899         return float32_default_nan(s);
3900     }
3901 
3902     /* frsqrt7(+-0) = +-inf */
3903     if (float32_is_zero(f)) {
3904         s->float_exception_flags |= float_flag_divbyzero;
3905         return float32_set_sign(float32_infinity, sign);
3906     }
3907 
3908     /* frsqrt7(+inf) = +0 */
3909     if (float32_is_infinity(f) && !sign) {
3910         return float32_set_sign(float32_zero, sign);
3911     }
3912 
3913     /* +normal, +subnormal */
3914     uint64_t val = frsqrt7(f, exp_size, frac_size);
3915     return make_float32(val);
3916 }
3917 
3918 static float64 frsqrt7_d(float64 f, float_status *s)
3919 {
3920     int exp_size = 11, frac_size = 52;
3921     bool sign = float64_is_neg(f);
3922 
3923     /*
3924      * frsqrt7(sNaN) = canonical NaN
3925      * frsqrt7(-inf) = canonical NaN
3926      * frsqrt7(-normal) = canonical NaN
3927      * frsqrt7(-subnormal) = canonical NaN
3928      */
3929     if (float64_is_signaling_nan(f, s) ||
3930         (float64_is_infinity(f) && sign) ||
3931         (float64_is_normal(f) && sign) ||
3932         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3933         s->float_exception_flags |= float_flag_invalid;
3934         return float64_default_nan(s);
3935     }
3936 
3937     /* frsqrt7(qNaN) = canonical NaN */
3938     if (float64_is_quiet_nan(f, s)) {
3939         return float64_default_nan(s);
3940     }
3941 
3942     /* frsqrt7(+-0) = +-inf */
3943     if (float64_is_zero(f)) {
3944         s->float_exception_flags |= float_flag_divbyzero;
3945         return float64_set_sign(float64_infinity, sign);
3946     }
3947 
3948     /* frsqrt7(+inf) = +0 */
3949     if (float64_is_infinity(f) && !sign) {
3950         return float64_set_sign(float64_zero, sign);
3951     }
3952 
3953     /* +normal, +subnormal */
3954     uint64_t val = frsqrt7(f, exp_size, frac_size);
3955     return make_float64(val);
3956 }
3957 
3958 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3959 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3960 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3961 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3962 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3963 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3964 
3965 /*
3966  * Vector Floating-Point Reciprocal Estimate Instruction
3967  *
3968  * Adapted from riscv-v-spec recip.c:
3969  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3970  */
3971 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3972                       float_status *s)
3973 {
3974     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3975     uint64_t exp = extract64(f, frac_size, exp_size);
3976     uint64_t frac = extract64(f, 0, frac_size);
3977 
3978     const uint8_t lookup_table[] = {
3979         127, 125, 123, 121, 119, 117, 116, 114,
3980         112, 110, 109, 107, 105, 104, 102, 100,
3981         99, 97, 96, 94, 93, 91, 90, 88,
3982         87, 85, 84, 83, 81, 80, 79, 77,
3983         76, 75, 74, 72, 71, 70, 69, 68,
3984         66, 65, 64, 63, 62, 61, 60, 59,
3985         58, 57, 56, 55, 54, 53, 52, 51,
3986         50, 49, 48, 47, 46, 45, 44, 43,
3987         42, 41, 40, 40, 39, 38, 37, 36,
3988         35, 35, 34, 33, 32, 31, 31, 30,
3989         29, 28, 28, 27, 26, 25, 25, 24,
3990         23, 23, 22, 21, 21, 20, 19, 19,
3991         18, 17, 17, 16, 15, 15, 14, 14,
3992         13, 12, 12, 11, 11, 10, 9, 9,
3993         8, 8, 7, 7, 6, 5, 5, 4,
3994         4, 3, 3, 2, 2, 1, 1, 0
3995     };
3996     const int precision = 7;
3997 
3998     if (exp == 0 && frac != 0) { /* subnormal */
3999         /* Normalize the subnormal. */
4000         while (extract64(frac, frac_size - 1, 1) == 0) {
4001             exp--;
4002             frac <<= 1;
4003         }
4004 
4005         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
4006 
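        /*
         * exp is now 0 minus the number of shifts needed to normalize the
         * subnormal.  With exp == 0 or exp == -1 (UINT64_MAX) the output
         * exponent 2 * B - 1 - exp still fits in the finite range; anything
         * smaller means the reciprocal overflows, handled below.
         */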
4007         if (exp != 0 && exp != UINT64_MAX) {
4008             /*
4009              * Overflow to inf or max value of same sign,
4010              * depending on sign and rounding mode.
4011              */
4012             s->float_exception_flags |= (float_flag_inexact |
4013                                          float_flag_overflow);
4014 
4015             if ((s->float_rounding_mode == float_round_to_zero) ||
4016                 ((s->float_rounding_mode == float_round_down) && !sign) ||
4017                 ((s->float_rounding_mode == float_round_up) && sign)) {
4018                 /* Return greatest/negative finite value. */
4019                 return (sign << (exp_size + frac_size)) |
4020                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4021             } else {
4022                 /* Return +-inf. */
4023                 return (sign << (exp_size + frac_size)) |
4024                        MAKE_64BIT_MASK(frac_size, exp_size);
4025             }
4026         }
4027     }
4028 
4029     int idx = frac >> (frac_size - precision);
4030     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4031                         (frac_size - precision);
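    /*
     * The estimate's exponent is 2 * B - 1 - exp, where B is the exponent
     * bias MAKE_64BIT_MASK(0, exp_size - 1) and ~exp is -exp - 1 in two's
     * complement.
     */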
4032     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4033 
4034     if (out_exp == 0 || out_exp == UINT64_MAX) {
4035         /*
4036          * The result is subnormal, but don't raise the underflow exception,
4037          * because there's no additional loss of precision.
4038          */
4039         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4040         if (out_exp == UINT64_MAX) {
4041             out_frac >>= 1;
4042             out_exp = 0;
4043         }
4044     }
4045 
4046     uint64_t val = 0;
4047     val = deposit64(val, 0, frac_size, out_frac);
4048     val = deposit64(val, frac_size, exp_size, out_exp);
4049     val = deposit64(val, frac_size + exp_size, 1, sign);
4050     return val;
4051 }
4052 
4053 static float16 frec7_h(float16 f, float_status *s)
4054 {
4055     int exp_size = 5, frac_size = 10;
4056     bool sign = float16_is_neg(f);
4057 
4058     /* frec7(+-inf) = +-0 */
4059     if (float16_is_infinity(f)) {
4060         return float16_set_sign(float16_zero, sign);
4061     }
4062 
4063     /* frec7(+-0) = +-inf */
4064     if (float16_is_zero(f)) {
4065         s->float_exception_flags |= float_flag_divbyzero;
4066         return float16_set_sign(float16_infinity, sign);
4067     }
4068 
4069     /* frec7(sNaN) = canonical NaN */
4070     if (float16_is_signaling_nan(f, s)) {
4071         s->float_exception_flags |= float_flag_invalid;
4072         return float16_default_nan(s);
4073     }
4074 
4075     /* frec7(qNaN) = canonical NaN */
4076     if (float16_is_quiet_nan(f, s)) {
4077         return float16_default_nan(s);
4078     }
4079 
4080     /* +-normal, +-subnormal */
4081     uint64_t val = frec7(f, exp_size, frac_size, s);
4082     return make_float16(val);
4083 }
4084 
4085 static float32 frec7_s(float32 f, float_status *s)
4086 {
4087     int exp_size = 8, frac_size = 23;
4088     bool sign = float32_is_neg(f);
4089 
4090     /* frec7(+-inf) = +-0 */
4091     if (float32_is_infinity(f)) {
4092         return float32_set_sign(float32_zero, sign);
4093     }
4094 
4095     /* frec7(+-0) = +-inf */
4096     if (float32_is_zero(f)) {
4097         s->float_exception_flags |= float_flag_divbyzero;
4098         return float32_set_sign(float32_infinity, sign);
4099     }
4100 
4101     /* frec7(sNaN) = canonical NaN */
4102     if (float32_is_signaling_nan(f, s)) {
4103         s->float_exception_flags |= float_flag_invalid;
4104         return float32_default_nan(s);
4105     }
4106 
4107     /* frec7(qNaN) = canonical NaN */
4108     if (float32_is_quiet_nan(f, s)) {
4109         return float32_default_nan(s);
4110     }
4111 
4112     /* +-normal, +-subnormal */
4113     uint64_t val = frec7(f, exp_size, frac_size, s);
4114     return make_float32(val);
4115 }
4116 
4117 static float64 frec7_d(float64 f, float_status *s)
4118 {
4119     int exp_size = 11, frac_size = 52;
4120     bool sign = float64_is_neg(f);
4121 
4122     /* frec7(+-inf) = +-0 */
4123     if (float64_is_infinity(f)) {
4124         return float64_set_sign(float64_zero, sign);
4125     }
4126 
4127     /* frec7(+-0) = +-inf */
4128     if (float64_is_zero(f)) {
4129         s->float_exception_flags |= float_flag_divbyzero;
4130         return float64_set_sign(float64_infinity, sign);
4131     }
4132 
4133     /* frec7(sNaN) = canonical NaN */
4134     if (float64_is_signaling_nan(f, s)) {
4135         s->float_exception_flags |= float_flag_invalid;
4136         return float64_default_nan(s);
4137     }
4138 
4139     /* frec7(qNaN) = canonical NaN */
4140     if (float64_is_quiet_nan(f, s)) {
4141         return float64_default_nan(s);
4142     }
4143 
4144     /* +-normal, +-subnormal */
4145     uint64_t val = frec7(f, exp_size, frac_size, s);
4146     return make_float64(val);
4147 }
4148 
4149 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4150 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4151 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4152 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4153 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4154 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4155 
4156 /* Vector Floating-Point MIN/MAX Instructions */
4157 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4158 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4159 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4160 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4161 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4162 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4163 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4164 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4165 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4166 GEN_VEXT_VF(vfmin_vf_h, 2)
4167 GEN_VEXT_VF(vfmin_vf_w, 4)
4168 GEN_VEXT_VF(vfmin_vf_d, 8)
4169 
4170 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4171 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4172 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4173 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4174 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4175 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4176 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4177 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4178 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4179 GEN_VEXT_VF(vfmax_vf_h, 2)
4180 GEN_VEXT_VF(vfmax_vf_w, 4)
4181 GEN_VEXT_VF(vfmax_vf_d, 8)
4182 
4183 /* Vector Floating-Point Sign-Injection Instructions */
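/*
 * Each helper builds the result from the magnitude bits of operand a and a
 * sign derived from operand b: fsgnj copies b's sign, fsgnjn uses b's
 * inverted sign, and fsgnjx uses the XOR of both sign bits.  The deposit64()
 * call keeps that sign bit and overwrites the low SEW-1 bits with a's
 * magnitude.
 */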
4184 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4185 {
4186     return deposit64(b, 0, 15, a);
4187 }
4188 
4189 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4190 {
4191     return deposit64(b, 0, 31, a);
4192 }
4193 
4194 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4195 {
4196     return deposit64(b, 0, 63, a);
4197 }
4198 
4199 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4200 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4201 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4202 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4203 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4204 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4205 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4206 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4207 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4208 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4209 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4210 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4211 
4212 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4213 {
4214     return deposit64(~b, 0, 15, a);
4215 }
4216 
4217 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4218 {
4219     return deposit64(~b, 0, 31, a);
4220 }
4221 
4222 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4223 {
4224     return deposit64(~b, 0, 63, a);
4225 }
4226 
4227 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4228 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4229 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4230 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4231 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4232 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4233 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4234 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4235 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4236 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4237 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4238 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4239 
4240 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4241 {
4242     return deposit64(b ^ a, 0, 15, a);
4243 }
4244 
4245 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4246 {
4247     return deposit64(b ^ a, 0, 31, a);
4248 }
4249 
4250 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4251 {
4252     return deposit64(b ^ a, 0, 63, a);
4253 }
4254 
4255 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4256 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4257 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4258 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4259 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4260 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4261 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4262 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4263 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4264 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4265 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4266 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4267 
4268 /* Vector Floating-Point Compare Instructions */
4269 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4270 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4271                   CPURISCVState *env, uint32_t desc)          \
4272 {                                                             \
4273     uint32_t vm = vext_vm(desc);                              \
4274     uint32_t vl = env->vl;                                    \
4275     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4276     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4277     uint32_t vma = vext_vma(desc);                            \
4278     uint32_t i;                                               \
4279                                                               \
4280     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4281                                                               \
4282     for (i = env->vstart; i < vl; i++) {                      \
4283         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4284         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4285         if (!vm && !vext_elem_mask(v0, i)) {                  \
4286             /* set masked-off elements to 1s */               \
4287             if (vma) {                                        \
4288                 vext_set_elem_mask(vd, i, 1);                 \
4289             }                                                 \
4290             continue;                                         \
4291         }                                                     \
4292         vext_set_elem_mask(vd, i,                             \
4293                            DO_OP(s2, s1, &env->fp_status));   \
4294     }                                                         \
4295     env->vstart = 0;                                          \
4296     /*
4297      * mask destination register is always tail-agnostic
4298      * set tail elements to 1s
4299      */                                                       \
4300     if (vta_all_1s) {                                         \
4301         for (; i < total_elems; i++) {                        \
4302             vext_set_elem_mask(vd, i, 1);                     \
4303         }                                                     \
4304     }                                                         \
4305 }
4306 
4307 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4308 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4309 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4310 
4311 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4312 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4313                   CPURISCVState *env, uint32_t desc)                \
4314 {                                                                   \
4315     uint32_t vm = vext_vm(desc);                                    \
4316     uint32_t vl = env->vl;                                          \
4317     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4318     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4319     uint32_t vma = vext_vma(desc);                                  \
4320     uint32_t i;                                                     \
4321                                                                     \
4322     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4323                                                                     \
4324     for (i = env->vstart; i < vl; i++) {                            \
4325         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4326         if (!vm && !vext_elem_mask(v0, i)) {                        \
4327             /* set masked-off elements to 1s */                     \
4328             if (vma) {                                              \
4329                 vext_set_elem_mask(vd, i, 1);                       \
4330             }                                                       \
4331             continue;                                               \
4332         }                                                           \
4333         vext_set_elem_mask(vd, i,                                   \
4334                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4335     }                                                               \
4336     env->vstart = 0;                                                \
4337     /*
4338      * mask destination register is always tail-agnostic
4339      * set tail elements to 1s
4340      */                                                             \
4341     if (vta_all_1s) {                                               \
4342         for (; i < total_elems; i++) {                              \
4343             vext_set_elem_mask(vd, i, 1);                           \
4344         }                                                           \
4345     }                                                               \
4346 }
4347 
4348 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4349 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4350 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4351 
4352 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4353 {
4354     FloatRelation compare = float16_compare_quiet(a, b, s);
4355     return compare != float_relation_equal;
4356 }
4357 
4358 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4359 {
4360     FloatRelation compare = float32_compare_quiet(a, b, s);
4361     return compare != float_relation_equal;
4362 }
4363 
4364 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4365 {
4366     FloatRelation compare = float64_compare_quiet(a, b, s);
4367     return compare != float_relation_equal;
4368 }
4369 
4370 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4371 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4372 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4373 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4374 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4375 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4376 
4377 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4378 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4379 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4380 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4381 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4382 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4383 
4384 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4385 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4386 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4387 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4388 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4389 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4390 
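/*
 * vmfeq/vmfne above use the quiet compare variants, which raise the invalid
 * flag only for signaling NaN inputs.  vmflt/vmfle/vmfgt/vmfge use the
 * signaling variants (float*_lt, float*_le, float*_compare), which raise
 * the invalid flag for any NaN operand.
 */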
4391 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4392 {
4393     FloatRelation compare = float16_compare(a, b, s);
4394     return compare == float_relation_greater;
4395 }
4396 
4397 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4398 {
4399     FloatRelation compare = float32_compare(a, b, s);
4400     return compare == float_relation_greater;
4401 }
4402 
4403 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4404 {
4405     FloatRelation compare = float64_compare(a, b, s);
4406     return compare == float_relation_greater;
4407 }
4408 
4409 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4410 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4411 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4412 
4413 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4414 {
4415     FloatRelation compare = float16_compare(a, b, s);
4416     return compare == float_relation_greater ||
4417            compare == float_relation_equal;
4418 }
4419 
4420 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4421 {
4422     FloatRelation compare = float32_compare(a, b, s);
4423     return compare == float_relation_greater ||
4424            compare == float_relation_equal;
4425 }
4426 
4427 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4428 {
4429     FloatRelation compare = float64_compare(a, b, s);
4430     return compare == float_relation_greater ||
4431            compare == float_relation_equal;
4432 }
4433 
4434 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4435 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4436 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4437 
4438 /* Vector Floating-Point Classify Instruction */
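/*
 * The result uses the scalar fclass bit encoding: bit 0 = -inf,
 * bit 1 = -normal, bit 2 = -subnormal, bit 3 = -0, bit 4 = +0,
 * bit 5 = +subnormal, bit 6 = +normal, bit 7 = +inf, bit 8 = signaling NaN,
 * bit 9 = quiet NaN.  Exactly one bit is set for any input.
 */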
4439 target_ulong fclass_h(uint64_t frs1)
4440 {
4441     float16 f = frs1;
4442     bool sign = float16_is_neg(f);
4443 
4444     if (float16_is_infinity(f)) {
4445         return sign ? 1 << 0 : 1 << 7;
4446     } else if (float16_is_zero(f)) {
4447         return sign ? 1 << 3 : 1 << 4;
4448     } else if (float16_is_zero_or_denormal(f)) {
4449         return sign ? 1 << 2 : 1 << 5;
4450     } else if (float16_is_any_nan(f)) {
4451         float_status s = { }; /* for snan_bit_is_one */
4452         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4453     } else {
4454         return sign ? 1 << 1 : 1 << 6;
4455     }
4456 }
4457 
4458 target_ulong fclass_s(uint64_t frs1)
4459 {
4460     float32 f = frs1;
4461     bool sign = float32_is_neg(f);
4462 
4463     if (float32_is_infinity(f)) {
4464         return sign ? 1 << 0 : 1 << 7;
4465     } else if (float32_is_zero(f)) {
4466         return sign ? 1 << 3 : 1 << 4;
4467     } else if (float32_is_zero_or_denormal(f)) {
4468         return sign ? 1 << 2 : 1 << 5;
4469     } else if (float32_is_any_nan(f)) {
4470         float_status s = { }; /* for snan_bit_is_one */
4471         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4472     } else {
4473         return sign ? 1 << 1 : 1 << 6;
4474     }
4475 }
4476 
4477 target_ulong fclass_d(uint64_t frs1)
4478 {
4479     float64 f = frs1;
4480     bool sign = float64_is_neg(f);
4481 
4482     if (float64_is_infinity(f)) {
4483         return sign ? 1 << 0 : 1 << 7;
4484     } else if (float64_is_zero(f)) {
4485         return sign ? 1 << 3 : 1 << 4;
4486     } else if (float64_is_zero_or_denormal(f)) {
4487         return sign ? 1 << 2 : 1 << 5;
4488     } else if (float64_is_any_nan(f)) {
4489         float_status s = { }; /* for snan_bit_is_one */
4490         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4491     } else {
4492         return sign ? 1 << 1 : 1 << 6;
4493     }
4494 }
4495 
4496 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4497 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4498 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4499 GEN_VEXT_V(vfclass_v_h, 2)
4500 GEN_VEXT_V(vfclass_v_w, 4)
4501 GEN_VEXT_V(vfclass_v_d, 8)
4502 
4503 /* Vector Floating-Point Merge Instruction */
4504 
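/*
 * vd[i] = v0.mask[i] ? f[rs1] : vs2[i]; with vm set every body element
 * receives the scalar.
 */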
4505 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4506 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4507                   CPURISCVState *env, uint32_t desc)          \
4508 {                                                             \
4509     uint32_t vm = vext_vm(desc);                              \
4510     uint32_t vl = env->vl;                                    \
4511     uint32_t esz = sizeof(ETYPE);                             \
4512     uint32_t total_elems =                                    \
4513         vext_get_total_elems(env, desc, esz);                 \
4514     uint32_t vta = vext_vta(desc);                            \
4515     uint32_t i;                                               \
4516                                                               \
4517     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4518                                                               \
4519     for (i = env->vstart; i < vl; i++) {                      \
4520         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4521         *((ETYPE *)vd + H(i)) =                               \
4522             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4523     }                                                         \
4524     env->vstart = 0;                                          \
4525     /* set tail elements to 1s */                             \
4526     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4527 }
4528 
4529 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4530 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4531 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4532 
4533 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4534 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4535 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4536 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4537 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4538 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4539 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4540 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4541 
4542 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4543 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4544 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4545 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4546 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4547 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4548 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4549 
4550 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4551 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4552 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4553 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4554 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4555 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4556 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4557 
4558 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4559 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4560 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4561 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4562 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4563 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4564 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4565 
4566 /* Widening Floating-Point/Integer Type-Convert Instructions */
4567 /* (TD, T2, TX2) */
4568 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4569 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4570 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4571 /*
4572  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4573  */
4574 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4575 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4576 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4577 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4578 
4579 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4580 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4581 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4582 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4583 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4584 
4585 /*
4586  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4587  */
4588 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4589 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4590 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4591 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4592 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4593 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4594 
4595 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4596 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4597 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4598 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4599 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4600 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4601 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4602 
4603 /*
4604  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4605  */
4606 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4607 {
4608     return float16_to_float32(a, true, s);
4609 }
4610 
4611 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4612 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4613 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4614 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4615 
4616 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4617 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4618 
4619 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4620 /* (TD, T2, TX2) */
4621 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4622 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4623 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4624 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4625 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4626 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4627 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4628 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4629 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4630 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4631 
4632 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4633 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4634 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4635 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4636 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4637 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4638 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4639 
4640 /*
4641  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4642  */
4643 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4644 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4645 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4646 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4647 
4648 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4649 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4650 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4651 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4652 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4653 
4654 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4655 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4656 {
4657     return float32_to_float16(a, true, s);
4658 }
4659 
4660 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4661 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4662 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4663 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4664 
4665 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4666 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4667 
4668 /*
4669  * Vector Reduction Operations
4670  */
4671 /* Vector Single-Width Integer Reduction Instructions */
4672 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4673 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4674                   void *vs2, CPURISCVState *env,          \
4675                   uint32_t desc)                          \
4676 {                                                         \
4677     uint32_t vm = vext_vm(desc);                          \
4678     uint32_t vl = env->vl;                                \
4679     uint32_t esz = sizeof(TD);                            \
4680     uint32_t vlenb = simd_maxsz(desc);                    \
4681     uint32_t vta = vext_vta(desc);                        \
4682     uint32_t i;                                           \
4683     TD s1 =  *((TD *)vs1 + HD(0));                        \
4684                                                           \
4685     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4686                                                           \
4687     for (i = env->vstart; i < vl; i++) {                  \
4688         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4689         if (!vm && !vext_elem_mask(v0, i)) {              \
4690             continue;                                     \
4691         }                                                 \
4692         s1 = OP(s1, (TD)s2);                              \
4693     }                                                     \
4694     if (vl > 0) {                                         \
4695         *((TD *)vd + HD(0)) = s1;                         \
4696     }                                                     \
4697     env->vstart = 0;                                      \
4698     /* set tail elements to 1s */                         \
4699     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4700 }
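
/*
 * Note the reduction writes vd[0] only when vl > 0; with vl == 0 the
 * destination is left unchanged.  Elements 1 and above (up to vlenb bytes)
 * are treated as tail and set to all 1s when the tail-agnostic policy
 * requests it.
 */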
4701 
4702 /* vd[0] = sum(vs1[0], vs2[*]) */
4703 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4704 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4705 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4706 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4707 
4708 /* vd[0] = maxu(vs1[0], vs2[*]) */
4709 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4710 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4711 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4712 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4713 
4714 /* vd[0] = max(vs1[0], vs2[*]) */
4715 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4716 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4717 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4718 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4719 
4720 /* vd[0] = minu(vs1[0], vs2[*]) */
4721 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4722 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4723 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4724 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4725 
4726 /* vd[0] = min(vs1[0], vs2[*]) */
4727 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4728 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4729 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4730 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4731 
4732 /* vd[0] = and(vs1[0], vs2[*]) */
4733 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4734 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4735 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4736 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4737 
4738 /* vd[0] = or(vs1[0], vs2[*]) */
4739 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4740 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4741 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4742 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4743 
4744 /* vd[0] = xor(vs1[0], vs2[*]) */
4745 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4746 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4747 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4748 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4749 
4750 /* Vector Widening Integer Reduction Instructions */
4751 /* signed sum reduction into double-width accumulator */
4752 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4753 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4754 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4755 
4756 /* Unsigned sum reduction into double-width accumulator */
4757 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4758 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4759 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4760 
4761 /* Vector Single-Width Floating-Point Reduction Instructions */
4762 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4763 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4764                   void *vs2, CPURISCVState *env,           \
4765                   uint32_t desc)                           \
4766 {                                                          \
4767     uint32_t vm = vext_vm(desc);                           \
4768     uint32_t vl = env->vl;                                 \
4769     uint32_t esz = sizeof(TD);                             \
4770     uint32_t vlenb = simd_maxsz(desc);                     \
4771     uint32_t vta = vext_vta(desc);                         \
4772     uint32_t i;                                            \
4773     TD s1 =  *((TD *)vs1 + HD(0));                         \
4774                                                            \
4775     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4776                                                            \
4777     for (i = env->vstart; i < vl; i++) {                   \
4778         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4779         if (!vm && !vext_elem_mask(v0, i)) {               \
4780             continue;                                      \
4781         }                                                  \
4782         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4783     }                                                      \
4784     if (vl > 0) {                                          \
4785         *((TD *)vd + HD(0)) = s1;                          \
4786     }                                                      \
4787     env->vstart = 0;                                       \
4788     /* set tail elements to 1s */                          \
4789     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4790 }
4791 
4792 /* Unordered sum */
4793 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4794 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4795 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4796 
4797 /* Ordered sum */
4798 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4799 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4800 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4801 
4802 /* Maximum value */
4803 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4804               float16_maximum_number)
4805 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4806               float32_maximum_number)
4807 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4808               float64_maximum_number)
4809 
4810 /* Minimum value */
4811 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4812               float16_minimum_number)
4813 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4814               float32_minimum_number)
4815 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4816               float64_minimum_number)
4817 
4818 /* Vector Widening Floating-Point Add Instructions */
4819 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4820 {
4821     return float32_add(a, float16_to_float32(b, true, s), s);
4822 }
4823 
4824 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4825 {
4826     return float64_add(a, float32_to_float64(b, s), s);
4827 }
4828 
4829 /* Vector Widening Floating-Point Reduction Instructions */
4830 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4831 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4832 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4833 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4834 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4835 
4836 /*
4837  * Vector Mask Operations
4838  */
4839 /* Vector Mask-Register Logical Instructions */
4840 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4841 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4842                   void *vs2, CPURISCVState *env,          \
4843                   uint32_t desc)                          \
4844 {                                                         \
4845     uint32_t vl = env->vl;                                \
4846     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4847     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4848     uint32_t i;                                           \
4849     int a, b;                                             \
4850                                                           \
4851     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4852                                                           \
4853     for (i = env->vstart; i < vl; i++) {                  \
4854         a = vext_elem_mask(vs1, i);                       \
4855         b = vext_elem_mask(vs2, i);                       \
4856         vext_set_elem_mask(vd, i, OP(b, a));              \
4857     }                                                     \
4858     env->vstart = 0;                                      \
4859     /*
4860      * mask destination register is always tail-agnostic
4861      * set tail elements to 1s
4862      */                                                   \
4863     if (vta_all_1s) {                                     \
4864         for (; i < total_elems; i++) {                    \
4865             vext_set_elem_mask(vd, i, 1);                 \
4866         }                                                 \
4867     }                                                     \
4868 }
4869 
4870 #define DO_NAND(N, M)  (!(N & M))
4871 #define DO_ANDNOT(N, M)  (N & !M)
4872 #define DO_NOR(N, M)  (!(N | M))
4873 #define DO_ORNOT(N, M)  (N | !M)
4874 #define DO_XNOR(N, M)  (!(N ^ M))
4875 
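/*
 * Note the operand order: OP(b, a) is invoked with b taken from vs2 and a
 * from vs1, so e.g. vmandn_mm computes vs2 & ~vs1 and vmorn_mm computes
 * vs2 | ~vs1, matching the vmandn.mm/vmorn.mm definitions.
 */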
4876 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4877 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4878 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4879 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4880 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4881 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4882 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4883 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4884 
4885 /* Vector count population in mask vcpop */
4886 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4887                              uint32_t desc)
4888 {
4889     target_ulong cnt = 0;
4890     uint32_t vm = vext_vm(desc);
4891     uint32_t vl = env->vl;
4892     int i;
4893 
4894     for (i = env->vstart; i < vl; i++) {
4895         if (vm || vext_elem_mask(v0, i)) {
4896             if (vext_elem_mask(vs2, i)) {
4897                 cnt++;
4898             }
4899         }
4900     }
4901     env->vstart = 0;
4902     return cnt;
4903 }
4904 
4905 /* vfirst find-first-set mask bit */
4906 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4907                               uint32_t desc)
4908 {
4909     uint32_t vm = vext_vm(desc);
4910     uint32_t vl = env->vl;
4911     int i;
4912 
4913     for (i = env->vstart; i < vl; i++) {
4914         if (vm || vext_elem_mask(v0, i)) {
4915             if (vext_elem_mask(vs2, i)) {
4916                 return i;
4917             }
4918         }
4919     }
4920     env->vstart = 0;
4921     return -1LL;
4922 }
4923 
4924 enum set_mask_type {
4925     ONLY_FIRST = 1,
4926     INCLUDE_FIRST,
4927     BEFORE_FIRST,
4928 };
4929 
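/*
 * vmsetm() implements vmsbf.m (BEFORE_FIRST), vmsif.m (INCLUDE_FIRST) and
 * vmsof.m (ONLY_FIRST): it scans the active elements of vs2 and, relative
 * to the first set mask bit, writes 1s before it, up to and including it,
 * or only at it, and 0s to every active element after it.
 */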
4930 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4931                    uint32_t desc, enum set_mask_type type)
4932 {
4933     uint32_t vm = vext_vm(desc);
4934     uint32_t vl = env->vl;
4935     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4936     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4937     uint32_t vma = vext_vma(desc);
4938     int i;
4939     bool first_mask_bit = false;
4940 
4941     VSTART_CHECK_EARLY_EXIT(env, vl);
4942 
4943     for (i = env->vstart; i < vl; i++) {
4944         if (!vm && !vext_elem_mask(v0, i)) {
4945             /* set masked-off elements to 1s */
4946             if (vma) {
4947                 vext_set_elem_mask(vd, i, 1);
4948             }
4949             continue;
4950         }
4951         /* write a zero to all following active elements */
4952         if (first_mask_bit) {
4953             vext_set_elem_mask(vd, i, 0);
4954             continue;
4955         }
4956         if (vext_elem_mask(vs2, i)) {
4957             first_mask_bit = true;
4958             if (type == BEFORE_FIRST) {
4959                 vext_set_elem_mask(vd, i, 0);
4960             } else {
4961                 vext_set_elem_mask(vd, i, 1);
4962             }
4963         } else {
4964             if (type == ONLY_FIRST) {
4965                 vext_set_elem_mask(vd, i, 0);
4966             } else {
4967                 vext_set_elem_mask(vd, i, 1);
4968             }
4969         }
4970     }
4971     env->vstart = 0;
4972     /*
4973      * mask destination register is always tail-agnostic
4974      * set tail elements to 1s
4975      */
4976     if (vta_all_1s) {
4977         for (; i < total_elems; i++) {
4978             vext_set_elem_mask(vd, i, 1);
4979         }
4980     }
4981 }
4982 
4983 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4984                      uint32_t desc)
4985 {
4986     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4987 }
4988 
4989 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4990                      uint32_t desc)
4991 {
4992     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4993 }
4994 
4995 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4996                      uint32_t desc)
4997 {
4998     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4999 }
5000 
5001 /* Vector Iota Instruction */
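/*
 * viota.m: each active destination element i receives the count of set
 * bits of vs2 among the preceding active elements, i.e. an exclusive
 * prefix sum (the running sum below is incremented after the write).
 */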
5002 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
5003 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
5004                   uint32_t desc)                                          \
5005 {                                                                         \
5006     uint32_t vm = vext_vm(desc);                                          \
5007     uint32_t vl = env->vl;                                                \
5008     uint32_t esz = sizeof(ETYPE);                                         \
5009     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5010     uint32_t vta = vext_vta(desc);                                        \
5011     uint32_t vma = vext_vma(desc);                                        \
5012     uint32_t sum = 0;                                                     \
5013     int i;                                                                \
5014                                                                           \
5015     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5016                                                                           \
5017     for (i = env->vstart; i < vl; i++) {                                  \
5018         if (!vm && !vext_elem_mask(v0, i)) {                              \
5019             /* set masked-off elements to 1s */                           \
5020             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5021             continue;                                                     \
5022         }                                                                 \
5023         *((ETYPE *)vd + H(i)) = sum;                                      \
5024         if (vext_elem_mask(vs2, i)) {                                     \
5025             sum++;                                                        \
5026         }                                                                 \
5027     }                                                                     \
5028     env->vstart = 0;                                                      \
5029     /* set tail elements to 1s */                                         \
5030     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5031 }
5032 
5033 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5034 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5035 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5036 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
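
/*
 * Editor's illustration (not part of the helpers above): unmasked viota.m is
 * an exclusive prefix count of the vs2 mask bits, e.g. vs2 = {1, 0, 1, 1, 0}
 * produces vd = {0, 1, 1, 2, 3}.  A hypothetical scalar sketch for SEW=32,
 * ignoring vma/vta and vstart:
 */
static inline void example_viota_ref(uint32_t *vd, const bool *vs2,
                                     uint32_t vl)
{
    uint32_t sum = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = sum;        /* count of set mask bits strictly before i */
        sum += vs2[i];
    }
}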
5037 
5038 /* Vector Element Index Instruction */
5039 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5040 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5041 {                                                                         \
5042     uint32_t vm = vext_vm(desc);                                          \
5043     uint32_t vl = env->vl;                                                \
5044     uint32_t esz = sizeof(ETYPE);                                         \
5045     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5046     uint32_t vta = vext_vta(desc);                                        \
5047     uint32_t vma = vext_vma(desc);                                        \
5048     int i;                                                                \
5049                                                                           \
5050     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5051                                                                           \
5052     for (i = env->vstart; i < vl; i++) {                                  \
5053         if (!vm && !vext_elem_mask(v0, i)) {                              \
5054             /* set masked-off elements to 1s */                           \
5055             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5056             continue;                                                     \
5057         }                                                                 \
5058         *((ETYPE *)vd + H(i)) = i;                                        \
5059     }                                                                     \
5060     env->vstart = 0;                                                      \
5061     /* set tail elements to 1s */                                         \
5062     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5063 }
5064 
5065 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5066 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5067 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5068 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
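
/*
 * Editor's illustration (not part of the helpers above): unmasked vid.v
 * simply writes each element's index, so with vl = 4 the destination body
 * becomes {0, 1, 2, 3}.  Hypothetical scalar sketch for SEW=32:
 */
static inline void example_vid_ref(uint32_t *vd, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = i;          /* vd[i] = element index */
    }
}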
5069 
5070 /*
5071  * Vector Permutation Instructions
5072  */
5073 
5074 /* Vector Slide Instructions */
5075 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5076 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5077                   CPURISCVState *env, uint32_t desc)                      \
5078 {                                                                         \
5079     uint32_t vm = vext_vm(desc);                                          \
5080     uint32_t vl = env->vl;                                                \
5081     uint32_t esz = sizeof(ETYPE);                                         \
5082     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5083     uint32_t vta = vext_vta(desc);                                        \
5084     uint32_t vma = vext_vma(desc);                                        \
5085     target_ulong offset = s1, i_min, i;                                   \
5086                                                                           \
5087     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5088                                                                           \
5089     i_min = MAX(env->vstart, offset);                                     \
5090     for (i = i_min; i < vl; i++) {                                        \
5091         if (!vm && !vext_elem_mask(v0, i)) {                              \
5092             /* set masked-off elements to 1s */                           \
5093             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5094             continue;                                                     \
5095         }                                                                 \
5096         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5097     }                                                                     \
5098     env->vstart = 0;                                                      \
5099     /* set tail elements to 1s */                                         \
5100     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5101 }
5102 
5103 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5104 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5105 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5106 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5107 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
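
/*
 * Editor's illustration (not part of the helpers above): unmasked
 * vslideup.vx leaves destination elements below OFFSET = x[rs1] unchanged
 * and copies vd[i] = vs2[i - OFFSET] for OFFSET <= i < vl.  For
 * vs2 = {10, 20, 30, 40}, OFFSET = 2, vl = 4 the body becomes
 * {old vd[0], old vd[1], 10, 20}.  Hypothetical scalar sketch for SEW=32:
 */
static inline void example_vslideup_ref(uint32_t *vd, const uint32_t *vs2,
                                        uint32_t offset, uint32_t vl)
{
    uint32_t i;

    for (i = offset; i < vl; i++) {
        vd[i] = vs2[i - offset];    /* elements 0..offset-1 keep old values */
    }
}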
5108 
5109 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5110 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5111                   CPURISCVState *env, uint32_t desc)                      \
5112 {                                                                         \
5113     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5114     uint32_t vm = vext_vm(desc);                                          \
5115     uint32_t vl = env->vl;                                                \
5116     uint32_t esz = sizeof(ETYPE);                                         \
5117     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5118     uint32_t vta = vext_vta(desc);                                        \
5119     uint32_t vma = vext_vma(desc);                                        \
5120     target_ulong i_max, i_min, i;                                         \
5121                                                                           \
5122     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5123                                                                           \
5124     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5125     i_max = MAX(i_min, env->vstart);                                      \
5126     for (i = env->vstart; i < i_max; ++i) {                               \
5127         if (!vm && !vext_elem_mask(v0, i)) {                              \
5128             /* set masked-off elements to 1s */                           \
5129             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5130             continue;                                                     \
5131         }                                                                 \
5132         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5133     }                                                                     \
5134                                                                           \
5135     for (i = i_max; i < vl; ++i) {                                        \
5136         if (!vm && !vext_elem_mask(v0, i)) {                              \
5137             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5138             continue;                                                     \
5139         }                                                                 \
5140         *((ETYPE *)vd + H(i)) = 0;                                        \
5141     }                                                                     \
5142                                                                           \
5143     env->vstart = 0;                                                      \
5144     /* set tail elements to 1s */                                         \
5145     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5146 }
5147 
5148 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5149 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5150 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5151 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5152 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
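
/*
 * Editor's illustration (not part of the helpers above): unmasked
 * vslidedown.vx reads vd[i] = vs2[i + OFFSET], with source positions at or
 * beyond VLMAX reading as zero.  For vs2 = {10, 20, 30, 40}, OFFSET = 2,
 * vl = vlmax = 4 the body becomes {30, 40, 0, 0}.  Hypothetical scalar
 * sketch for SEW=32:
 */
static inline void example_vslidedown_ref(uint32_t *vd, const uint32_t *vs2,
                                          uint32_t offset, uint32_t vl,
                                          uint32_t vlmax)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        /* source elements past VLMAX read as zero */
        vd[i] = (offset < vlmax && i < vlmax - offset) ? vs2[i + offset] : 0;
    }
}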
5153 
5154 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5155 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5156                                  void *vs2, CPURISCVState *env,             \
5157                                  uint32_t desc)                             \
5158 {                                                                           \
5159     typedef uint##BITWIDTH##_t ETYPE;                                       \
5160     uint32_t vm = vext_vm(desc);                                            \
5161     uint32_t vl = env->vl;                                                  \
5162     uint32_t esz = sizeof(ETYPE);                                           \
5163     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5164     uint32_t vta = vext_vta(desc);                                          \
5165     uint32_t vma = vext_vma(desc);                                          \
5166     uint32_t i;                                                             \
5167                                                                             \
5168     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5169                                                                             \
5170     for (i = env->vstart; i < vl; i++) {                                    \
5171         if (!vm && !vext_elem_mask(v0, i)) {                                \
5172             /* set masked-off elements to 1s */                             \
5173             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5174             continue;                                                       \
5175         }                                                                   \
5176         if (i == 0) {                                                       \
5177             *((ETYPE *)vd + H(i)) = s1;                                     \
5178         } else {                                                            \
5179             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5180         }                                                                   \
5181     }                                                                       \
5182     env->vstart = 0;                                                        \
5183     /* set tail elements to 1s */                                           \
5184     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5185 }
5186 
5187 GEN_VEXT_VSLIDE1UP(8,  H1)
5188 GEN_VEXT_VSLIDE1UP(16, H2)
5189 GEN_VEXT_VSLIDE1UP(32, H4)
5190 GEN_VEXT_VSLIDE1UP(64, H8)
5191 
5192 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5193 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5194                   CPURISCVState *env, uint32_t desc)              \
5195 {                                                                 \
5196     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5197 }
5198 
5199 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5200 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5201 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5202 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5203 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
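
/*
 * Editor's illustration (not part of the helpers above): unmasked
 * vslide1up.vx places the scalar in element 0 and shifts the source up by
 * one, e.g. s1 = 99, vs2 = {10, 20, 30, 40}, vl = 4 gives
 * vd = {99, 10, 20, 30}.  Hypothetical scalar sketch for SEW=32:
 */
static inline void example_vslide1up_ref(uint32_t *vd, const uint32_t *vs2,
                                         uint32_t s1, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (i == 0) ? s1 : vs2[i - 1];
    }
}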
5204 
5205 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5206 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5207                                    void *vs2, CPURISCVState *env,             \
5208                                    uint32_t desc)                             \
5209 {                                                                             \
5210     typedef uint##BITWIDTH##_t ETYPE;                                         \
5211     uint32_t vm = vext_vm(desc);                                              \
5212     uint32_t vl = env->vl;                                                    \
5213     uint32_t esz = sizeof(ETYPE);                                             \
5214     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5215     uint32_t vta = vext_vta(desc);                                            \
5216     uint32_t vma = vext_vma(desc);                                            \
5217     uint32_t i;                                                               \
5218                                                                               \
5219     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5220                                                                               \
5221     for (i = env->vstart; i < vl; i++) {                                      \
5222         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5223             /* set masked-off elements to 1s */                               \
5224             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5225             continue;                                                         \
5226         }                                                                     \
5227         if (i == vl - 1) {                                                    \
5228             *((ETYPE *)vd + H(i)) = s1;                                       \
5229         } else {                                                              \
5230             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5231         }                                                                     \
5232     }                                                                         \
5233     env->vstart = 0;                                                          \
5234     /* set tail elements to 1s */                                             \
5235     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5236 }
5237 
5238 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5239 GEN_VEXT_VSLIDE1DOWN(16, H2)
5240 GEN_VEXT_VSLIDE1DOWN(32, H4)
5241 GEN_VEXT_VSLIDE1DOWN(64, H8)
5242 
5243 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5244 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5245                   CPURISCVState *env, uint32_t desc)              \
5246 {                                                                 \
5247     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5248 }
5249 
5250 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5251 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5252 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5253 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5254 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
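
/*
 * Editor's illustration (not part of the helpers above): unmasked
 * vslide1down.vx shifts the source down by one and places the scalar in the
 * last body element, e.g. s1 = 99, vs2 = {10, 20, 30, 40}, vl = 4 gives
 * vd = {20, 30, 40, 99}.  Hypothetical scalar sketch for SEW=32:
 */
static inline void example_vslide1down_ref(uint32_t *vd, const uint32_t *vs2,
                                           uint32_t s1, uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = (i == vl - 1) ? s1 : vs2[i + 1];
    }
}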
5255 
5256 /* Vector Floating-Point Slide Instructions */
5257 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5258 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5259                   CPURISCVState *env, uint32_t desc)          \
5260 {                                                             \
5261     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5262 }
5263 
5264 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5265 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5266 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5267 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5268 
5269 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5270 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5271                   CPURISCVState *env, uint32_t desc)          \
5272 {                                                             \
5273     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5274 }
5275 
5276 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5277 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5278 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5279 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5280 
5281 /* Vector Register Gather Instruction */
5282 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5283 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5284                   CPURISCVState *env, uint32_t desc)                      \
5285 {                                                                         \
5286     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5287     uint32_t vm = vext_vm(desc);                                          \
5288     uint32_t vl = env->vl;                                                \
5289     uint32_t esz = sizeof(TS2);                                           \
5290     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5291     uint32_t vta = vext_vta(desc);                                        \
5292     uint32_t vma = vext_vma(desc);                                        \
5293     uint64_t index;                                                       \
5294     uint32_t i;                                                           \
5295                                                                           \
5296     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5297                                                                           \
5298     for (i = env->vstart; i < vl; i++) {                                  \
5299         if (!vm && !vext_elem_mask(v0, i)) {                              \
5300             /* set masked-off elements to 1s */                           \
5301             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5302             continue;                                                     \
5303         }                                                                 \
5304         index = *((TS1 *)vs1 + HS1(i));                                   \
5305         if (index >= vlmax) {                                             \
5306             *((TS2 *)vd + HS2(i)) = 0;                                    \
5307         } else {                                                          \
5308             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5309         }                                                                 \
5310     }                                                                     \
5311     env->vstart = 0;                                                      \
5312     /* set tail elements to 1s */                                         \
5313     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5314 }
5315 
5316 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5317 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5318 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5319 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5320 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5321 
5322 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5323 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5324 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5325 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
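
/*
 * Editor's illustration (not part of the helpers above): unmasked
 * vrgather.vv gathers vd[i] = vs2[vs1[i]], with out-of-range indices
 * yielding zero.  For vs2 = {10, 20, 30, 40}, vs1 = {3, 3, 0, 7},
 * vlmax = 4 the body becomes {40, 40, 10, 0}.  Hypothetical scalar sketch
 * with SEW=32 index and data types:
 */
static inline void example_vrgather_vv_ref(uint32_t *vd, const uint32_t *vs1,
                                           const uint32_t *vs2, uint32_t vl,
                                           uint32_t vlmax)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        uint32_t index = vs1[i];

        vd[i] = (index >= vlmax) ? 0 : vs2[index];
    }
}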
5326 
5327 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5328 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5329                   CPURISCVState *env, uint32_t desc)                      \
5330 {                                                                         \
5331     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5332     uint32_t vm = vext_vm(desc);                                          \
5333     uint32_t vl = env->vl;                                                \
5334     uint32_t esz = sizeof(ETYPE);                                         \
5335     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5336     uint32_t vta = vext_vta(desc);                                        \
5337     uint32_t vma = vext_vma(desc);                                        \
5338     uint64_t index = s1;                                                  \
5339     uint32_t i;                                                           \
5340                                                                           \
5341     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5342                                                                           \
5343     for (i = env->vstart; i < vl; i++) {                                  \
5344         if (!vm && !vext_elem_mask(v0, i)) {                              \
5345             /* set masked-off elements to 1s */                           \
5346             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5347             continue;                                                     \
5348         }                                                                 \
5349         if (index >= vlmax) {                                             \
5350             *((ETYPE *)vd + H(i)) = 0;                                    \
5351         } else {                                                          \
5352             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5353         }                                                                 \
5354     }                                                                     \
5355     env->vstart = 0;                                                      \
5356     /* set tail elements to 1s */                                         \
5357     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5358 }
5359 
5360 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5361 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5362 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5363 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5364 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
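
/*
 * Editor's illustration (not part of the helpers above): unmasked
 * vrgather.vx broadcasts the single source element selected by x[rs1], or
 * zero if the index is out of range.  For vs2 = {10, 20, 30, 40}, s1 = 2,
 * vl = vlmax = 4 the body becomes {30, 30, 30, 30}.  Hypothetical scalar
 * sketch for SEW=32:
 */
static inline void example_vrgather_vx_ref(uint32_t *vd, const uint32_t *vs2,
                                           uint64_t s1, uint32_t vl,
                                           uint32_t vlmax)
{
    uint32_t val = (s1 >= vlmax) ? 0 : vs2[s1];
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = val;
    }
}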
5365 
5366 /* Vector Compress Instruction */
5367 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5368 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5369                   CPURISCVState *env, uint32_t desc)                      \
5370 {                                                                         \
5371     uint32_t vl = env->vl;                                                \
5372     uint32_t esz = sizeof(ETYPE);                                         \
5373     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5374     uint32_t vta = vext_vta(desc);                                        \
5375     uint32_t num = 0, i;                                                  \
5376                                                                           \
5377     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5378                                                                           \
5379     for (i = env->vstart; i < vl; i++) {                                  \
5380         if (!vext_elem_mask(vs1, i)) {                                    \
5381             continue;                                                     \
5382         }                                                                 \
5383         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5384         num++;                                                            \
5385     }                                                                     \
5386     env->vstart = 0;                                                      \
5387     /* set tail elements to 1s */                                         \
5388     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5389 }
5390 
5391 /* Compress into vd the elements of vs2 whose vs1 mask bit is set */
5392 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5393 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5394 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5395 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
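
/*
 * Editor's illustration (not part of the helpers above): vcompress.vm packs
 * the vs2 elements whose vs1 mask bit is set into the low end of vd and
 * leaves the rest of the destination to the tail-agnostic policy.  For
 * vs2 = {10, 20, 30, 40}, vs1 = {0, 1, 1, 0}, vl = 4 this writes
 * vd = {20, 30, ...} and returns 2.  Hypothetical scalar sketch for SEW=32:
 */
static inline uint32_t example_vcompress_ref(uint32_t *vd, const bool *vs1,
                                             const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (vs1[i]) {
            vd[num++] = vs2[i];     /* pack enabled elements contiguously */
        }
    }
    return num;                     /* number of body elements written */
}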
5396 
5397 /* Vector Whole Register Move */
5398 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5399 {
5400     /* EEW = SEW */
5401     uint32_t maxsz = simd_maxsz(desc);
5402     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5403     uint32_t startb = env->vstart * sewb;
5404     uint32_t i = startb;
5405 
5406     if (startb >= maxsz) {
5407         env->vstart = 0;
5408         return;
5409     }
5410 
5411     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5412         uint32_t j = ROUND_UP(i, 8);
5413         memcpy((uint8_t *)vd + H1(j - 1),
5414                (uint8_t *)vs2 + H1(j - 1),
5415                j - i);
5416         i = j;
5417     }
5418 
5419     memcpy((uint8_t *)vd + H1(i),
5420            (uint8_t *)vs2 + H1(i),
5421            maxsz - i);
5422 
5423     env->vstart = 0;
5424 }
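
/*
 * Editor's illustration (not part of the helper above): on a little-endian
 * host H1() is an identity mapping, so the whole-register move reduces to a
 * single memcpy from the restart byte offset to the end of the register
 * group, as in this hypothetical sketch:
 */
static inline void example_vmvr_le_ref(uint8_t *vd, const uint8_t *vs2,
                                       uint32_t startb, uint32_t maxsz)
{
    if (startb < maxsz) {
        memcpy(vd + startb, vs2 + startb, maxsz - startb);
    }
}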
5425 
5426 /* Vector Integer Extension */
5427 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5428 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5429                   CPURISCVState *env, uint32_t desc)             \
5430 {                                                                \
5431     uint32_t vl = env->vl;                                       \
5432     uint32_t vm = vext_vm(desc);                                 \
5433     uint32_t esz = sizeof(ETYPE);                                \
5434     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5435     uint32_t vta = vext_vta(desc);                               \
5436     uint32_t vma = vext_vma(desc);                               \
5437     uint32_t i;                                                  \
5438                                                                  \
5439     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5440                                                                  \
5441     for (i = env->vstart; i < vl; i++) {                         \
5442         if (!vm && !vext_elem_mask(v0, i)) {                     \
5443             /* set masked-off elements to 1s */                  \
5444             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5445             continue;                                            \
5446         }                                                        \
5447         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5448     }                                                            \
5449     env->vstart = 0;                                             \
5450     /* set tail elements to 1s */                                \
5451     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5452 }
5453 
5454 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5455 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5456 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5457 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5458 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5459 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5460 
5461 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5462 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5463 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5464 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5465 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5466 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
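
/*
 * Editor's illustration (not part of the helpers above): the extension
 * helpers widen each narrow source element through an ordinary C integer
 * conversion, e.g. vzext.vf2 maps uint8_t 0xff to uint16_t 0x00ff, while
 * vsext.vf2 maps int8_t 0xff (-1) to int16_t 0xffff.  Hypothetical scalar
 * sketches of the vf2 byte-to-halfword cases, ignoring masking and tails:
 */
static inline void example_vzext_vf2_ref(uint16_t *vd, const uint8_t *vs2,
                                         uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = vs2[i];             /* zero-extend 8 -> 16 bits */
    }
}

static inline void example_vsext_vf2_ref(int16_t *vd, const int8_t *vs2,
                                         uint32_t vl)
{
    uint32_t i;

    for (i = 0; i < vl; i++) {
        vd[i] = vs2[i];             /* sign-extend 8 -> 16 bits */
    }
}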
5467