xref: /qemu/target/loongarch/tcg/vec_helper.c (revision 513e88a24dedee946b0a16f12dbf76fb540a8a57)
1a0c9400aSSong Gao /* SPDX-License-Identifier: GPL-2.0-or-later */
2a0c9400aSSong Gao /*
31dc33f26SSong Gao  * QEMU LoongArch vector helper functions.
4a0c9400aSSong Gao  *
5a0c9400aSSong Gao  * Copyright (c) 2022-2023 Loongson Technology Corporation Limited
6a0c9400aSSong Gao  */
7c037fbc9SSong Gao 
8c037fbc9SSong Gao #include "qemu/osdep.h"
9c037fbc9SSong Gao #include "cpu.h"
10c037fbc9SSong Gao #include "exec/exec-all.h"
11c037fbc9SSong Gao #include "exec/helper-proto.h"
12aca67472SSong Gao #include "fpu/softfloat.h"
13aca67472SSong Gao #include "internals.h"
14d0dfa19aSSong Gao #include "tcg/tcg.h"
15008a3b16SSong Gao #include "vec.h"
1664cf6b99SSong Gao #include "tcg/tcg-gvec-desc.h"
17c037fbc9SSong Gao 
/*
 * Element-wise primitive operations handed to the helper-generating
 * macros in this file.  Every argument is parenthesized so that passing
 * an expression (rather than a single cast or accessor) cannot change
 * the grouping of the operation.
 */
#define DO_ADD(a, b)  ((a) + (b))
#define DO_SUB(a, b)  ((a) - (b))
20c037fbc9SSong Gao 
/*
 * DO_ODD_EVEN - emit a widening "odd of vj OP even of vk" helper.
 *
 *   NAME:  helper name handed to HELPER()
 *   BIT:   width in bits of one destination element
 *   E1:    VReg accessor used for destination elements
 *   E2:    VReg accessor used for source elements
 *   DO_OP: two-operand macro combining the converted operands
 *
 * For each destination element n the generated helper stores
 * DO_OP(vj.E2(2n + 1), vk.E2(2n)), with both operands first cast to the
 * destination element type.  Both sources are read before the
 * destination is written.
 */
#define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    VReg *dst = vd; \
    VReg *src_j = vj; \
    VReg *src_k = vk; \
    typedef __typeof(dst->E1(0)) WideT; \
    const int count = simd_oprsz(desc) / (BIT / 8); \
    \
    for (int n = 0; n < count; n++) { \
        WideT odd = (WideT)src_j->E2(2 * n + 1); \
        WideT even = (WideT)src_k->E2(2 * n); \
        \
        dst->E1(n) = DO_OP(odd, even); \
    } \
}
35c037fbc9SSong Gao 
36c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD)
37c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD)
38c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD)
39c037fbc9SSong Gao 
4004711da1SSong Gao void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
41c037fbc9SSong Gao {
4264cf6b99SSong Gao     int i;
4304711da1SSong Gao     VReg *Vd = (VReg *)vd;
4404711da1SSong Gao     VReg *Vj = (VReg *)vj;
4504711da1SSong Gao     VReg *Vk = (VReg *)vk;
4664cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
47c037fbc9SSong Gao 
4864cf6b99SSong Gao     for (i = 0; i < oprsz / 16 ; i++) {
4964cf6b99SSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
5064cf6b99SSong Gao                               int128_makes64(Vk->D(2 * i)));
5164cf6b99SSong Gao     }
52c037fbc9SSong Gao }
53c037fbc9SSong Gao 
54c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB)
55c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB)
56c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB)
57c037fbc9SSong Gao 
5804711da1SSong Gao void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
59c037fbc9SSong Gao {
6064cf6b99SSong Gao     int i;
6104711da1SSong Gao     VReg *Vd = (VReg *)vd;
6204711da1SSong Gao     VReg *Vj = (VReg *)vj;
6304711da1SSong Gao     VReg *Vk = (VReg *)vk;
6464cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
65c037fbc9SSong Gao 
6664cf6b99SSong Gao     for (i = 0; i < oprsz / 16; i++) {
6764cf6b99SSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
6864cf6b99SSong Gao                               int128_makes64(Vk->D(2 * i)));
6964cf6b99SSong Gao     }
70c037fbc9SSong Gao }
71c037fbc9SSong Gao 
72c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD)
73c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD)
74c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD)
75c037fbc9SSong Gao 
7604711da1SSong Gao void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
77c037fbc9SSong Gao {
7864cf6b99SSong Gao     int i;
7904711da1SSong Gao     VReg *Vd = (VReg *)vd;
8004711da1SSong Gao     VReg *Vj = (VReg *)vj;
8104711da1SSong Gao     VReg *Vk = (VReg *)vk;
8264cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
83c037fbc9SSong Gao 
8464cf6b99SSong Gao     for (i = 0; i < oprsz / 16; i ++) {
8564cf6b99SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
8664cf6b99SSong Gao                               int128_make64(Vk->UD(2 * i)));
8764cf6b99SSong Gao     }
88c037fbc9SSong Gao }
89c037fbc9SSong Gao 
90c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB)
91c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB)
92c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB)
93c037fbc9SSong Gao 
9404711da1SSong Gao void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
95c037fbc9SSong Gao {
9664cf6b99SSong Gao     int i;
9704711da1SSong Gao     VReg *Vd = (VReg *)vd;
9804711da1SSong Gao     VReg *Vj = (VReg *)vj;
9904711da1SSong Gao     VReg *Vk = (VReg *)vk;
10064cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
101c037fbc9SSong Gao 
10264cf6b99SSong Gao     for (i = 0; i < oprsz / 16; i++) {
10364cf6b99SSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
10464cf6b99SSong Gao                               int128_make64(Vk->UD(2 * i)));
10564cf6b99SSong Gao     }
106c037fbc9SSong Gao }
1072d5f950cSSong Gao 
/*
 * DO_EVEN - emit a widening helper that combines the even-indexed
 * source elements of vj and vk.
 *
 *   NAME:  helper name handed to HELPER()
 *   BIT:   width in bits of one destination element
 *   E1:    VReg accessor used for destination elements
 *   E2:    VReg accessor used for source elements
 *   DO_OP: two-operand macro combining the converted operands
 *
 * Destination element n is DO_OP(vj.E2(2n), vk.E2(2n)), with both
 * operands cast to the destination element type beforehand.
 */
#define DO_EVEN(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    VReg *dst = vd; \
    VReg *src_j = vj; \
    VReg *src_k = vk; \
    typedef __typeof(dst->E1(0)) WideT; \
    const int count = simd_oprsz(desc) / (BIT / 8); \
    \
    for (int n = 0; n < count; n++) { \
        WideT lhs = (WideT)src_j->E2(2 * n); \
        WideT rhs = (WideT)src_k->E2(2 * n); \
        \
        dst->E1(n) = DO_OP(lhs, rhs); \
    } \
}
1222d5f950cSSong Gao 
/*
 * DO_ODD - emit a widening helper that combines the odd-indexed source
 * elements of vj and vk.
 *
 *   NAME:  helper name handed to HELPER()
 *   BIT:   width in bits of one destination element
 *   E1:    VReg accessor used for destination elements
 *   E2:    VReg accessor used for source elements
 *   DO_OP: two-operand macro combining the converted operands
 *
 * Destination element n is DO_OP(vj.E2(2n + 1), vk.E2(2n + 1)), with
 * both operands cast to the destination element type beforehand.
 */
#define DO_ODD(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    VReg *dst = vd; \
    VReg *src_j = vj; \
    VReg *src_k = vk; \
    typedef __typeof(dst->E1(0)) WideT; \
    const int count = simd_oprsz(desc) / (BIT / 8); \
    \
    for (int n = 0; n < count; n++) { \
        WideT lhs = (WideT)src_j->E2(2 * n + 1); \
        WideT rhs = (WideT)src_k->E2(2 * n + 1); \
        \
        dst->E1(n) = DO_OP(lhs, rhs); \
    } \
}
1372d5f950cSSong Gao 
13885995f07SSong Gao void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1392d5f950cSSong Gao {
14085995f07SSong Gao     int i;
1412d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1422d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1432d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
14485995f07SSong Gao     int oprsz = simd_oprsz(desc);
1452d5f950cSSong Gao 
14685995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
14785995f07SSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)),
14885995f07SSong Gao                               int128_makes64(Vk->D(2 * i)));
14985995f07SSong Gao     }
1502d5f950cSSong Gao }
1512d5f950cSSong Gao 
1522d5f950cSSong Gao DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD)
1532d5f950cSSong Gao DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD)
1542d5f950cSSong Gao DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD)
1552d5f950cSSong Gao 
15685995f07SSong Gao void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1572d5f950cSSong Gao {
15885995f07SSong Gao     int i;
1592d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1602d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1612d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
16285995f07SSong Gao     int oprsz = simd_oprsz(desc);
1632d5f950cSSong Gao 
16485995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
16585995f07SSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)),
16685995f07SSong Gao                               int128_makes64(Vk->D(2 * i +1)));
16785995f07SSong Gao     }
1682d5f950cSSong Gao }
1692d5f950cSSong Gao 
1702d5f950cSSong Gao DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD)
1712d5f950cSSong Gao DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD)
1722d5f950cSSong Gao DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD)
1732d5f950cSSong Gao 
17485995f07SSong Gao void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1752d5f950cSSong Gao {
17685995f07SSong Gao     int i;
1772d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1782d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1792d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
18085995f07SSong Gao     int oprsz = simd_oprsz(desc);
1812d5f950cSSong Gao 
18285995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
18385995f07SSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)),
18485995f07SSong Gao                               int128_makes64(Vk->D(2 * i)));
18585995f07SSong Gao     }
1862d5f950cSSong Gao }
1872d5f950cSSong Gao 
1882d5f950cSSong Gao DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB)
1892d5f950cSSong Gao DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB)
1902d5f950cSSong Gao DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB)
1912d5f950cSSong Gao 
19285995f07SSong Gao void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1932d5f950cSSong Gao {
19485995f07SSong Gao     int i;
1952d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1962d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1972d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
19885995f07SSong Gao     int oprsz = simd_oprsz(desc);
1992d5f950cSSong Gao 
20085995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
20185995f07SSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
20285995f07SSong Gao                               int128_makes64(Vk->D(2 * i + 1)));
20385995f07SSong Gao     }
2042d5f950cSSong Gao }
2052d5f950cSSong Gao 
2062d5f950cSSong Gao DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB)
2072d5f950cSSong Gao DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB)
2082d5f950cSSong Gao DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB)
2092d5f950cSSong Gao 
21085995f07SSong Gao void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2112d5f950cSSong Gao {
21285995f07SSong Gao     int i;
2132d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2142d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2152d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
21685995f07SSong Gao     int oprsz = simd_oprsz(desc);
2172d5f950cSSong Gao 
21885995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
21985995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
22085995f07SSong Gao                               int128_make64(Vk->UD(2 * i)));
22185995f07SSong Gao     }
2222d5f950cSSong Gao }
2232d5f950cSSong Gao 
2242d5f950cSSong Gao DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD)
2252d5f950cSSong Gao DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD)
2262d5f950cSSong Gao DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD)
2272d5f950cSSong Gao 
22885995f07SSong Gao void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2292d5f950cSSong Gao {
23085995f07SSong Gao     int i;
2312d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2322d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2332d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
23485995f07SSong Gao     int oprsz = simd_oprsz(desc);
2352d5f950cSSong Gao 
23685995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
23785995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
23885995f07SSong Gao                               int128_make64(Vk->UD(2 * i + 1)));
23985995f07SSong Gao     }
2402d5f950cSSong Gao }
2412d5f950cSSong Gao 
2422d5f950cSSong Gao DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD)
2432d5f950cSSong Gao DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD)
2442d5f950cSSong Gao DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD)
2452d5f950cSSong Gao 
24685995f07SSong Gao void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2472d5f950cSSong Gao {
24885995f07SSong Gao     int i;
2492d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2502d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2512d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
25285995f07SSong Gao     int oprsz = simd_oprsz(desc);
2532d5f950cSSong Gao 
25485995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
25585995f07SSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)),
25685995f07SSong Gao                               int128_make64(Vk->UD(2 * i)));
25785995f07SSong Gao     }
2582d5f950cSSong Gao }
2592d5f950cSSong Gao 
2602d5f950cSSong Gao DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB)
2612d5f950cSSong Gao DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB)
2622d5f950cSSong Gao DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB)
2632d5f950cSSong Gao 
26485995f07SSong Gao void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2652d5f950cSSong Gao {
26685995f07SSong Gao     int i;
2672d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2682d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2692d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
27085995f07SSong Gao     int oprsz = simd_oprsz(desc);
2712d5f950cSSong Gao 
27285995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
27385995f07SSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
27485995f07SSong Gao                               int128_make64(Vk->UD(2 * i + 1)));
27585995f07SSong Gao     }
2762d5f950cSSong Gao }
2772d5f950cSSong Gao 
2782d5f950cSSong Gao DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB)
2792d5f950cSSong Gao DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB)
2802d5f950cSSong Gao DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB)
2812d5f950cSSong Gao 
/*
 * DO_EVEN_U_S - emit a mixed-accessor widening helper working on the
 * even-indexed source elements.
 *
 *   NAME:     helper name handed to HELPER()
 *   BIT:      width in bits of one destination element
 *   ES1/EU1:  destination-width accessors; ES1 is also used for the store
 *   ES2/EU2:  source-width accessors; vj is read via EU2, vk via ES2
 *   DO_OP:    two-operand macro combining the converted operands
 *
 * Destination element n is DO_OP(vj.EU2(2n), vk.ES2(2n)); the vj operand
 * is cast to the EU1 element type and the vk operand to the ES1 element
 * type before the operation.
 */
#define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    VReg *dst = vd; \
    VReg *src_j = vj; \
    VReg *src_k = vk; \
    typedef __typeof(dst->ES1(0)) SignedT; \
    typedef __typeof(dst->EU1(0)) UnsignedT; \
    const int count = simd_oprsz(desc) / (BIT / 8); \
    \
    for (int n = 0; n < count; n++) { \
        UnsignedT lhs = (UnsignedT)src_j->EU2(2 * n); \
        SignedT rhs = (SignedT)src_k->ES2(2 * n); \
        \
        dst->ES1(n) = DO_OP(lhs, rhs); \
    } \
}
2972d5f950cSSong Gao 
/*
 * DO_ODD_U_S - emit a mixed-accessor widening helper working on the
 * odd-indexed source elements.
 *
 *   NAME:     helper name handed to HELPER()
 *   BIT:      width in bits of one destination element
 *   ES1/EU1:  destination-width accessors; ES1 is also used for the store
 *   ES2/EU2:  source-width accessors; vj is read via EU2, vk via ES2
 *   DO_OP:    two-operand macro combining the converted operands
 *
 * Destination element n is DO_OP(vj.EU2(2n + 1), vk.ES2(2n + 1)); the vj
 * operand is cast to the EU1 element type and the vk operand to the ES1
 * element type before the operation.
 */
#define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    VReg *dst = vd; \
    VReg *src_j = vj; \
    VReg *src_k = vk; \
    typedef __typeof(dst->ES1(0)) SignedT; \
    typedef __typeof(dst->EU1(0)) UnsignedT; \
    const int count = simd_oprsz(desc) / (BIT / 8); \
    \
    for (int n = 0; n < count; n++) { \
        UnsignedT lhs = (UnsignedT)src_j->EU2(2 * n + 1); \
        SignedT rhs = (SignedT)src_k->ES2(2 * n + 1); \
        \
        dst->ES1(n) = DO_OP(lhs, rhs); \
    } \
}
3132d5f950cSSong Gao 
31485995f07SSong Gao void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
3152d5f950cSSong Gao {
31685995f07SSong Gao     int i;
3172d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
3182d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
3192d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
32085995f07SSong Gao     int oprsz = simd_oprsz(desc);
3212d5f950cSSong Gao 
32285995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
32385995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
32485995f07SSong Gao                               int128_makes64(Vk->D(2 * i)));
32585995f07SSong Gao     }
3262d5f950cSSong Gao }
3272d5f950cSSong Gao 
3282d5f950cSSong Gao DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD)
3292d5f950cSSong Gao DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD)
3302d5f950cSSong Gao DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD)
3312d5f950cSSong Gao 
33285995f07SSong Gao void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
3332d5f950cSSong Gao {
33485995f07SSong Gao     int i;
3352d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
3362d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
3372d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
33885995f07SSong Gao     int oprsz = simd_oprsz(desc);
3392d5f950cSSong Gao 
34085995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
34185995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
34285995f07SSong Gao                               int128_makes64(Vk->D(2 * i + 1)));
34385995f07SSong Gao     }
3442d5f950cSSong Gao }
3452d5f950cSSong Gao 
3462d5f950cSSong Gao DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD)
3472d5f950cSSong Gao DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD)
3482d5f950cSSong Gao DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD)
34939e9b0a7SSong Gao 
/*
 * Averages computed without forming the full sum a + b:
 *   DO_VAVG  adds 1 only when both low bits are set,
 *   DO_VAVGR adds 1 when either low bit is set.
 * Arguments are parenthesized so expression operands keep their value;
 * note that each argument is still evaluated more than once.
 */
#define DO_VAVG(a, b)  (((a) >> 1) + ((b) >> 1) + ((a) & (b) & 1))
#define DO_VAVGR(a, b) (((a) >> 1) + ((b) >> 1) + (((a) | (b)) & 1))
35239e9b0a7SSong Gao 
35339e9b0a7SSong Gao #define DO_3OP(NAME, BIT, E, DO_OP)                            \
354ee7250d0SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
35539e9b0a7SSong Gao {                                                              \
35639e9b0a7SSong Gao     int i;                                                     \
35739e9b0a7SSong Gao     VReg *Vd = (VReg *)vd;                                     \
35839e9b0a7SSong Gao     VReg *Vj = (VReg *)vj;                                     \
35939e9b0a7SSong Gao     VReg *Vk = (VReg *)vk;                                     \
360ee7250d0SSong Gao     int oprsz = simd_oprsz(desc);                              \
361ee7250d0SSong Gao                                                                \
362ee7250d0SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
36339e9b0a7SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i));                  \
36439e9b0a7SSong Gao     }                                                          \
36539e9b0a7SSong Gao }
36639e9b0a7SSong Gao 
36739e9b0a7SSong Gao DO_3OP(vavg_b, 8, B, DO_VAVG)
36839e9b0a7SSong Gao DO_3OP(vavg_h, 16, H, DO_VAVG)
36939e9b0a7SSong Gao DO_3OP(vavg_w, 32, W, DO_VAVG)
37039e9b0a7SSong Gao DO_3OP(vavg_d, 64, D, DO_VAVG)
37139e9b0a7SSong Gao DO_3OP(vavgr_b, 8, B, DO_VAVGR)
37239e9b0a7SSong Gao DO_3OP(vavgr_h, 16, H, DO_VAVGR)
37339e9b0a7SSong Gao DO_3OP(vavgr_w, 32, W, DO_VAVGR)
37439e9b0a7SSong Gao DO_3OP(vavgr_d, 64, D, DO_VAVGR)
37539e9b0a7SSong Gao DO_3OP(vavg_bu, 8, UB, DO_VAVG)
37639e9b0a7SSong Gao DO_3OP(vavg_hu, 16, UH, DO_VAVG)
37739e9b0a7SSong Gao DO_3OP(vavg_wu, 32, UW, DO_VAVG)
37839e9b0a7SSong Gao DO_3OP(vavg_du, 64, UD, DO_VAVG)
37939e9b0a7SSong Gao DO_3OP(vavgr_bu, 8, UB, DO_VAVGR)
38039e9b0a7SSong Gao DO_3OP(vavgr_hu, 16, UH, DO_VAVGR)
38139e9b0a7SSong Gao DO_3OP(vavgr_wu, 32, UW, DO_VAVGR)
38239e9b0a7SSong Gao DO_3OP(vavgr_du, 64, UD, DO_VAVGR)
38349725659SSong Gao 
/*
 * Absolute difference: the larger operand minus the smaller one.
 * Arguments are parenthesized so expression operands are compared and
 * subtracted as a whole; each argument is evaluated more than once.
 */
#define DO_VABSD(a, b)  (((a) > (b)) ? ((a) - (b)) : ((b) - (a)))
38549725659SSong Gao 
38649725659SSong Gao DO_3OP(vabsd_b, 8, B, DO_VABSD)
38749725659SSong Gao DO_3OP(vabsd_h, 16, H, DO_VABSD)
38849725659SSong Gao DO_3OP(vabsd_w, 32, W, DO_VABSD)
38949725659SSong Gao DO_3OP(vabsd_d, 64, D, DO_VABSD)
39049725659SSong Gao DO_3OP(vabsd_bu, 8, UB, DO_VABSD)
39149725659SSong Gao DO_3OP(vabsd_hu, 16, UH, DO_VABSD)
39249725659SSong Gao DO_3OP(vabsd_wu, 32, UW, DO_VABSD)
39349725659SSong Gao DO_3OP(vabsd_du, 64, UD, DO_VABSD)
394af448cb3SSong Gao 
/*
 * Absolute value.  The argument is parenthesized so that an expression
 * operand is negated as a whole rather than only its first term; the
 * argument is evaluated more than once.
 */
#define DO_VABS(a)  (((a) < 0) ? (-(a)) : (a))
396af448cb3SSong Gao 
39727f5485dSSong Gao #define DO_VADDA(NAME, BIT, E)                                 \
39827f5485dSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
399af448cb3SSong Gao {                                                              \
400af448cb3SSong Gao     int i;                                                     \
401af448cb3SSong Gao     VReg *Vd = (VReg *)vd;                                     \
402af448cb3SSong Gao     VReg *Vj = (VReg *)vj;                                     \
403af448cb3SSong Gao     VReg *Vk = (VReg *)vk;                                     \
40427f5485dSSong Gao     int oprsz = simd_oprsz(desc);                              \
40527f5485dSSong Gao                                                                \
40627f5485dSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
40727f5485dSSong Gao         Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i));      \
408af448cb3SSong Gao     }                                                          \
409af448cb3SSong Gao }
410af448cb3SSong Gao 
41127f5485dSSong Gao DO_VADDA(vadda_b, 8, B)
41227f5485dSSong Gao DO_VADDA(vadda_h, 16, H)
41327f5485dSSong Gao DO_VADDA(vadda_w, 32, W)
41427f5485dSSong Gao DO_VADDA(vadda_d, 64, D)
4159ab29520SSong Gao 
/*
 * Minimum / maximum of two operands.  Arguments are parenthesized so
 * that expression operands (for example a conditional expression) are
 * compared as a whole; each argument is evaluated more than once.
 */
#define DO_MIN(a, b) ((a) < (b) ? (a) : (b))
#define DO_MAX(a, b) ((a) > (b) ? (a) : (b))
4189ab29520SSong Gao 
4199ab29520SSong Gao #define VMINMAXI(NAME, BIT, E, DO_OP)                              \
420c09360faSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
4219ab29520SSong Gao {                                                                  \
4229ab29520SSong Gao     int i;                                                         \
4239ab29520SSong Gao     VReg *Vd = (VReg *)vd;                                         \
4249ab29520SSong Gao     VReg *Vj = (VReg *)vj;                                         \
4259ab29520SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
426c09360faSSong Gao     int oprsz = simd_oprsz(desc);                                  \
4279ab29520SSong Gao                                                                    \
428c09360faSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
4299ab29520SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), (TD)imm);                       \
4309ab29520SSong Gao     }                                                              \
4319ab29520SSong Gao }
4329ab29520SSong Gao 
4339ab29520SSong Gao VMINMAXI(vmini_b, 8, B, DO_MIN)
4349ab29520SSong Gao VMINMAXI(vmini_h, 16, H, DO_MIN)
4359ab29520SSong Gao VMINMAXI(vmini_w, 32, W, DO_MIN)
4369ab29520SSong Gao VMINMAXI(vmini_d, 64, D, DO_MIN)
4379ab29520SSong Gao VMINMAXI(vmaxi_b, 8, B, DO_MAX)
4389ab29520SSong Gao VMINMAXI(vmaxi_h, 16, H, DO_MAX)
4399ab29520SSong Gao VMINMAXI(vmaxi_w, 32, W, DO_MAX)
4409ab29520SSong Gao VMINMAXI(vmaxi_d, 64, D, DO_MAX)
4419ab29520SSong Gao VMINMAXI(vmini_bu, 8, UB, DO_MIN)
4429ab29520SSong Gao VMINMAXI(vmini_hu, 16, UH, DO_MIN)
4439ab29520SSong Gao VMINMAXI(vmini_wu, 32, UW, DO_MIN)
4449ab29520SSong Gao VMINMAXI(vmini_du, 64, UD, DO_MIN)
4459ab29520SSong Gao VMINMAXI(vmaxi_bu, 8, UB, DO_MAX)
4469ab29520SSong Gao VMINMAXI(vmaxi_hu, 16, UH, DO_MAX)
4479ab29520SSong Gao VMINMAXI(vmaxi_wu, 32, UW, DO_MAX)
4489ab29520SSong Gao VMINMAXI(vmaxi_du, 64, UD, DO_MAX)
449cd1c49adSSong Gao 
/*
 * DO_VMUH - emit a helper that multiplies matching elements and keeps
 * the product shifted right by BIT.
 *
 *   NAME:  helper name handed to HELPER()
 *   BIT:   width in bits of one source/destination element; also the
 *          right-shift applied to the product
 *   E1:    VReg accessor whose element type is used for the multiply
 *   E2:    VReg accessor used for the sources and the destination
 *   DO_OP: accepted for symmetry with the other generators; the
 *          expansion does not use it
 *
 * Element n of vd becomes ((T)vj.E2(n) * (T)vk.E2(n)) >> BIT, where T is
 * the element type of E1.
 */
#define DO_VMUH(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    VReg *dst = vd; \
    VReg *src_j = vj; \
    VReg *src_k = vk; \
    typedef __typeof(dst->E1(0)) WideT; \
    const int count = simd_oprsz(desc) / (BIT / 8); \
    \
    for (int n = 0; n < count; n++) { \
        WideT lhs = (WideT)src_j->E2(n); \
        WideT rhs = (WideT)src_k->E2(n); \
        \
        dst->E2(n) = (lhs * rhs) >> BIT; \
    } \
}
464cd1c49adSSong Gao 
465342dc1cfSSong Gao void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc)
466cd1c49adSSong Gao {
467342dc1cfSSong Gao     int i;
468342dc1cfSSong Gao     uint64_t l, h;
469cd1c49adSSong Gao     VReg *Vd = (VReg *)vd;
470cd1c49adSSong Gao     VReg *Vj = (VReg *)vj;
471cd1c49adSSong Gao     VReg *Vk = (VReg *)vk;
472342dc1cfSSong Gao     int oprsz = simd_oprsz(desc);
473cd1c49adSSong Gao 
474342dc1cfSSong Gao     for (i = 0; i < oprsz / 8; i++) {
475342dc1cfSSong Gao         muls64(&l, &h, Vj->D(i), Vk->D(i));
476342dc1cfSSong Gao         Vd->D(i) = h;
477342dc1cfSSong Gao     }
478cd1c49adSSong Gao }
479cd1c49adSSong Gao 
480cd1c49adSSong Gao DO_VMUH(vmuh_b, 8, H, B, DO_MUH)
481cd1c49adSSong Gao DO_VMUH(vmuh_h, 16, W, H, DO_MUH)
482cd1c49adSSong Gao DO_VMUH(vmuh_w, 32, D, W, DO_MUH)
483cd1c49adSSong Gao 
484342dc1cfSSong Gao void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc)
485cd1c49adSSong Gao {
486342dc1cfSSong Gao     int i;
487342dc1cfSSong Gao     uint64_t l, h;
488cd1c49adSSong Gao     VReg *Vd = (VReg *)vd;
489cd1c49adSSong Gao     VReg *Vj = (VReg *)vj;
490cd1c49adSSong Gao     VReg *Vk = (VReg *)vk;
491342dc1cfSSong Gao     int oprsz = simd_oprsz(desc);
492cd1c49adSSong Gao 
493342dc1cfSSong Gao     for (i = 0; i < oprsz / 8; i++) {
494342dc1cfSSong Gao         mulu64(&l, &h, Vj->D(i), Vk->D(i));
495342dc1cfSSong Gao         Vd->D(i) = h;
496342dc1cfSSong Gao     }
497cd1c49adSSong Gao }
498cd1c49adSSong Gao 
499cd1c49adSSong Gao DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH)
500cd1c49adSSong Gao DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH)
501cd1c49adSSong Gao DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH)
502cd1c49adSSong Gao 
/*
 * Product of two operands.  Arguments are parenthesized so that an
 * expression operand is multiplied as a whole.
 */
#define DO_MUL(a, b) ((a) * (b))
504cd1c49adSSong Gao 
505cd1c49adSSong Gao DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL)
506cd1c49adSSong Gao DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL)
507cd1c49adSSong Gao DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL)
508cd1c49adSSong Gao 
509cd1c49adSSong Gao DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL)
510cd1c49adSSong Gao DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL)
511cd1c49adSSong Gao DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL)
512cd1c49adSSong Gao 
513cd1c49adSSong Gao DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL)
514cd1c49adSSong Gao DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL)
515cd1c49adSSong Gao DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL)
516cd1c49adSSong Gao 
517cd1c49adSSong Gao DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL)
518cd1c49adSSong Gao DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL)
519cd1c49adSSong Gao DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL)
520cd1c49adSSong Gao 
521cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
522cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
523cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
524cd1c49adSSong Gao 
525cd1c49adSSong Gao DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
526cd1c49adSSong Gao DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
527cd1c49adSSong Gao DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
528d3aec65bSSong Gao 
/*
 * Multiply-accumulate primitives: a plus / minus the product b * c.
 * Arguments are parenthesized so that expression operands take part in
 * the product and the sum as a whole.
 */
#define DO_MADD(a, b, c)  ((a) + (b) * (c))
#define DO_MSUB(a, b, c)  ((a) - (b) * (c))
531d3aec65bSSong Gao 
532d3aec65bSSong Gao #define VMADDSUB(NAME, BIT, E, DO_OP)                          \
5333f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
534d3aec65bSSong Gao {                                                              \
535d3aec65bSSong Gao     int i;                                                     \
536d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
537d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
538d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
5393f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
5403f450c17SSong Gao                                                                \
5413f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
542d3aec65bSSong Gao         Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i));        \
543d3aec65bSSong Gao     }                                                          \
544d3aec65bSSong Gao }
545d3aec65bSSong Gao 
546d3aec65bSSong Gao VMADDSUB(vmadd_b, 8, B, DO_MADD)
547d3aec65bSSong Gao VMADDSUB(vmadd_h, 16, H, DO_MADD)
548d3aec65bSSong Gao VMADDSUB(vmadd_w, 32, W, DO_MADD)
549d3aec65bSSong Gao VMADDSUB(vmadd_d, 64, D, DO_MADD)
550d3aec65bSSong Gao VMADDSUB(vmsub_b, 8, B, DO_MSUB)
551d3aec65bSSong Gao VMADDSUB(vmsub_h, 16, H, DO_MSUB)
552d3aec65bSSong Gao VMADDSUB(vmsub_w, 32, W, DO_MSUB)
553d3aec65bSSong Gao VMADDSUB(vmsub_d, 64, D, DO_MSUB)
554d3aec65bSSong Gao 
555d3aec65bSSong Gao #define VMADDWEV(NAME, BIT, E1, E2, DO_OP)                        \
5563f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)    \
557d3aec65bSSong Gao {                                                                 \
558d3aec65bSSong Gao     int i;                                                        \
559d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                        \
560d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                        \
561d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                        \
562d3aec65bSSong Gao     typedef __typeof(Vd->E1(0)) TD;                               \
5633f450c17SSong Gao     int oprsz = simd_oprsz(desc);                                 \
564d3aec65bSSong Gao                                                                   \
5653f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                     \
566d3aec65bSSong Gao         Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
567d3aec65bSSong Gao     }                                                             \
568d3aec65bSSong Gao }
569d3aec65bSSong Gao 
570d3aec65bSSong Gao VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL)
571d3aec65bSSong Gao VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL)
572d3aec65bSSong Gao VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL)
573d3aec65bSSong Gao VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL)
574d3aec65bSSong Gao VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL)
575d3aec65bSSong Gao VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL)
576d3aec65bSSong Gao 
577d3aec65bSSong Gao #define VMADDWOD(NAME, BIT, E1, E2, DO_OP)                     \
5783f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
579d3aec65bSSong Gao {                                                              \
580d3aec65bSSong Gao     int i;                                                     \
581d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
582d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
583d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
584d3aec65bSSong Gao     typedef __typeof(Vd->E1(0)) TD;                            \
5853f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
586d3aec65bSSong Gao                                                                \
5873f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
588d3aec65bSSong Gao         Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1),              \
589d3aec65bSSong Gao                            (TD)Vk->E2(2 * i + 1));             \
590d3aec65bSSong Gao     }                                                          \
591d3aec65bSSong Gao }
592d3aec65bSSong Gao 
593d3aec65bSSong Gao VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL)
594d3aec65bSSong Gao VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL)
595d3aec65bSSong Gao VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL)
596d3aec65bSSong Gao VMADDWOD(vmaddwod_h_bu, 16,  UH, UB, DO_MUL)
597d3aec65bSSong Gao VMADDWOD(vmaddwod_w_hu, 32,  UW, UH, DO_MUL)
598d3aec65bSSong Gao VMADDWOD(vmaddwod_d_wu, 64,  UD, UW, DO_MUL)
599d3aec65bSSong Gao 
600d3aec65bSSong Gao #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)     \
6013f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
602d3aec65bSSong Gao {                                                              \
603d3aec65bSSong Gao     int i;                                                     \
604d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
605d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
606d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
607d3aec65bSSong Gao     typedef __typeof(Vd->ES1(0)) TS1;                          \
608d3aec65bSSong Gao     typedef __typeof(Vd->EU1(0)) TU1;                          \
6093f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
610d3aec65bSSong Gao                                                                \
6113f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
612d3aec65bSSong Gao         Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i),               \
613d3aec65bSSong Gao                             (TS1)Vk->ES2(2 * i));              \
614d3aec65bSSong Gao     }                                                          \
615d3aec65bSSong Gao }
616d3aec65bSSong Gao 
617d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
618d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
619d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
620d3aec65bSSong Gao 
621d3aec65bSSong Gao #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)     \
6223f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
623d3aec65bSSong Gao {                                                              \
624d3aec65bSSong Gao     int i;                                                     \
625d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
626d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
627d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
628d3aec65bSSong Gao     typedef __typeof(Vd->ES1(0)) TS1;                          \
629d3aec65bSSong Gao     typedef __typeof(Vd->EU1(0)) TU1;                          \
6303f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
631d3aec65bSSong Gao                                                                \
6323f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
633d3aec65bSSong Gao         Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1),           \
634d3aec65bSSong Gao                             (TS1)Vk->ES2(2 * i + 1));          \
635d3aec65bSSong Gao     }                                                          \
636d3aec65bSSong Gao }
637d3aec65bSSong Gao 
638d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
639d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
640d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
6414cc4c0f7SSong Gao 
/*
 * Division / remainder primitives that never execute a trapping divide.
 *
 *   DO_DIVU, DO_REMU: a zero divisor yields 0.
 *   DO_DIV,  DO_REM : a zero divisor yields 0; additionally, when N equals
 *                     its own negation and M is -1 the divide is skipped,
 *                     DO_DIV yielding N and DO_REM yielding 0.
 *
 * All arguments are parenthesised so that compound expressions passed as
 * N or M keep their own grouping.  N and M are evaluated more than once.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? 0 : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? 0 : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? 0 :                           \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? (N) :     \
        (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? 0 :                           \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? 0 :       \
        (N) % (M))
6484cc4c0f7SSong Gao 
6494cc4c0f7SSong Gao #define VDIV(NAME, BIT, E, DO_OP)                              \
65004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
6514cc4c0f7SSong Gao {                                                              \
6524cc4c0f7SSong Gao     int i;                                                     \
65304711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
65404711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
65504711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
656abb693deSSong Gao     int oprsz = simd_oprsz(desc);                              \
657abb693deSSong Gao                                                                \
658abb693deSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
6594cc4c0f7SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i));                  \
6604cc4c0f7SSong Gao     }                                                          \
6614cc4c0f7SSong Gao }
6624cc4c0f7SSong Gao 
6634cc4c0f7SSong Gao VDIV(vdiv_b, 8, B, DO_DIV)
6644cc4c0f7SSong Gao VDIV(vdiv_h, 16, H, DO_DIV)
6654cc4c0f7SSong Gao VDIV(vdiv_w, 32, W, DO_DIV)
6664cc4c0f7SSong Gao VDIV(vdiv_d, 64, D, DO_DIV)
6674cc4c0f7SSong Gao VDIV(vdiv_bu, 8, UB, DO_DIVU)
6684cc4c0f7SSong Gao VDIV(vdiv_hu, 16, UH, DO_DIVU)
6694cc4c0f7SSong Gao VDIV(vdiv_wu, 32, UW, DO_DIVU)
6704cc4c0f7SSong Gao VDIV(vdiv_du, 64, UD, DO_DIVU)
6714cc4c0f7SSong Gao VDIV(vmod_b, 8, B, DO_REM)
6724cc4c0f7SSong Gao VDIV(vmod_h, 16, H, DO_REM)
6734cc4c0f7SSong Gao VDIV(vmod_w, 32, W, DO_REM)
6744cc4c0f7SSong Gao VDIV(vmod_d, 64, D, DO_REM)
6754cc4c0f7SSong Gao VDIV(vmod_bu, 8, UB, DO_REMU)
6764cc4c0f7SSong Gao VDIV(vmod_hu, 16, UH, DO_REMU)
6774cc4c0f7SSong Gao VDIV(vmod_wu, 32, UW, DO_REMU)
6784cc4c0f7SSong Gao VDIV(vmod_du, 64, UD, DO_REMU)
679cbe44190SSong Gao 
680cbe44190SSong Gao #define VSAT_S(NAME, BIT, E)                                       \
681e5c7f031SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
682cbe44190SSong Gao {                                                                  \
683cbe44190SSong Gao     int i;                                                         \
684cbe44190SSong Gao     VReg *Vd = (VReg *)vd;                                         \
685cbe44190SSong Gao     VReg *Vj = (VReg *)vj;                                         \
686cbe44190SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
687e5c7f031SSong Gao     int oprsz = simd_oprsz(desc);                                  \
688cbe44190SSong Gao                                                                    \
689e5c7f031SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
690cbe44190SSong Gao         Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max :                  \
691cbe44190SSong Gao                    Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i);       \
692cbe44190SSong Gao     }                                                              \
693cbe44190SSong Gao }
694cbe44190SSong Gao 
695cbe44190SSong Gao VSAT_S(vsat_b, 8, B)
696cbe44190SSong Gao VSAT_S(vsat_h, 16, H)
697cbe44190SSong Gao VSAT_S(vsat_w, 32, W)
698cbe44190SSong Gao VSAT_S(vsat_d, 64, D)
699cbe44190SSong Gao 
700cbe44190SSong Gao #define VSAT_U(NAME, BIT, E)                                       \
701e5c7f031SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
702cbe44190SSong Gao {                                                                  \
703cbe44190SSong Gao     int i;                                                         \
704cbe44190SSong Gao     VReg *Vd = (VReg *)vd;                                         \
705cbe44190SSong Gao     VReg *Vj = (VReg *)vj;                                         \
706cbe44190SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
707e5c7f031SSong Gao     int oprsz = simd_oprsz(desc);                                  \
708cbe44190SSong Gao                                                                    \
709e5c7f031SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
710cbe44190SSong Gao         Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i);        \
711cbe44190SSong Gao     }                                                              \
712cbe44190SSong Gao }
713cbe44190SSong Gao 
714cbe44190SSong Gao VSAT_U(vsat_bu, 8, UB)
715cbe44190SSong Gao VSAT_U(vsat_hu, 16, UH)
716cbe44190SSong Gao VSAT_U(vsat_wu, 32, UW)
717cbe44190SSong Gao VSAT_U(vsat_du, 64, UD)
7183734ad93SSong Gao 
/*
 * VEXTH: generate a helper that copies source elements (accessor E2) into
 * destination elements (accessor E1), working in 16-byte groups.
 *
 * With per_lane = LSX_LEN / BIT, group g writes destination elements
 * [g * per_lane, (g + 1) * per_lane) from source elements starting at
 * per_lane + 2 * per_lane * g, i.e. the second run of per_lane elements
 * within each stretch of 2 * per_lane source elements.
 * The conversion is whatever assignment from an E2 element to an E1
 * element performs.
 */
#define VEXTH(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc)              \
{                                                                 \
    VReg *Vd = vd;                                                \
    VReg *Vj = vj;                                                \
    const int per_lane = LSX_LEN / BIT;                           \
    const int lanes = (int)simd_oprsz(desc) / 16;                 \
    int lane, k;                                                  \
                                                                  \
    for (lane = 0; lane < lanes; lane++) {                        \
        const int dst_base = lane * per_lane;                     \
        const int src_base = per_lane + per_lane * 2 * lane;      \
                                                                  \
        for (k = 0; k < per_lane; k++) {                          \
            Vd->E1(k + dst_base) = Vj->E2(k + src_base);          \
        }                                                         \
    }                                                             \
}
7343734ad93SSong Gao 
735ff27e335SSong Gao void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc)
7363734ad93SSong Gao {
737f0db0bebSSong Gao     int i;
738ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
739ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
740f0db0bebSSong Gao     int oprsz = simd_oprsz(desc);
7413734ad93SSong Gao 
742f0db0bebSSong Gao     for (i = 0; i < oprsz / 16; i++) {
743f0db0bebSSong Gao         Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1));
744f0db0bebSSong Gao     }
7453734ad93SSong Gao }
7463734ad93SSong Gao 
747ff27e335SSong Gao void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc)
7483734ad93SSong Gao {
749f0db0bebSSong Gao     int i;
750ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
751ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
752f0db0bebSSong Gao     int oprsz = simd_oprsz(desc);
7533734ad93SSong Gao 
754f0db0bebSSong Gao     for (i = 0; i < oprsz / 16; i++) {
755f0db0bebSSong Gao         Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1));
756f0db0bebSSong Gao     }
7573734ad93SSong Gao }
7583734ad93SSong Gao 
/*
 * Instantiate VEXTH (defined above) for each destination/source accessor
 * pair: first the signed accessors (H/B, W/H, D/W), then the unsigned
 * ones (UH/UB, UW/UH, UD/UW).  BIT is the destination element width.
 */
7593734ad93SSong Gao VEXTH(vexth_h_b, 16, H, B)
7603734ad93SSong Gao VEXTH(vexth_w_h, 32, W, H)
7613734ad93SSong Gao VEXTH(vexth_d_w, 64, D, W)
7623734ad93SSong Gao VEXTH(vexth_hu_bu, 16, UH, UB)
7633734ad93SSong Gao VEXTH(vexth_wu_hu, 32, UW, UH)
7643734ad93SSong Gao VEXTH(vexth_du_wu, 64, UD, UW)
765f0e395dfSSong Gao 
766790acb2aSSong Gao #define VEXT2XV(NAME, BIT, E1, E2)                   \
767790acb2aSSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
768790acb2aSSong Gao {                                                    \
769790acb2aSSong Gao     int i;                                           \
770790acb2aSSong Gao     VReg temp = {};                                  \
771790acb2aSSong Gao     VReg *Vd = (VReg *)vd;                           \
772790acb2aSSong Gao     VReg *Vj = (VReg *)vj;                           \
773790acb2aSSong Gao     int oprsz = simd_oprsz(desc);                    \
774790acb2aSSong Gao                                                      \
775790acb2aSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {        \
776790acb2aSSong Gao         temp.E1(i) = Vj->E2(i);                      \
777790acb2aSSong Gao     }                                                \
778790acb2aSSong Gao     *Vd = temp;                                      \
779790acb2aSSong Gao }
780790acb2aSSong Gao 
781790acb2aSSong Gao VEXT2XV(vext2xv_h_b, 16, H, B)
782790acb2aSSong Gao VEXT2XV(vext2xv_w_b, 32, W, B)
783790acb2aSSong Gao VEXT2XV(vext2xv_d_b, 64, D, B)
784790acb2aSSong Gao VEXT2XV(vext2xv_w_h, 32, W, H)
785790acb2aSSong Gao VEXT2XV(vext2xv_d_h, 64, D, H)
786790acb2aSSong Gao VEXT2XV(vext2xv_d_w, 64, D, W)
787790acb2aSSong Gao VEXT2XV(vext2xv_hu_bu, 16, UH, UB)
788790acb2aSSong Gao VEXT2XV(vext2xv_wu_bu, 32, UW, UB)
789790acb2aSSong Gao VEXT2XV(vext2xv_du_bu, 64, UD, UB)
790790acb2aSSong Gao VEXT2XV(vext2xv_wu_hu, 32, UW, UH)
791790acb2aSSong Gao VEXT2XV(vext2xv_du_hu, 64, UD, UH)
792790acb2aSSong Gao VEXT2XV(vext2xv_du_wu, 64, UD, UW)
793790acb2aSSong Gao 
/*
 * DO_SIGNCOV(a, b): 0 when a is zero, -b when a is negative, b otherwise.
 * Arguments are parenthesised so that compound expressions (notably a
 * compound b under the unary minus) keep their own grouping.
 */
#define DO_SIGNCOV(a, b)  ((a) == 0 ? 0 : (a) < 0 ? -(b) : (b))
795f0e395dfSSong Gao 
/*
 * Instantiate DO_3OP with DO_SIGNCOV for the B, H, W and D accessors.
 * NOTE(review): DO_3OP is defined earlier in this file; the argument
 * order used here follows that definition.
 */
796f0e395dfSSong Gao DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV)
797f0e395dfSSong Gao DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV)
798f0e395dfSSong Gao DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV)
799f0e395dfSSong Gao DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV)
800789f4a4cSSong Gao 
/*
 * Gather the top bit of each of the eight bytes in @val into an 8-bit
 * mask: bit k of the result is bit 7 of byte k (byte 0 = least
 * significant).
 */
static uint64_t do_vmskltz_b(int64_t val)
{
    const uint64_t sign_bits = 0x8080808080808080ULL;
    uint64_t acc = val & sign_bits;

    /* Shifts by 7, 14 and 28 funnel all eight flags into the top byte. */
    acc |= acc << 7;
    acc |= acc << 14;
    acc |= acc << 28;

    return acc >> 56;
}
810789f4a4cSSong Gao 
811ff27e335SSong Gao void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc)
812789f4a4cSSong Gao {
81397074674SSong Gao     int i;
814789f4a4cSSong Gao     uint16_t temp = 0;
815ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
816ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
81797074674SSong Gao     int oprsz = simd_oprsz(desc);
818789f4a4cSSong Gao 
81997074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
82097074674SSong Gao         temp = 0;
82197074674SSong Gao         temp = do_vmskltz_b(Vj->D(2 * i));
82297074674SSong Gao         temp |= (do_vmskltz_b(Vj->D(2 * i  + 1)) << 8);
82397074674SSong Gao         Vd->D(2 * i) = temp;
82497074674SSong Gao         Vd->D(2 * i + 1) = 0;
82597074674SSong Gao     }
826789f4a4cSSong Gao }
827789f4a4cSSong Gao 
/*
 * Gather the top bit of each of the four 16-bit fields in @val into a
 * 4-bit mask: bit k of the result is bit 15 of field k (field 0 = least
 * significant).
 */
static uint64_t do_vmskltz_h(int64_t val)
{
    const uint64_t sign_bits = 0x8000800080008000ULL;
    uint64_t acc = val & sign_bits;

    /* Shifts by 15 and 30 funnel all four flags into the top nibble. */
    acc |= acc << 15;
    acc |= acc << 30;

    return acc >> 60;
}
836789f4a4cSSong Gao 
837ff27e335SSong Gao void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc)
838789f4a4cSSong Gao {
83997074674SSong Gao     int i;
840789f4a4cSSong Gao     uint16_t temp = 0;
841ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
842ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
84397074674SSong Gao     int oprsz = simd_oprsz(desc);
844789f4a4cSSong Gao 
84597074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
84697074674SSong Gao         temp = 0;
84797074674SSong Gao         temp = do_vmskltz_h(Vj->D(2 * i));
84897074674SSong Gao         temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4);
84997074674SSong Gao         Vd->D(2 * i) = temp;
85097074674SSong Gao         Vd->D(2 * i + 1) = 0;
85197074674SSong Gao     }
852789f4a4cSSong Gao }
853789f4a4cSSong Gao 
/*
 * Gather the top bit of each of the two 32-bit fields in @val into a
 * 2-bit mask: bit 0 is bit 31 of @val, bit 1 is bit 63 of @val.
 */
static uint64_t do_vmskltz_w(int64_t val)
{
    const uint64_t sign_bits = 0x8000000080000000ULL;
    uint64_t acc = val & sign_bits;

    /* Move the low field's flag up next to the high field's flag. */
    acc |= acc << 31;

    return acc >> 62;
}
861789f4a4cSSong Gao 
862ff27e335SSong Gao void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc)
863789f4a4cSSong Gao {
86497074674SSong Gao     int i;
865789f4a4cSSong Gao     uint16_t temp = 0;
866ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
867ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
86897074674SSong Gao     int oprsz = simd_oprsz(desc);
869789f4a4cSSong Gao 
87097074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
87197074674SSong Gao         temp = 0;
87297074674SSong Gao         temp = do_vmskltz_w(Vj->D(2 * i));
87397074674SSong Gao         temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2);
87497074674SSong Gao         Vd->D(2 * i) = temp;
87597074674SSong Gao         Vd->D(2 * i + 1) = 0;
87697074674SSong Gao     }
877789f4a4cSSong Gao }
878789f4a4cSSong Gao 
/* Return bit 63 of @val as 0 or 1. */
static uint64_t do_vmskltz_d(int64_t val)
{
    const uint64_t bits = (uint64_t)val;

    return bits >> 63;
}
883ff27e335SSong Gao void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc)
884789f4a4cSSong Gao {
88597074674SSong Gao     int i;
886789f4a4cSSong Gao     uint16_t temp = 0;
887ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
888ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
88997074674SSong Gao     int oprsz = simd_oprsz(desc);
890789f4a4cSSong Gao 
89197074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
89297074674SSong Gao         temp = 0;
89397074674SSong Gao         temp = do_vmskltz_d(Vj->D(2 * i));
89497074674SSong Gao         temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1);
89597074674SSong Gao         Vd->D(2 * i) = temp;
89697074674SSong Gao         Vd->D(2 * i + 1) = 0;
89797074674SSong Gao     }
898789f4a4cSSong Gao }
899789f4a4cSSong Gao 
900ff27e335SSong Gao void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc)
901789f4a4cSSong Gao {
90297074674SSong Gao     int i;
903789f4a4cSSong Gao     uint16_t temp = 0;
904ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
905ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
90697074674SSong Gao     int oprsz = simd_oprsz(desc);
907789f4a4cSSong Gao 
90897074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
90997074674SSong Gao         temp = 0;
91097074674SSong Gao         temp =  do_vmskltz_b(Vj->D(2 * i));
91197074674SSong Gao         temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8);
91297074674SSong Gao         Vd->D(2 * i) = (uint16_t)(~temp);
91397074674SSong Gao         Vd->D(2 * i + 1) = 0;
91497074674SSong Gao     }
915789f4a4cSSong Gao }
916789f4a4cSSong Gao 
/*
 * Build an 8-bit mask from the eight bytes of @a: bit k of the result is
 * set when byte k (byte 0 = least significant) is zero.
 */
static uint64_t do_vmskez_b(uint64_t a)
{
    const uint64_t low7 = 0x7f7f7f7f7f7f7f7fULL;
    /*
     * (a & low7) + low7 carries into bit 7 of a byte iff its low seven
     * bits are non-zero; OR-ing in a covers bit 7 itself.  After the
     * complement only bit 7 of all-zero bytes remains set.
     */
    uint64_t zero_flags = ~(((a & low7) + low7) | a | low7);

    /* Funnel the eight per-byte flags into the top byte. */
    zero_flags |= zero_flags << 7;
    zero_flags |= zero_flags << 14;
    zero_flags |= zero_flags << 28;

    return zero_flags >> 56;
}
926789f4a4cSSong Gao 
927ff27e335SSong Gao void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc)
928789f4a4cSSong Gao {
92997074674SSong Gao     int i;
930789f4a4cSSong Gao     uint16_t temp = 0;
931ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
932ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
93397074674SSong Gao     int oprsz = simd_oprsz(desc);
934789f4a4cSSong Gao 
93597074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
93697074674SSong Gao         temp = 0;
93797074674SSong Gao         temp = do_vmskez_b(Vj->D(2 * i));
93897074674SSong Gao         temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8);
93997074674SSong Gao         Vd->D(2 * i) = (uint16_t)(~temp);
94097074674SSong Gao         Vd->D(2 * i + 1) = 0;
94197074674SSong Gao     }
942789f4a4cSSong Gao }
943f205a539SSong Gao 
9444472a45aSSong Gao void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc)
945f205a539SSong Gao {
946f205a539SSong Gao     int i;
947f205a539SSong Gao     VReg *Vd = (VReg *)vd;
948f205a539SSong Gao     VReg *Vj = (VReg *)vj;
949f205a539SSong Gao 
9504472a45aSSong Gao     for (i = 0; i < simd_oprsz(desc); i++) {
951f205a539SSong Gao         Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm);
952f205a539SSong Gao     }
953f205a539SSong Gao }
9549b21a7a5SSong Gao 
/*
 * VSLLWIL: generate a helper that converts source elements (accessor E2)
 * to the destination element type TD, shifts them left by imm % BIT and
 * stores them through accessor E1, working in 16-byte groups.
 *
 * With per_lane = LSX_LEN / BIT, group g writes destination elements
 * [g * per_lane, (g + 1) * per_lane) from source elements starting at
 * 2 * per_lane * g, i.e. the first run of per_lane elements within each
 * stretch of 2 * per_lane source elements.
 *
 * The result is assembled in a zero-initialised scratch register and
 * copied to *Vd at the end.
 */
#define VSLLWIL(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg temp = {};                                                   \
    VReg *Vd = vd;                                                    \
    VReg *Vj = vj;                                                    \
    typedef __typeof(temp.E1(0)) TD;                                  \
    const int per_lane = LSX_LEN / BIT;                               \
    const int lanes = (int)simd_oprsz(desc) / 16;                     \
    const uint64_t shamt = imm % BIT;                                 \
    int lane, k;                                                      \
                                                                      \
    for (lane = 0; lane < lanes; lane++) {                            \
        const int dst_base = per_lane * lane;                         \
        const int src_base = per_lane * 2 * lane;                     \
                                                                      \
        for (k = 0; k < per_lane; k++) {                              \
            temp.E1(k + dst_base) = (TD)Vj->E2(k + src_base) << shamt; \
        }                                                             \
    }                                                                 \
    *Vd = temp;                                                       \
}
9739b21a7a5SSong Gao 
9746567eac7SSong Gao 
975ff27e335SSong Gao void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc)
9769b21a7a5SSong Gao {
9776567eac7SSong Gao     int i;
978ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
979ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
9806567eac7SSong Gao     int oprsz = simd_oprsz(desc);
9819b21a7a5SSong Gao 
9826567eac7SSong Gao     for (i = 0; i < oprsz / 16; i++) {
9836567eac7SSong Gao         Vd->Q(i) = int128_makes64(Vj->D(2 * i));
9846567eac7SSong Gao     }
9859b21a7a5SSong Gao }
9869b21a7a5SSong Gao 
987ff27e335SSong Gao void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc)
9889b21a7a5SSong Gao {
9896567eac7SSong Gao     int i;
990ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
991ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
9926567eac7SSong Gao     int oprsz = simd_oprsz(desc);
9939b21a7a5SSong Gao 
9946567eac7SSong Gao     for (i = 0; i < oprsz / 16; i++) {
9956567eac7SSong Gao         Vd->Q(i) = int128_make64(Vj->UD(2 * i));
9966567eac7SSong Gao     }
9979b21a7a5SSong Gao }
9989b21a7a5SSong Gao 
/*
 * Instantiate VSLLWIL (defined above) for each destination/source accessor
 * pair: first the signed accessors (H/B, W/H, D/W), then the unsigned
 * ones (UH/UB, UW/UH, UD/UW).  BIT is the destination element width.
 */
9999b21a7a5SSong Gao VSLLWIL(vsllwil_h_b, 16, H, B)
10009b21a7a5SSong Gao VSLLWIL(vsllwil_w_h, 32, W, H)
10019b21a7a5SSong Gao VSLLWIL(vsllwil_d_w, 64, D, W)
10029b21a7a5SSong Gao VSLLWIL(vsllwil_hu_bu, 16, UH, UB)
10039b21a7a5SSong Gao VSLLWIL(vsllwil_wu_hu, 32, UW, UH)
10049b21a7a5SSong Gao VSLLWIL(vsllwil_du_wu, 64, UD, UW)
1005ecb93716SSong Gao 
/*
 * do_vsrlr(E, T): define do_vsrlr_<E>(), a right shift of a T value with
 * rounding.  A shift count of 0 returns the input unchanged; otherwise the
 * last bit shifted out (bit sh - 1) is added to the shifted value.
 * The caller must pass a shift count smaller than the width of T.
 */
#define do_vsrlr(E, T)                                  \
static T do_vsrlr_ ##E(T s1, int sh)                    \
{                                                       \
    T round;                                            \
                                                        \
    if (sh == 0) {                                      \
        return s1;                                      \
    }                                                   \
    round = (s1 >> (sh - 1)) & 0x1;                     \
    return (s1 >> sh) + round;                          \
}

/* Unsigned element types, so the shift is a logical one. */
do_vsrlr(B, uint8_t)
do_vsrlr(H, uint16_t)
do_vsrlr(W, uint32_t)
do_vsrlr(D, uint64_t)
1020ecb93716SSong Gao 
1021ecb93716SSong Gao #define VSRLR(NAME, BIT, T, E)                                  \
102204711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
1023ecb93716SSong Gao {                                                               \
1024ecb93716SSong Gao     int i;                                                      \
102504711da1SSong Gao     VReg *Vd = (VReg *)vd;                                      \
102604711da1SSong Gao     VReg *Vj = (VReg *)vj;                                      \
102704711da1SSong Gao     VReg *Vk = (VReg *)vk;                                      \
10288c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                               \
1029ecb93716SSong Gao                                                                 \
10308c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                   \
1031ecb93716SSong Gao         Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
1032ecb93716SSong Gao     }                                                           \
1033ecb93716SSong Gao }
1034ecb93716SSong Gao 
1035ecb93716SSong Gao VSRLR(vsrlr_b, 8,  uint8_t, B)
1036ecb93716SSong Gao VSRLR(vsrlr_h, 16, uint16_t, H)
1037ecb93716SSong Gao VSRLR(vsrlr_w, 32, uint32_t, W)
1038ecb93716SSong Gao VSRLR(vsrlr_d, 64, uint64_t, D)
1039ecb93716SSong Gao 
1040ecb93716SSong Gao #define VSRLRI(NAME, BIT, E)                                       \
1041329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1042ecb93716SSong Gao {                                                                  \
1043ecb93716SSong Gao     int i;                                                         \
1044329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
1045329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
10468c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                                  \
1047ecb93716SSong Gao                                                                    \
10488c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
1049ecb93716SSong Gao         Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm);                  \
1050ecb93716SSong Gao     }                                                              \
1051ecb93716SSong Gao }
1052ecb93716SSong Gao 
1053ecb93716SSong Gao VSRLRI(vsrlri_b, 8, B)
1054ecb93716SSong Gao VSRLRI(vsrlri_h, 16, H)
1055ecb93716SSong Gao VSRLRI(vsrlri_w, 32, W)
1056ecb93716SSong Gao VSRLRI(vsrlri_d, 64, D)
1057ecb93716SSong Gao 
/*
 * do_vsrar(E, T): define do_vsrar_<E>(), a right shift of a T value with
 * rounding.  A shift count of 0 returns the input unchanged; otherwise the
 * last bit shifted out (bit sh - 1) is added to the shifted value.
 * The caller must pass a shift count smaller than the width of T.
 */
#define do_vsrar(E, T)                                  \
static T do_vsrar_ ##E(T s1, int sh)                    \
{                                                       \
    T round;                                            \
                                                        \
    if (sh == 0) {                                      \
        return s1;                                      \
    }                                                   \
    round = (s1 >> (sh - 1)) & 0x1;                     \
    return (s1 >> sh) + round;                          \
}

/* Signed element types, so the shift operates on signed values. */
do_vsrar(B, int8_t)
do_vsrar(H, int16_t)
do_vsrar(W, int32_t)
do_vsrar(D, int64_t)
1072ecb93716SSong Gao 
1073ecb93716SSong Gao #define VSRAR(NAME, BIT, T, E)                                  \
107404711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
1075ecb93716SSong Gao {                                                               \
1076ecb93716SSong Gao     int i;                                                      \
107704711da1SSong Gao     VReg *Vd = (VReg *)vd;                                      \
107804711da1SSong Gao     VReg *Vj = (VReg *)vj;                                      \
107904711da1SSong Gao     VReg *Vk = (VReg *)vk;                                      \
10808c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                               \
1081ecb93716SSong Gao                                                                 \
10828c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                   \
1083ecb93716SSong Gao         Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
1084ecb93716SSong Gao     }                                                           \
1085ecb93716SSong Gao }
1086ecb93716SSong Gao 
1087ecb93716SSong Gao VSRAR(vsrar_b, 8,  uint8_t, B)
1088ecb93716SSong Gao VSRAR(vsrar_h, 16, uint16_t, H)
1089ecb93716SSong Gao VSRAR(vsrar_w, 32, uint32_t, W)
1090ecb93716SSong Gao VSRAR(vsrar_d, 64, uint64_t, D)
1091ecb93716SSong Gao 
1092ecb93716SSong Gao #define VSRARI(NAME, BIT, E)                                       \
1093329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1094ecb93716SSong Gao {                                                                  \
1095ecb93716SSong Gao     int i;                                                         \
1096329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
1097329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
10988c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                                  \
1099ecb93716SSong Gao                                                                    \
11008c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
1101ecb93716SSong Gao         Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm);                  \
1102ecb93716SSong Gao     }                                                              \
1103ecb93716SSong Gao }
1104ecb93716SSong Gao 
1105ecb93716SSong Gao VSRARI(vsrari_b, 8, B)
1106ecb93716SSong Gao VSRARI(vsrari_h, 16, H)
1107ecb93716SSong Gao VSRARI(vsrari_w, 32, W)
1108ecb93716SSong Gao VSRARI(vsrari_d, 64, D)
1109d79fb8ddSSong Gao 
/*
 * Right shift of a by b.  Both arguments are parenthesized so that operand
 * expressions containing lower-precedence operators (|, &, ?:, ...) expand
 * as the caller wrote them.  Whether the shift is logical or arithmetic
 * follows from the (promoted) type of a.
 */
#define R_SHIFT(a, b) ((a) >> (b))
1111d79fb8ddSSong Gao 
111240c7674eSSong Gao #define VSRLN(NAME, BIT, E1, E2)                                          \
111304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
1114d79fb8ddSSong Gao {                                                                         \
111540c7674eSSong Gao     int i, j, ofs;                                                        \
111604711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
111704711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
111804711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
111940c7674eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
1120d79fb8ddSSong Gao                                                                           \
112140c7674eSSong Gao     ofs = LSX_LEN / BIT;                                                  \
112240c7674eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
112340c7674eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
112440c7674eSSong Gao             Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i),        \
112540c7674eSSong Gao                                               Vk->E2(j + ofs * i) % BIT); \
1126d79fb8ddSSong Gao         }                                                                 \
112740c7674eSSong Gao         Vd->D(2 * i + 1) = 0;                                             \
112840c7674eSSong Gao     }                                                                     \
1129d79fb8ddSSong Gao }
1130d79fb8ddSSong Gao 
113140c7674eSSong Gao VSRLN(vsrln_b_h, 16, B, UH)
113240c7674eSSong Gao VSRLN(vsrln_h_w, 32, H, UW)
113340c7674eSSong Gao VSRLN(vsrln_w_d, 64, W, UD)
1134d79fb8ddSSong Gao 
113540c7674eSSong Gao #define VSRAN(NAME, BIT, E1, E2, E3)                                      \
113604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
1137d79fb8ddSSong Gao {                                                                         \
113840c7674eSSong Gao     int i, j, ofs;                                                        \
113904711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
114004711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
114104711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
114240c7674eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
1143d79fb8ddSSong Gao                                                                           \
114440c7674eSSong Gao     ofs = LSX_LEN / BIT;                                                  \
114540c7674eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
114640c7674eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
114740c7674eSSong Gao             Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i),        \
114840c7674eSSong Gao                                               Vk->E3(j + ofs * i) % BIT); \
1149d79fb8ddSSong Gao         }                                                                 \
115040c7674eSSong Gao         Vd->D(2 * i + 1) = 0;                                             \
115140c7674eSSong Gao     }                                                                     \
1152d79fb8ddSSong Gao }
1153d79fb8ddSSong Gao 
115440c7674eSSong Gao VSRAN(vsran_b_h, 16, B, H, UH)
115540c7674eSSong Gao VSRAN(vsran_h_w, 32, H, W, UW)
115640c7674eSSong Gao VSRAN(vsran_w_d, 64, W, D, UD)
1157d79fb8ddSSong Gao 
/*
 * VSRLNI: generate a narrowing right-shift-by-immediate helper.
 *
 * For each 16-byte chunk of the operand size in desc, the E2 elements of Vj
 * shifted right by imm fill the first LSX_LEN / BIT E1 slots of the result
 * chunk and the E2 elements of Vd shifted right by imm fill the following
 * LSX_LEN / BIT slots.  Vd is an input as well as the output, so the result
 * is assembled in a zero-initialized temporary and copied to Vd at the end.
 */
#define VSRLNI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)  \
{                                                                   \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
    VReg result = {};                                               \
    int per_chunk = LSX_LEN / BIT;                                  \
    int nchunks = simd_oprsz(desc) / 16;                            \
    int chunk, n;                                                   \
                                                                    \
    for (chunk = 0; chunk < nchunks; chunk++) {                     \
        for (n = 0; n < per_chunk; n++) {                           \
            int src = n + per_chunk * chunk;                        \
            int lo = n + per_chunk * 2 * chunk;                     \
            int hi = lo + per_chunk;                                \
                                                                    \
            result.E1(lo) = R_SHIFT(Vj->E2(src), imm);              \
            result.E1(hi) = R_SHIFT(Vd->E2(src), imm);              \
        }                                                           \
    }                                                               \
    *Vd = result;                                                   \
}
1177d79fb8ddSSong Gao 
1178329517d5SSong Gao void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1179d79fb8ddSSong Gao {
118040c7674eSSong Gao     int i;
118140c7674eSSong Gao     VReg temp = {};
1182329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1183329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1184d79fb8ddSSong Gao 
118540c7674eSSong Gao     for (i = 0; i < 2; i++) {
118640c7674eSSong Gao         temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128));
118740c7674eSSong Gao         temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128));
118840c7674eSSong Gao     }
1189d79fb8ddSSong Gao     *Vd = temp;
1190d79fb8ddSSong Gao }
1191d79fb8ddSSong Gao 
119240c7674eSSong Gao VSRLNI(vsrlni_b_h, 16, B, UH)
119340c7674eSSong Gao VSRLNI(vsrlni_h_w, 32, H, UW)
119440c7674eSSong Gao VSRLNI(vsrlni_w_d, 64, W, UD)
1195d79fb8ddSSong Gao 
/*
 * VSRANI: generate a narrowing right-shift-by-immediate helper.
 *
 * Same data movement as VSRLNI: per 16-byte chunk of the operand size in
 * desc, shifted E2 elements of Vj land in the first LSX_LEN / BIT E1 slots
 * of the result chunk and shifted E2 elements of Vd in the next
 * LSX_LEN / BIT slots.  The instantiations below pass the signed accessors
 * for E2, so the shift is presumably arithmetic.  The result is assembled in
 * a zero-initialized temporary because Vd is also read.
 */
#define VSRANI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)  \
{                                                                   \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
    VReg result = {};                                               \
    int per_chunk = LSX_LEN / BIT;                                  \
    int nchunks = simd_oprsz(desc) / 16;                            \
    int chunk, n;                                                   \
                                                                    \
    for (chunk = 0; chunk < nchunks; chunk++) {                     \
        int src = per_chunk * chunk;                                \
        int lo = per_chunk * 2 * chunk;                             \
        int hi = per_chunk * (2 * chunk + 1);                       \
                                                                    \
        for (n = 0; n < per_chunk; n++) {                           \
            result.E1(lo + n) = R_SHIFT(Vj->E2(src + n), imm);      \
            result.E1(hi + n) = R_SHIFT(Vd->E2(src + n), imm);      \
        }                                                           \
    }                                                               \
    *Vd = result;                                                   \
}
1215d79fb8ddSSong Gao 
1216329517d5SSong Gao void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1217d79fb8ddSSong Gao {
121840c7674eSSong Gao     int i;
121940c7674eSSong Gao     VReg temp = {};
1220329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1221329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1222d79fb8ddSSong Gao 
122340c7674eSSong Gao     for (i = 0; i < 2; i++) {
122440c7674eSSong Gao         temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128));
122540c7674eSSong Gao         temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128));
122640c7674eSSong Gao     }
1227d79fb8ddSSong Gao     *Vd = temp;
1228d79fb8ddSSong Gao }
1229d79fb8ddSSong Gao 
1230d79fb8ddSSong Gao VSRANI(vsrani_b_h, 16, B, H)
1231d79fb8ddSSong Gao VSRANI(vsrani_h_w, 32, H, W)
1232d79fb8ddSSong Gao VSRANI(vsrani_w_d, 64, W, D)
1233a5200a17SSong Gao 
1234c50ce38aSSong Gao #define VSRLRN(NAME, BIT, E1, E2, E3)                                      \
123504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)             \
1236a5200a17SSong Gao {                                                                          \
1237c50ce38aSSong Gao     int i, j, ofs;                                                         \
123804711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                 \
123904711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                 \
124004711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                 \
1241c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);                                          \
1242a5200a17SSong Gao                                                                            \
1243c50ce38aSSong Gao     ofs = LSX_LEN / BIT;                                                   \
1244c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                     \
1245c50ce38aSSong Gao         for (j = 0; j < ofs; j++) {                                        \
1246c50ce38aSSong Gao             Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i),  \
1247c50ce38aSSong Gao                                                Vk->E3(j + ofs * i) % BIT); \
1248a5200a17SSong Gao         }                                                                  \
1249c50ce38aSSong Gao         Vd->D(2 * i + 1) = 0;                                              \
1250c50ce38aSSong Gao     }                                                                      \
1251a5200a17SSong Gao }
1252a5200a17SSong Gao 
1253c50ce38aSSong Gao VSRLRN(vsrlrn_b_h, 16, B, H, UH)
1254c50ce38aSSong Gao VSRLRN(vsrlrn_h_w, 32, H, W, UW)
1255c50ce38aSSong Gao VSRLRN(vsrlrn_w_d, 64, W, D, UD)
1256a5200a17SSong Gao 
1257c50ce38aSSong Gao #define VSRARN(NAME, BIT, E1, E2, E3)                                       \
125804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
1259a5200a17SSong Gao {                                                                           \
1260c50ce38aSSong Gao     int i, j, ofs;                                                          \
126104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
126204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
126304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
1264c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);                                           \
1265a5200a17SSong Gao                                                                             \
1266c50ce38aSSong Gao     ofs = LSX_LEN / BIT;                                                    \
1267c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
1268c50ce38aSSong Gao         for (j = 0; j < ofs; j++) {                                         \
1269c50ce38aSSong Gao             Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i),  \
1270c50ce38aSSong Gao                                                 Vk->E3(j + ofs * i) % BIT); \
1271a5200a17SSong Gao         }                                                                   \
1272c50ce38aSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
1273c50ce38aSSong Gao     }                                                                       \
1274a5200a17SSong Gao }
1275a5200a17SSong Gao 
1276c50ce38aSSong Gao VSRARN(vsrarn_b_h, 16, B, H, UH)
1277c50ce38aSSong Gao VSRARN(vsrarn_h_w, 32, H, W, UW)
1278c50ce38aSSong Gao VSRARN(vsrarn_w_d, 64, W, D, UD)
1279a5200a17SSong Gao 
/*
 * VSRLRNI: generate a narrowing helper that applies do_vsrlr_<E2>() with an
 * immediate shift count.
 *
 * Per 16-byte chunk of the operand size in desc, the E2 elements of Vj fill
 * the first LSX_LEN / BIT E1 slots of the result chunk and the E2 elements
 * of Vd the following LSX_LEN / BIT slots.  Vd is read as well as written,
 * so the result is assembled in a zero-initialized temporary and copied to
 * Vd at the end.
 */
#define VSRLRNI(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)  \
{                                                                   \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
    VReg result = {};                                               \
    int per_chunk = LSX_LEN / BIT;                                  \
    int nchunks = simd_oprsz(desc) / 16;                            \
    int chunk, n;                                                   \
                                                                    \
    for (chunk = 0; chunk < nchunks; chunk++) {                     \
        for (n = 0; n < per_chunk; n++) {                           \
            int src = n + per_chunk * chunk;                        \
            int lo = n + per_chunk * 2 * chunk;                     \
            int hi = lo + per_chunk;                                \
                                                                    \
            result.E1(lo) = do_vsrlr_ ## E2(Vj->E2(src), imm);      \
            result.E1(hi) = do_vsrlr_ ## E2(Vd->E2(src), imm);      \
        }                                                           \
    }                                                               \
    *Vd = result;                                                   \
}
1299a5200a17SSong Gao 
1300329517d5SSong Gao void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1301a5200a17SSong Gao {
1302c50ce38aSSong Gao     int i;
1303c50ce38aSSong Gao     VReg temp = {};
1304329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1305329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1306c50ce38aSSong Gao     Int128 r[4];
1307c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);
1308a5200a17SSong Gao 
1309c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1310a5200a17SSong Gao         if (imm == 0) {
1311c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(Vj->Q(i));
1312c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1313a5200a17SSong Gao         } else {
1314c50ce38aSSong Gao             r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)),
1315c50ce38aSSong Gao                                   int128_one());
1316c50ce38aSSong Gao             r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)),
1317c50ce38aSSong Gao                                       int128_one());
1318c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i),
1319c50ce38aSSong Gao                                                     imm), r[2 * i]));
1320c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i),
1321c50ce38aSSong Gao                                                         imm), r[ 2 * i + 1]));
1322c50ce38aSSong Gao         }
1323a5200a17SSong Gao     }
1324a5200a17SSong Gao     *Vd = temp;
1325a5200a17SSong Gao }
1326a5200a17SSong Gao 
1327a5200a17SSong Gao VSRLRNI(vsrlrni_b_h, 16, B, H)
1328a5200a17SSong Gao VSRLRNI(vsrlrni_h_w, 32, H, W)
1329a5200a17SSong Gao VSRLRNI(vsrlrni_w_d, 64, W, D)
1330a5200a17SSong Gao 
/*
 * VSRARNI: generate a narrowing helper that applies do_vsrar_<E2>() with an
 * immediate shift count.
 *
 * Per 16-byte chunk of the operand size in desc, the E2 elements of Vj fill
 * the first LSX_LEN / BIT E1 slots of the result chunk and the E2 elements
 * of Vd the following LSX_LEN / BIT slots.  Vd is read as well as written,
 * so the result is assembled in a zero-initialized temporary and copied to
 * Vd at the end.
 */
#define VSRARNI(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)  \
{                                                                   \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
    VReg result = {};                                               \
    int per_chunk = LSX_LEN / BIT;                                  \
    int nchunks = simd_oprsz(desc) / 16;                            \
    int chunk, n;                                                   \
                                                                    \
    for (chunk = 0; chunk < nchunks; chunk++) {                     \
        int src = per_chunk * chunk;                                \
        int lo = per_chunk * 2 * chunk;                             \
        int hi = per_chunk * (2 * chunk + 1);                       \
                                                                    \
        for (n = 0; n < per_chunk; n++) {                           \
            result.E1(lo + n) = do_vsrar_ ## E2(Vj->E2(src + n), imm); \
            result.E1(hi + n) = do_vsrar_ ## E2(Vd->E2(src + n), imm); \
        }                                                           \
    }                                                               \
    *Vd = result;                                                   \
}
1350a5200a17SSong Gao 
1351329517d5SSong Gao void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1352a5200a17SSong Gao {
1353c50ce38aSSong Gao     int i;
1354c50ce38aSSong Gao     VReg temp = {};
1355329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1356329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1357c50ce38aSSong Gao     Int128 r[4];
1358c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);
1359a5200a17SSong Gao 
1360c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1361a5200a17SSong Gao         if (imm == 0) {
1362c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(Vj->Q(i));
1363c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1364a5200a17SSong Gao         } else {
1365c50ce38aSSong Gao             r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)),
1366c50ce38aSSong Gao                                   int128_one());
1367c50ce38aSSong Gao             r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)),
1368c50ce38aSSong Gao                                       int128_one());
1369c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i),
1370c50ce38aSSong Gao                                                     imm), r[2 * i]));
1371c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i),
1372c50ce38aSSong Gao                                                         imm), r[2 * i + 1]));
1373c50ce38aSSong Gao         }
1374a5200a17SSong Gao     }
1375a5200a17SSong Gao     *Vd = temp;
1376a5200a17SSong Gao }
1377a5200a17SSong Gao 
1378a5200a17SSong Gao VSRARNI(vsrarni_b_h, 16, B, H)
1379a5200a17SSong Gao VSRARNI(vsrarni_h_w, 32, H, W)
1380a5200a17SSong Gao VSRARNI(vsrarni_w_d, 64, W, D)
138183b3815dSSong Gao 
/*
 * SSRLNS: generate do_ssrlns_<NAME>(e2, sa, sh).
 *
 * e2 (signed type T2) is reinterpreted as the unsigned type T1 and shifted
 * right by sa bits.  The result is clamped from above to (1 << sh) - 1,
 * held in the narrow unsigned type T3, and returned as T1.
 */
#define SSRLNS(NAME, T1, T2, T3)                                    \
static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh)                 \
{                                                                   \
    const T3 limit = (1ull << sh) - 1;                              \
    /* A shift by 0 leaves the converted value unchanged. */        \
    const T1 shifted = (T1)e2 >> sa;                                \
                                                                    \
    return shifted > limit ? limit : shifted;                       \
}

SSRLNS(B, uint16_t, int16_t, uint8_t)
SSRLNS(H, uint32_t, int32_t, uint16_t)
SSRLNS(W, uint64_t, int64_t, uint32_t)
140383b3815dSSong Gao 
14046256c8caSSong Gao #define VSSRLN(NAME, BIT, E1, E2, E3)                                       \
140504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
140683b3815dSSong Gao {                                                                           \
14076256c8caSSong Gao     int i, j, ofs;                                                          \
140804711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
140904711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
141004711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
14116256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                           \
141283b3815dSSong Gao                                                                             \
14136256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                    \
14146256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
14156256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                         \
14166256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \
14176256c8caSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
14186256c8caSSong Gao                                                 BIT / 2 - 1);               \
141983b3815dSSong Gao         }                                                                   \
14206256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
14216256c8caSSong Gao     }                                                                       \
142283b3815dSSong Gao }
142383b3815dSSong Gao 
14246256c8caSSong Gao VSSRLN(vssrln_b_h, 16, B, H, UH)
14256256c8caSSong Gao VSSRLN(vssrln_h_w, 32, H, W, UW)
14266256c8caSSong Gao VSSRLN(vssrln_w_d, 64, W, D, UD)
142783b3815dSSong Gao 
/*
 * SSRANS: generate do_ssrans_<E>(e2, sa, sh).
 *
 * e2 (signed wide type T1) is shifted right by sa bits and the result is
 * saturated to the range [-(1 << sh), (1 << sh) - 1].  The upper bound is
 * first formed in the narrow signed type T2, as before.
 *
 * Both bounds are then kept in T1 for the comparisons.  Evaluating
 * -(mask + 1) on the narrow value is done in int, which overflows for the
 * W variant where mask is INT32_MAX.
 */
#define SSRANS(E, T1, T2)                                           \
static T1 do_ssrans_ ## E(T1 e2, int sa, int sh)                    \
{                                                                   \
    const T2 mask = (1ll << sh) - 1;                                \
    const T1 max = mask;                                            \
    const T1 min = ~max;                                            \
    T1 shft_res = e2;                                               \
                                                                    \
    if (sa != 0) {                                                  \
        shft_res = e2 >> sa;                                        \
    }                                                               \
    if (shft_res > max) {                                           \
        return max;                                                 \
    }                                                               \
    if (shft_res < min) {                                           \
        return min;                                                 \
    }                                                               \
    return shft_res;                                                \
}

SSRANS(B, int16_t, int8_t)
SSRANS(H, int32_t, int16_t)
SSRANS(W, int64_t, int32_t)
145183b3815dSSong Gao 
14526256c8caSSong Gao #define VSSRAN(NAME, BIT, E1, E2, E3)                                       \
145304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
145483b3815dSSong Gao {                                                                           \
14556256c8caSSong Gao     int i, j, ofs;                                                          \
145604711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
145704711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
145804711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
14596256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                           \
146083b3815dSSong Gao                                                                             \
14616256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                    \
14626256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
14636256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                         \
14646256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \
14656256c8caSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
14666256c8caSSong Gao                                                 BIT / 2 - 1);               \
146783b3815dSSong Gao         }                                                                   \
14686256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
14696256c8caSSong Gao     }                                                                       \
147083b3815dSSong Gao }
147183b3815dSSong Gao 
14726256c8caSSong Gao VSSRAN(vssran_b_h, 16, B, H, UH)
14736256c8caSSong Gao VSSRAN(vssran_h_w, 32, H, W, UW)
14746256c8caSSong Gao VSSRAN(vssran_w_d, 64, W, D, UD)
147583b3815dSSong Gao 
/*
 * SSRLNU: generate do_ssrlnu_<E>(e2, sa, sh).
 *
 * e2 (signed type T3) is reinterpreted as the unsigned type T1 and shifted
 * right by sa bits.  Values above (1 << sh) - 1, held in the narrow
 * unsigned type T2, are replaced by that bound.
 */
#define SSRLNU(E, T1, T2, T3)                                       \
static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh)                    \
{                                                                   \
    const T2 ceiling = (1ull << sh) - 1;                            \
    /* A shift by 0 leaves the converted value unchanged. */        \
    T1 shifted = (T1)e2 >> sa;                                      \
                                                                    \
    if (shifted > ceiling) {                                        \
        shifted = ceiling;                                          \
    }                                                               \
    return shifted;                                                 \
}

SSRLNU(B, uint16_t, uint8_t,  int16_t)
SSRLNU(H, uint32_t, uint16_t, int32_t)
SSRLNU(W, uint64_t, uint32_t, int64_t)
149783b3815dSSong Gao 
14986256c8caSSong Gao #define VSSRLNU(NAME, BIT, E1, E2, E3)                                      \
149904711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
150083b3815dSSong Gao {                                                                           \
15016256c8caSSong Gao     int i, j, ofs;                                                          \
150204711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
150304711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
150404711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
15056256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                           \
150683b3815dSSong Gao                                                                             \
15076256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                    \
15086256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
15096256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                         \
15106256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \
15116256c8caSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
15126256c8caSSong Gao                                                 BIT / 2);                   \
151383b3815dSSong Gao         }                                                                   \
15146256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
15156256c8caSSong Gao     }                                                                       \
151683b3815dSSong Gao }
151783b3815dSSong Gao 
15186256c8caSSong Gao VSSRLNU(vssrln_bu_h, 16, B, H, UH)
15196256c8caSSong Gao VSSRLNU(vssrln_hu_w, 32, H, W, UW)
15206256c8caSSong Gao VSSRLNU(vssrln_wu_d, 64, W, D, UD)
152183b3815dSSong Gao 
/*
 * SSRANU: generate do_ssranu_<E>(e2, sa, sh).
 *
 * A negative e2 (signed type T3) yields 0.  Otherwise e2 is shifted right
 * by sa bits, converted to the unsigned type T1 and clamped from above to
 * (1 << sh) - 1, which is held in the narrow unsigned type T2.
 */
#define SSRANU(E, T1, T2, T3)                                       \
static T1 do_ssranu_ ## E(T3 e2, int sa, int sh)                    \
{                                                                   \
    const T2 ceiling = (1ull << sh) - 1;                            \
    T1 shifted;                                                     \
                                                                    \
    if (e2 < 0) {                                                   \
        return 0;                                                   \
    }                                                               \
    /* A shift by 0 leaves the value unchanged. */                  \
    shifted = e2 >> sa;                                             \
    return shifted > ceiling ? ceiling : shifted;                   \
}

SSRANU(B, uint16_t, uint8_t,  int16_t)
SSRANU(H, uint32_t, uint16_t, int32_t)
SSRANU(W, uint64_t, uint32_t, int64_t)
154683b3815dSSong Gao 
15476256c8caSSong Gao #define VSSRANU(NAME, BIT, E1, E2, E3)                                         \
154804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
154983b3815dSSong Gao {                                                                              \
15506256c8caSSong Gao     int i, j, ofs;                                                             \
155104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
155204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
155304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
15546256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                              \
155583b3815dSSong Gao                                                                                \
15566256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                       \
15576256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
15586256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                            \
15596256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i),    \
15606256c8caSSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
15616256c8caSSong Gao                                                     BIT / 2);                  \
156283b3815dSSong Gao         }                                                                      \
15636256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
15646256c8caSSong Gao     }                                                                          \
156583b3815dSSong Gao }
156683b3815dSSong Gao 
15676256c8caSSong Gao VSSRANU(vssran_bu_h, 16, B, H, UH)
15686256c8caSSong Gao VSSRANU(vssran_hu_w, 32, H, W, UW)
15696256c8caSSong Gao VSSRANU(vssran_wu_d, 64, W, D, UD)
157083b3815dSSong Gao 
/*
 * VSSRLNI(NAME, BIT, E1, E2) - emit the immediate-shift helper NAME.
 *
 * The operation size taken from desc is walked in 16-byte lanes, each with
 * LSX_LEN / BIT source elements.  Every E2 element of Vj and of the old Vd
 * is passed through do_ssrlns_<E1>() with shift count imm and bound
 * BIT / 2 - 1.  Within a lane the Vj results occupy the first
 * LSX_LEN / BIT E1 slots and the Vd results the following LSX_LEN / BIT
 * slots.  Results are collected in a zero-initialised scratch register that
 * is copied over *Vd at the end, so Vd can be read while it is rebuilt.
 */
#define VSSRLNI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg result = {};                                                 \
    const int nlanes = (int)simd_oprsz(desc) / 16;                    \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < nlanes; lane++) {                           \
        const int src = per_lane * lane;                              \
        const int dst_j = per_lane * 2 * lane;                        \
        const int dst_d = per_lane * (2 * lane + 1);                  \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            result.E1(dst_j + n) =                                    \
                do_ssrlns_ ## E1(Vj->E2(src + n), imm, BIT / 2 - 1);  \
            result.E1(dst_d + n) =                                    \
                do_ssrlns_ ## E1(Vd->E2(src + n), imm, BIT / 2 - 1);  \
        }                                                             \
    }                                                                 \
    *Vd = result;                                                     \
}
159183b3815dSSong Gao 
15926256c8caSSong Gao static void do_vssrlni_q(VReg *Vd, VReg *Vj,
15936256c8caSSong Gao                          uint64_t imm, int idx, Int128 mask)
159483b3815dSSong Gao {
15956256c8caSSong Gao     Int128 shft_res1, shft_res2;
159683b3815dSSong Gao 
159783b3815dSSong Gao     if (imm == 0) {
15986256c8caSSong Gao         shft_res1 = Vj->Q(idx);
15996256c8caSSong Gao         shft_res2 = Vd->Q(idx);
160083b3815dSSong Gao     } else {
16016256c8caSSong Gao         shft_res1 = int128_urshift(Vj->Q(idx), imm);
16026256c8caSSong Gao         shft_res2 = int128_urshift(Vd->Q(idx), imm);
160383b3815dSSong Gao     }
160483b3815dSSong Gao 
160583b3815dSSong Gao     if (int128_ult(mask, shft_res1)) {
16066256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
160783b3815dSSong Gao     }else {
16086256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
160983b3815dSSong Gao     }
161083b3815dSSong Gao 
161183b3815dSSong Gao     if (int128_ult(mask, shft_res2)) {
16126256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
161383b3815dSSong Gao     }else {
16146256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
16156256c8caSSong Gao     }
16166256c8caSSong Gao }
16176256c8caSSong Gao 
16186256c8caSSong Gao void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
16196256c8caSSong Gao {
16206256c8caSSong Gao     int i;
16216256c8caSSong Gao     Int128 mask;
16226256c8caSSong Gao     VReg *Vd = (VReg *)vd;
16236256c8caSSong Gao     VReg *Vj = (VReg *)vj;
16246256c8caSSong Gao     int oprsz = simd_oprsz(desc);
16256256c8caSSong Gao 
16266256c8caSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
16276256c8caSSong Gao 
16286256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
16296256c8caSSong Gao         do_vssrlni_q(Vd, Vj, imm, i, mask);
163083b3815dSSong Gao     }
163183b3815dSSong Gao }
163283b3815dSSong Gao 
163383b3815dSSong Gao VSSRLNI(vssrlni_b_h, 16, B, H)
163483b3815dSSong Gao VSSRLNI(vssrlni_h_w, 32, H, W)
163583b3815dSSong Gao VSSRLNI(vssrlni_w_d, 64, W, D)
163683b3815dSSong Gao 
/*
 * VSSRANI(NAME, BIT, E1, E2) - emit the immediate-shift helper NAME.
 *
 * Same data movement as the other immediate narrowing templates in this
 * file: for each 16-byte lane of the operation size, the LSX_LEN / BIT
 * E2 elements of Vj and of the old Vd are passed through do_ssrans_<E1>()
 * with shift count imm and bound BIT / 2 - 1.  The Vj results fill the
 * first LSX_LEN / BIT E1 slots of the lane, the Vd results the next
 * LSX_LEN / BIT.  A zero-initialised scratch register receives the results
 * and is copied over *Vd at the end.
 */
#define VSSRANI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg result = {};                                                 \
    const int nlanes = (int)simd_oprsz(desc) / 16;                    \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < nlanes; lane++) {                           \
        const int src = per_lane * lane;                              \
        const int dst_j = per_lane * 2 * lane;                        \
        const int dst_d = per_lane * (2 * lane + 1);                  \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            result.E1(dst_j + n) =                                    \
                do_ssrans_ ## E1(Vj->E2(src + n), imm, BIT / 2 - 1);  \
            result.E1(dst_d + n) =                                    \
                do_ssrans_ ## E1(Vd->E2(src + n), imm, BIT / 2 - 1);  \
        }                                                             \
    }                                                                 \
    *Vd = result;                                                     \
}
165783b3815dSSong Gao 
16586256c8caSSong Gao static void do_vssrani_d_q(VReg *Vd, VReg *Vj,
16596256c8caSSong Gao                            uint64_t imm, int idx, Int128 mask, Int128 min)
166083b3815dSSong Gao {
16616256c8caSSong Gao     Int128 shft_res1, shft_res2;
166283b3815dSSong Gao 
166383b3815dSSong Gao     if (imm == 0) {
16646256c8caSSong Gao         shft_res1 = Vj->Q(idx);
16656256c8caSSong Gao         shft_res2 = Vd->Q(idx);
166683b3815dSSong Gao     } else {
16676256c8caSSong Gao         shft_res1 = int128_rshift(Vj->Q(idx), imm);
16686256c8caSSong Gao         shft_res2 = int128_rshift(Vd->Q(idx), imm);
166983b3815dSSong Gao     }
167083b3815dSSong Gao 
167183b3815dSSong Gao     if (int128_gt(shft_res1, mask)) {
16726256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
167383b3815dSSong Gao     } else if (int128_lt(shft_res1, int128_neg(min))) {
16746256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(min);
167583b3815dSSong Gao     } else {
16766256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
167783b3815dSSong Gao     }
167883b3815dSSong Gao 
167983b3815dSSong Gao     if (int128_gt(shft_res2, mask)) {
16806256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
168183b3815dSSong Gao     } else if (int128_lt(shft_res2, int128_neg(min))) {
16826256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(min);
168383b3815dSSong Gao     } else {
16846256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
168583b3815dSSong Gao     }
168683b3815dSSong Gao }
168783b3815dSSong Gao 
16886256c8caSSong Gao void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
16896256c8caSSong Gao {
16906256c8caSSong Gao     int i;
16916256c8caSSong Gao     Int128 mask, min;
16926256c8caSSong Gao     VReg *Vd = (VReg *)vd;
16936256c8caSSong Gao     VReg *Vj = (VReg *)vj;
16946256c8caSSong Gao     int oprsz = simd_oprsz(desc);
16956256c8caSSong Gao 
16966256c8caSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
16976256c8caSSong Gao     min  = int128_lshift(int128_one(), 63);
16986256c8caSSong Gao 
16996256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
17006256c8caSSong Gao         do_vssrani_d_q(Vd, Vj, imm, i, mask, min);
17016256c8caSSong Gao     }
17026256c8caSSong Gao }
17036256c8caSSong Gao 
17046256c8caSSong Gao 
170583b3815dSSong Gao VSSRANI(vssrani_b_h, 16, B, H)
170683b3815dSSong Gao VSSRANI(vssrani_h_w, 32, H, W)
170783b3815dSSong Gao VSSRANI(vssrani_w_d, 64, W, D)
170883b3815dSSong Gao 
/*
 * VSSRLNUI(NAME, BIT, E1, E2) - emit the immediate-shift helper NAME.
 *
 * For each 16-byte lane of the operation size, the LSX_LEN / BIT E2
 * elements of Vj and of the old Vd are passed through do_ssrlnu_<E1>()
 * with shift count imm and bound BIT / 2.  The Vj results fill the first
 * LSX_LEN / BIT E1 slots of the lane, the Vd results the next
 * LSX_LEN / BIT.  A zero-initialised scratch register receives the results
 * and is copied over *Vd at the end.
 */
#define VSSRLNUI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg result = {};                                                 \
    const int nlanes = (int)simd_oprsz(desc) / 16;                    \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < nlanes; lane++) {                           \
        const int src = per_lane * lane;                              \
        const int dst_j = per_lane * 2 * lane;                        \
        const int dst_d = per_lane * (2 * lane + 1);                  \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            result.E1(dst_j + n) =                                    \
                do_ssrlnu_ ## E1(Vj->E2(src + n), imm, BIT / 2);      \
            result.E1(dst_d + n) =                                    \
                do_ssrlnu_ ## E1(Vd->E2(src + n), imm, BIT / 2);      \
        }                                                             \
    }                                                                 \
    *Vd = result;                                                     \
}
172983b3815dSSong Gao 
1730329517d5SSong Gao void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
173183b3815dSSong Gao {
17326256c8caSSong Gao     int i;
17336256c8caSSong Gao     Int128 mask;
1734329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1735329517d5SSong Gao     VReg *Vj = (VReg *)vj;
17366256c8caSSong Gao     int oprsz = simd_oprsz(desc);
173783b3815dSSong Gao 
173883b3815dSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
173983b3815dSSong Gao 
17406256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
17416256c8caSSong Gao         do_vssrlni_q(Vd, Vj, imm, i, mask);
174283b3815dSSong Gao     }
174383b3815dSSong Gao }
174483b3815dSSong Gao 
174583b3815dSSong Gao VSSRLNUI(vssrlni_bu_h, 16, B, H)
174683b3815dSSong Gao VSSRLNUI(vssrlni_hu_w, 32, H, W)
174783b3815dSSong Gao VSSRLNUI(vssrlni_wu_d, 64, W, D)
174883b3815dSSong Gao 
/*
 * VSSRANUI(NAME, BIT, E1, E2) - emit the immediate-shift helper NAME.
 *
 * For each 16-byte lane of the operation size, the LSX_LEN / BIT E2
 * elements of Vj and of the old Vd are passed through do_ssranu_<E1>()
 * with shift count imm and bound BIT / 2.  The Vj results fill the first
 * LSX_LEN / BIT E1 slots of the lane, the Vd results the next
 * LSX_LEN / BIT.  A zero-initialised scratch register receives the results
 * and is copied over *Vd at the end.
 */
#define VSSRANUI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg result = {};                                                 \
    const int nlanes = (int)simd_oprsz(desc) / 16;                    \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < nlanes; lane++) {                           \
        const int src = per_lane * lane;                              \
        const int dst_j = per_lane * 2 * lane;                        \
        const int dst_d = per_lane * (2 * lane + 1);                  \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            result.E1(dst_j + n) =                                    \
                do_ssranu_ ## E1(Vj->E2(src + n), imm, BIT / 2);      \
            result.E1(dst_d + n) =                                    \
                do_ssranu_ ## E1(Vd->E2(src + n), imm, BIT / 2);      \
        }                                                             \
    }                                                                 \
    *Vd = result;                                                     \
}
176983b3815dSSong Gao 
17706256c8caSSong Gao static void do_vssrani_du_q(VReg *Vd, VReg *Vj,
17716256c8caSSong Gao                             uint64_t imm, int idx, Int128 mask)
177283b3815dSSong Gao {
17736256c8caSSong Gao     Int128 shft_res1, shft_res2;
177483b3815dSSong Gao 
177583b3815dSSong Gao     if (imm == 0) {
17766256c8caSSong Gao         shft_res1 = Vj->Q(idx);
17776256c8caSSong Gao         shft_res2 = Vd->Q(idx);
177883b3815dSSong Gao     } else {
17796256c8caSSong Gao         shft_res1 = int128_rshift(Vj->Q(idx), imm);
17806256c8caSSong Gao         shft_res2 = int128_rshift(Vd->Q(idx), imm);
178183b3815dSSong Gao     }
178283b3815dSSong Gao 
17836256c8caSSong Gao     if (int128_lt(Vj->Q(idx), int128_zero())) {
178483b3815dSSong Gao         shft_res1 = int128_zero();
178583b3815dSSong Gao     }
178683b3815dSSong Gao 
17876256c8caSSong Gao     if (int128_lt(Vd->Q(idx), int128_zero())) {
178883b3815dSSong Gao         shft_res2 = int128_zero();
178983b3815dSSong Gao     }
179083b3815dSSong Gao     if (int128_ult(mask, shft_res1)) {
17916256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
179283b3815dSSong Gao     }else {
17936256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
179483b3815dSSong Gao     }
179583b3815dSSong Gao 
179683b3815dSSong Gao     if (int128_ult(mask, shft_res2)) {
17976256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
179883b3815dSSong Gao     }else {
17996256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
18006256c8caSSong Gao     }
18016256c8caSSong Gao 
18026256c8caSSong Gao }
18036256c8caSSong Gao 
18046256c8caSSong Gao void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
18056256c8caSSong Gao {
18066256c8caSSong Gao     int i;
18076256c8caSSong Gao     Int128 mask;
18086256c8caSSong Gao     VReg *Vd = (VReg *)vd;
18096256c8caSSong Gao     VReg *Vj = (VReg *)vj;
18106256c8caSSong Gao     int oprsz = simd_oprsz(desc);
18116256c8caSSong Gao 
18126256c8caSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
18136256c8caSSong Gao 
18146256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
18156256c8caSSong Gao         do_vssrani_du_q(Vd, Vj, imm, i, mask);
181683b3815dSSong Gao     }
181783b3815dSSong Gao }
181883b3815dSSong Gao 
181983b3815dSSong Gao VSSRANUI(vssrani_bu_h, 16, B, H)
182083b3815dSSong Gao VSSRANUI(vssrani_hu_w, 32, H, W)
182183b3815dSSong Gao VSSRANUI(vssrani_wu_d, 64, W, D)
1822162cd32cSSong Gao 
/*
 * SSRLRNS(E1, E2, T1, T2, T3) - emit do_ssrlrns_<E1>().
 *
 * The generated function passes e2 (type T2) and the shift count sa to
 * do_vsrlr_<E2>() and clamps the result, held in T1, to at most 2^sh - 1.
 * T3 is accepted but not used by the expansion.
 */
#define SSRLRNS(E1, E2, T1, T2, T3)                     \
static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh)      \
{                                                       \
    const T1 limit = (1ull << sh) - 1;                  \
    const T1 shifted = do_vsrlr_ ## E2(e2, sa);         \
                                                        \
    return (shifted > limit) ? limit : shifted;         \
}

SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1841162cd32cSSong Gao 
184277fca794SSong Gao #define VSSRLRN(NAME, BIT, E1, E2, E3)                                         \
184304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1844162cd32cSSong Gao {                                                                              \
184577fca794SSong Gao     int i, j, ofs;                                                             \
184604711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
184704711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
184804711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
184977fca794SSong Gao     int oprsz = simd_oprsz(desc);                                              \
1850162cd32cSSong Gao                                                                                \
185177fca794SSong Gao     ofs = LSX_LEN / BIT;                                                       \
185277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
185377fca794SSong Gao         for (j = 0; j < ofs; j++) {                                            \
185477fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i),   \
185577fca794SSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
185677fca794SSong Gao                                                     BIT / 2 - 1);              \
1857162cd32cSSong Gao         }                                                                      \
185877fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
185977fca794SSong Gao     }                                                                          \
1860162cd32cSSong Gao }
1861162cd32cSSong Gao 
186277fca794SSong Gao VSSRLRN(vssrlrn_b_h, 16, B, H, UH)
186377fca794SSong Gao VSSRLRN(vssrlrn_h_w, 32, H, W, UW)
186477fca794SSong Gao VSSRLRN(vssrlrn_w_d, 64, W, D, UD)
1865162cd32cSSong Gao 
/*
 * SSRARNS(E1, E2, T1, T2) - emit do_ssrarns_<E1>().
 *
 * The generated function passes e2 (type T1) and the shift count sa to
 * do_vsrar_<E2>() and clamps the result to the signed range
 * [-2^sh, 2^sh - 1].  T2 is the type that holds the upper bound.
 *
 * The lower bound is written as ~upper rather than -(upper + 1).  Both are
 * the same value, but the latter is evaluated in int and overflows when
 * upper is INT32_MAX (the W instantiation with sh == 31), which is
 * undefined unless the build forces wrapping arithmetic.
 */
#define SSRARNS(E1, E2, T1, T2)                         \
static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh)      \
{                                                       \
    const T2 upper = (1ll << sh) - 1;                   \
    const T1 shifted = do_vsrar_ ## E2(e2, sa);         \
                                                        \
    if (shifted > upper) {                              \
        return upper;                                   \
    }                                                   \
    /* ~upper == -(upper + 1), with no overflow. */     \
    if (shifted < ~upper) {                             \
        return ~upper;                                  \
    }                                                   \
    return shifted;                                     \
}

SSRARNS(B, H, int16_t, int8_t)
SSRARNS(H, W, int32_t, int16_t)
SSRARNS(W, D, int64_t, int32_t)
1886162cd32cSSong Gao 
188777fca794SSong Gao #define VSSRARN(NAME, BIT, E1, E2, E3)                                         \
188804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1889162cd32cSSong Gao {                                                                              \
189077fca794SSong Gao     int i, j, ofs;                                                             \
189104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
189204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
189304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
189477fca794SSong Gao     int oprsz = simd_oprsz(desc);                                              \
1895162cd32cSSong Gao                                                                                \
189677fca794SSong Gao     ofs = LSX_LEN / BIT;                                                       \
189777fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
189877fca794SSong Gao         for (j = 0; j < ofs; j++) {                                            \
189977fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i),   \
190077fca794SSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
190177fca794SSong Gao                                                     BIT/ 2 - 1);               \
1902162cd32cSSong Gao         }                                                                      \
190377fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
190477fca794SSong Gao     }                                                                          \
1905162cd32cSSong Gao }
1906162cd32cSSong Gao 
190777fca794SSong Gao VSSRARN(vssrarn_b_h, 16, B, H, UH)
190877fca794SSong Gao VSSRARN(vssrarn_h_w, 32, H, W, UW)
190977fca794SSong Gao VSSRARN(vssrarn_w_d, 64, W, D, UD)
1910162cd32cSSong Gao 
/*
 * SSRLRNU(E1, E2, T1, T2, T3) - emit do_ssrlrnu_<E1>().
 *
 * The generated function passes e2 (type T3) and the shift count sa to
 * do_vsrlr_<E2>() and clamps the result, held in T1, to at most 2^sh - 1.
 * T2 is the type that holds the upper bound.
 */
#define SSRLRNU(E1, E2, T1, T2, T3)                     \
static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh)      \
{                                                       \
    const T2 limit = (1ull << sh) - 1;                  \
    const T1 shifted = do_vsrlr_ ## E2(e2, sa);         \
                                                        \
    return (shifted > limit) ? limit : shifted;         \
}

SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1930162cd32cSSong Gao 
193177fca794SSong Gao #define VSSRLRNU(NAME, BIT, E1, E2, E3)                                        \
193204711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1933162cd32cSSong Gao {                                                                              \
193477fca794SSong Gao     int i, j, ofs;                                                             \
193504711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
193604711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
193704711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
193877fca794SSong Gao     int oprsz = simd_oprsz(desc);                                              \
1939162cd32cSSong Gao                                                                                \
194077fca794SSong Gao     ofs = LSX_LEN / BIT;                                                       \
194177fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
194277fca794SSong Gao         for (j = 0; j < ofs; j++) {                                            \
194377fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i),   \
194477fca794SSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
194577fca794SSong Gao                                                     BIT / 2);                  \
1946162cd32cSSong Gao         }                                                                      \
194777fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
194877fca794SSong Gao     }                                                                          \
1949162cd32cSSong Gao }
1950162cd32cSSong Gao 
195177fca794SSong Gao VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH)
195277fca794SSong Gao VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW)
195377fca794SSong Gao VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD)
1954162cd32cSSong Gao 
/*
 * SSRARNU(E1, E2, T1, T2, T3) - emit do_ssrarnu_<E1>().
 *
 * The generated function returns 0 for a negative e2 (type T3).  Otherwise
 * it passes e2 and the shift count sa to do_vsrar_<E2>() and clamps the
 * result, held in T1, to at most 2^sh - 1.  T2 is the type that holds the
 * upper bound.
 */
#define SSRARNU(E1, E2, T1, T2, T3)                             \
static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh)              \
{                                                               \
    const T2 limit = (1ull << sh) - 1;                          \
    const T1 shifted = (e2 < 0) ? 0 : do_vsrar_ ## E2(e2, sa);  \
                                                                \
    return (shifted > limit) ? limit : shifted;                 \
}

SSRARNU(B, H, uint16_t, uint8_t, int16_t)
SSRARNU(H, W, uint32_t, uint16_t, int32_t)
SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1977162cd32cSSong Gao 
197877fca794SSong Gao #define VSSRARNU(NAME, BIT, E1, E2, E3)                                      \
197904711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void  *vk, uint32_t desc)              \
1980162cd32cSSong Gao {                                                                            \
198177fca794SSong Gao     int i, j, ofs;                                                           \
198204711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                   \
198304711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                   \
198404711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                   \
198577fca794SSong Gao     int oprsz = simd_oprsz(desc);                                            \
1986162cd32cSSong Gao                                                                              \
198777fca794SSong Gao     ofs = LSX_LEN / BIT;                                                     \
198877fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                       \
198977fca794SSong Gao         for (j = 0; j < ofs; j++) {                                          \
199077fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \
199177fca794SSong Gao                                                 Vk->E3(j + ofs * i) % BIT,   \
199277fca794SSong Gao                                                 BIT / 2);                    \
1993162cd32cSSong Gao         }                                                                    \
199477fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                \
199577fca794SSong Gao     }                                                                        \
1996162cd32cSSong Gao }
1997162cd32cSSong Gao 
199877fca794SSong Gao VSSRARNU(vssrarn_bu_h, 16, B, H, UH)
199977fca794SSong Gao VSSRARNU(vssrarn_hu_w, 32, H, W, UW)
200077fca794SSong Gao VSSRARNU(vssrarn_wu_d, 64, W, D, UD)
2001162cd32cSSong Gao 
/*
 * VSSRLRNI - define an immediate-shift narrowing helper built on
 * do_ssrlrns_<E1>(elem, imm, BIT / 2 - 1).
 *
 * For every 128-bit lane the E2 elements of Vj produce the low half of the
 * destination lane and the E2 elements of Vd (its previous contents)
 * produce the high half.  The result is assembled in a zero-initialised
 * temporary and copied over *Vd only at the end, so Vd stays readable as a
 * source for the whole computation.
 */
#define VSSRLRNI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                      \
    VReg *Vd = vd;                                                     \
    VReg *Vj = vj;                                                     \
    VReg result = {};                                                  \
    const int per_lane = LSX_LEN / BIT;                                \
    const int lanes = simd_oprsz(desc) / 16;                           \
    int lane, n;                                                       \
                                                                       \
    for (lane = 0; lane < lanes; lane++) {                             \
        const int src = per_lane * lane;                               \
        const int dst_lo = per_lane * 2 * lane;                        \
        const int dst_hi = dst_lo + per_lane;                          \
                                                                       \
        for (n = 0; n < per_lane; n++) {                               \
            result.E1(dst_lo + n) =                                    \
                do_ssrlrns_ ## E1(Vj->E2(src + n), imm, BIT / 2 - 1);  \
            result.E1(dst_hi + n) =                                    \
                do_ssrlrns_ ## E1(Vd->E2(src + n), imm, BIT / 2 - 1);  \
        }                                                              \
    }                                                                  \
    *Vd = result;                                                      \
}
2022162cd32cSSong Gao 
202377fca794SSong Gao static void do_vssrlrni_q(VReg *Vd, VReg * Vj,
202477fca794SSong Gao                           uint64_t imm, int idx, Int128 mask)
202577fca794SSong Gao {
202677fca794SSong Gao     Int128 shft_res1, shft_res2, r1, r2;
202777fca794SSong Gao     if (imm == 0) {
202877fca794SSong Gao         shft_res1 = Vj->Q(idx);
202977fca794SSong Gao         shft_res2 = Vd->Q(idx);
203077fca794SSong Gao     } else {
203177fca794SSong Gao         r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one());
203277fca794SSong Gao         r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one());
203377fca794SSong Gao         shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1));
203477fca794SSong Gao         shft_res2 = (int128_add(int128_urshift(Vd->Q(idx), imm), r2));
203577fca794SSong Gao     }
203677fca794SSong Gao 
203777fca794SSong Gao     if (int128_ult(mask, shft_res1)) {
203877fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
203977fca794SSong Gao     }else {
204077fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
204177fca794SSong Gao     }
204277fca794SSong Gao 
204377fca794SSong Gao     if (int128_ult(mask, shft_res2)) {
204477fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
204577fca794SSong Gao     }else {
204677fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
204777fca794SSong Gao     }
204877fca794SSong Gao }
204977fca794SSong Gao 
205077fca794SSong Gao void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
205177fca794SSong Gao {
205277fca794SSong Gao     int i;
205377fca794SSong Gao     Int128 mask;
205477fca794SSong Gao     VReg *Vd = (VReg *)vd;
205577fca794SSong Gao     VReg *Vj = (VReg *)vj;
205677fca794SSong Gao     int oprsz = simd_oprsz(desc);
205777fca794SSong Gao 
205877fca794SSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
205977fca794SSong Gao 
206077fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
206177fca794SSong Gao         do_vssrlrni_q(Vd, Vj, imm, i, mask);
206277fca794SSong Gao     }
2063162cd32cSSong Gao }
2064162cd32cSSong Gao 
2065162cd32cSSong Gao VSSRLRNI(vssrlrni_b_h, 16, B, H)
2066162cd32cSSong Gao VSSRLRNI(vssrlrni_h_w, 32, H, W)
2067162cd32cSSong Gao VSSRLRNI(vssrlrni_w_d, 64, W, D)
2068162cd32cSSong Gao 
/*
 * VSSRARNI - define an immediate-shift narrowing helper built on
 * do_ssrarns_<E1>(elem, imm, BIT / 2 - 1).
 *
 * For every 128-bit lane the E2 elements of Vj produce the low half of the
 * destination lane and the E2 elements of Vd (its previous contents)
 * produce the high half.  The result is assembled in a zero-initialised
 * temporary and copied over *Vd only at the end, so Vd stays readable as a
 * source for the whole computation.
 */
#define VSSRARNI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                      \
    VReg *Vd = vd;                                                     \
    VReg *Vj = vj;                                                     \
    VReg result = {};                                                  \
    const int per_lane = LSX_LEN / BIT;                                \
    const int lanes = simd_oprsz(desc) / 16;                           \
    int lane, n;                                                       \
                                                                       \
    for (lane = 0; lane < lanes; lane++) {                             \
        const int src = per_lane * lane;                               \
        const int dst_lo = per_lane * 2 * lane;                        \
        const int dst_hi = dst_lo + per_lane;                          \
                                                                       \
        for (n = 0; n < per_lane; n++) {                               \
            result.E1(dst_lo + n) =                                    \
                do_ssrarns_ ## E1(Vj->E2(src + n), imm, BIT / 2 - 1);  \
            result.E1(dst_hi + n) =                                    \
                do_ssrarns_ ## E1(Vd->E2(src + n), imm, BIT / 2 - 1);  \
        }                                                              \
    }                                                                  \
    *Vd = result;                                                      \
}
2089162cd32cSSong Gao 
209077fca794SSong Gao static void do_vssrarni_d_q(VReg *Vd, VReg *Vj,
209177fca794SSong Gao                            uint64_t imm, int idx, Int128 mask1, Int128 mask2)
2092162cd32cSSong Gao {
209377fca794SSong Gao     Int128 shft_res1, shft_res2, r1, r2;
2094162cd32cSSong Gao 
2095162cd32cSSong Gao     if (imm == 0) {
209677fca794SSong Gao         shft_res1 = Vj->Q(idx);
209777fca794SSong Gao         shft_res2 = Vd->Q(idx);
2098162cd32cSSong Gao     } else {
209977fca794SSong Gao         r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one());
210077fca794SSong Gao         r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one());
210177fca794SSong Gao         shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1);
210277fca794SSong Gao         shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2);
2103162cd32cSSong Gao     }
210477fca794SSong Gao     if (int128_gt(shft_res1, mask1)) {
210577fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(mask1);
210677fca794SSong Gao     } else if (int128_lt(shft_res1, int128_neg(mask2))) {
210777fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(mask2);
210877fca794SSong Gao     } else {
210977fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
211077fca794SSong Gao     }
211177fca794SSong Gao 
211277fca794SSong Gao     if (int128_gt(shft_res2, mask1)) {
211377fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask1);
211477fca794SSong Gao     } else if (int128_lt(shft_res2, int128_neg(mask2))) {
211577fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask2);
211677fca794SSong Gao     } else {
211777fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
211877fca794SSong Gao     }
211977fca794SSong Gao }
212077fca794SSong Gao 
212177fca794SSong Gao void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
212277fca794SSong Gao {
212377fca794SSong Gao     int i;
212477fca794SSong Gao     Int128 mask1, mask2;
212577fca794SSong Gao     VReg *Vd = (VReg *)vd;
212677fca794SSong Gao     VReg *Vj = (VReg *)vj;
212777fca794SSong Gao     int oprsz = simd_oprsz(desc);
2128162cd32cSSong Gao 
2129162cd32cSSong Gao     mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one());
2130162cd32cSSong Gao     mask2  = int128_lshift(int128_one(), 63);
2131162cd32cSSong Gao 
213277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
213377fca794SSong Gao         do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2);
2134162cd32cSSong Gao     }
2135162cd32cSSong Gao }
2136162cd32cSSong Gao 
2137162cd32cSSong Gao VSSRARNI(vssrarni_b_h, 16, B, H)
2138162cd32cSSong Gao VSSRARNI(vssrarni_h_w, 32, H, W)
2139162cd32cSSong Gao VSSRARNI(vssrarni_w_d, 64, W, D)
2140162cd32cSSong Gao 
/*
 * VSSRLRNUI - define an immediate-shift narrowing helper built on
 * do_ssrlrnu_<E1>(elem, imm, BIT / 2).
 *
 * For every 128-bit lane the E2 elements of Vj produce the low half of the
 * destination lane and the E2 elements of Vd (its previous contents)
 * produce the high half.  The result is assembled in a zero-initialised
 * temporary and copied over *Vd only at the end, so Vd stays readable as a
 * source for the whole computation.
 */
#define VSSRLRNUI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                      \
    VReg *Vd = vd;                                                     \
    VReg *Vj = vj;                                                     \
    VReg result = {};                                                  \
    const int per_lane = LSX_LEN / BIT;                                \
    const int lanes = simd_oprsz(desc) / 16;                           \
    int lane, n;                                                       \
                                                                       \
    for (lane = 0; lane < lanes; lane++) {                             \
        const int src = per_lane * lane;                               \
        const int dst_lo = per_lane * 2 * lane;                        \
        const int dst_hi = dst_lo + per_lane;                          \
                                                                       \
        for (n = 0; n < per_lane; n++) {                               \
            result.E1(dst_lo + n) =                                    \
                do_ssrlrnu_ ## E1(Vj->E2(src + n), imm, BIT / 2);      \
            result.E1(dst_hi + n) =                                    \
                do_ssrlrnu_ ## E1(Vd->E2(src + n), imm, BIT / 2);      \
        }                                                              \
    }                                                                  \
    *Vd = result;                                                      \
}
2161162cd32cSSong Gao 
216277fca794SSong Gao void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
216377fca794SSong Gao {
216477fca794SSong Gao     int i;
216577fca794SSong Gao     Int128 mask;
216677fca794SSong Gao     VReg *Vd = (VReg *)vd;
216777fca794SSong Gao     VReg *Vj = (VReg *)vj;
216877fca794SSong Gao     int oprsz = simd_oprsz(desc);
216977fca794SSong Gao 
217077fca794SSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
217177fca794SSong Gao 
217277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
217377fca794SSong Gao         do_vssrlrni_q(Vd, Vj, imm, i, mask);
217477fca794SSong Gao     }
217577fca794SSong Gao }
217677fca794SSong Gao 
2177162cd32cSSong Gao VSSRLRNUI(vssrlrni_bu_h, 16, B, H)
2178162cd32cSSong Gao VSSRLRNUI(vssrlrni_hu_w, 32, H, W)
2179162cd32cSSong Gao VSSRLRNUI(vssrlrni_wu_d, 64, W, D)
2180162cd32cSSong Gao 
/*
 * VSSRARNUI - define an immediate-shift narrowing helper built on
 * do_ssrarnu_<E1>(elem, imm, BIT / 2).
 *
 * For every 128-bit lane the E2 elements of Vj produce the low half of the
 * destination lane and the E2 elements of Vd (its previous contents)
 * produce the high half.  The result is assembled in a zero-initialised
 * temporary and copied over *Vd only at the end, so Vd stays readable as a
 * source for the whole computation.
 */
#define VSSRARNUI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                      \
    VReg *Vd = vd;                                                     \
    VReg *Vj = vj;                                                     \
    VReg result = {};                                                  \
    const int per_lane = LSX_LEN / BIT;                                \
    const int lanes = simd_oprsz(desc) / 16;                           \
    int lane, n;                                                       \
                                                                       \
    for (lane = 0; lane < lanes; lane++) {                             \
        const int src = per_lane * lane;                               \
        const int dst_lo = per_lane * 2 * lane;                        \
        const int dst_hi = dst_lo + per_lane;                          \
                                                                       \
        for (n = 0; n < per_lane; n++) {                               \
            result.E1(dst_lo + n) =                                    \
                do_ssrarnu_ ## E1(Vj->E2(src + n), imm, BIT / 2);      \
            result.E1(dst_hi + n) =                                    \
                do_ssrarnu_ ## E1(Vd->E2(src + n), imm, BIT / 2);      \
        }                                                              \
    }                                                                  \
    *Vd = result;                                                      \
}
2201162cd32cSSong Gao 
220277fca794SSong Gao static void do_vssrarni_du_q(VReg *Vd, VReg *Vj,
220377fca794SSong Gao                              uint64_t imm, int idx, Int128 mask1, Int128 mask2)
2204162cd32cSSong Gao {
220577fca794SSong Gao     Int128 shft_res1, shft_res2, r1, r2;
2206162cd32cSSong Gao 
2207162cd32cSSong Gao     if (imm == 0) {
220877fca794SSong Gao         shft_res1 = Vj->Q(idx);
220977fca794SSong Gao         shft_res2 = Vd->Q(idx);
2210162cd32cSSong Gao     } else {
221177fca794SSong Gao         r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one());
221277fca794SSong Gao         r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one());
221377fca794SSong Gao         shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1);
221477fca794SSong Gao         shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2);
2215162cd32cSSong Gao     }
2216162cd32cSSong Gao 
221777fca794SSong Gao     if (int128_lt(Vj->Q(idx), int128_zero())) {
2218162cd32cSSong Gao         shft_res1 = int128_zero();
2219162cd32cSSong Gao     }
222077fca794SSong Gao     if (int128_lt(Vd->Q(idx), int128_zero())) {
2221162cd32cSSong Gao         shft_res2 = int128_zero();
2222162cd32cSSong Gao     }
2223162cd32cSSong Gao 
222477fca794SSong Gao     if (int128_gt(shft_res1,  mask1)) {
222577fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(mask1);
222677fca794SSong Gao     } else if (int128_lt(shft_res1, int128_neg(mask2))) {
222777fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(mask2);
222877fca794SSong Gao     } else {
222977fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
223077fca794SSong Gao     }
223177fca794SSong Gao 
223277fca794SSong Gao     if (int128_gt(shft_res2, mask1)) {
223377fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask1);
223477fca794SSong Gao     } else if (int128_lt(shft_res2, int128_neg(mask2))) {
223577fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask2);
223677fca794SSong Gao     } else {
223777fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
223877fca794SSong Gao     }
223977fca794SSong Gao }
224077fca794SSong Gao 
224177fca794SSong Gao void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
224277fca794SSong Gao {
224377fca794SSong Gao     int i;
224477fca794SSong Gao     Int128 mask1, mask2;
224577fca794SSong Gao     VReg *Vd = (VReg *)vd;
224677fca794SSong Gao     VReg *Vj = (VReg *)vj;
224777fca794SSong Gao     int oprsz = simd_oprsz(desc);
224877fca794SSong Gao 
2249162cd32cSSong Gao     mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one());
2250162cd32cSSong Gao     mask2  = int128_lshift(int128_one(), 64);
2251162cd32cSSong Gao 
225277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
225377fca794SSong Gao         do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2);
2254162cd32cSSong Gao     }
2255162cd32cSSong Gao }
2256162cd32cSSong Gao 
2257162cd32cSSong Gao VSSRARNUI(vssrarni_bu_h, 16, B, H)
2258162cd32cSSong Gao VSSRARNUI(vssrarni_hu_w, 32, H, W)
2259162cd32cSSong Gao VSSRARNUI(vssrarni_wu_d, 64, W, D)
22602e105e12SSong Gao 
/*
 * DO_2OP - define an element-wise unary helper: for each BIT-wide element E
 * covered by the operation size in @desc, Vd->E = DO_OP(Vj->E).
 */
#define DO_2OP(NAME, BIT, E, DO_OP)                  \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
{                                                    \
    VReg *Vd = vd;                                   \
    VReg *Vj = vj;                                   \
    const int nelem = simd_oprsz(desc) / (BIT / 8);  \
    int n;                                           \
                                                     \
    for (n = 0; n < nelem; n++) {                    \
        Vd->E(n) = DO_OP(Vj->E(n));                  \
    }                                                \
}
22742e105e12SSong Gao 
22752e105e12SSong Gao #define DO_CLO_B(N)  (clz32(~N & 0xff) - 24)
22762e105e12SSong Gao #define DO_CLO_H(N)  (clz32(~N & 0xffff) - 16)
22772e105e12SSong Gao #define DO_CLO_W(N)  (clz32(~N))
22782e105e12SSong Gao #define DO_CLO_D(N)  (clz64(~N))
22792e105e12SSong Gao #define DO_CLZ_B(N)  (clz32(N) - 24)
22802e105e12SSong Gao #define DO_CLZ_H(N)  (clz32(N) - 16)
22812e105e12SSong Gao #define DO_CLZ_W(N)  (clz32(N))
22822e105e12SSong Gao #define DO_CLZ_D(N)  (clz64(N))
22832e105e12SSong Gao 
22842e105e12SSong Gao DO_2OP(vclo_b, 8, UB, DO_CLO_B)
22852e105e12SSong Gao DO_2OP(vclo_h, 16, UH, DO_CLO_H)
22862e105e12SSong Gao DO_2OP(vclo_w, 32, UW, DO_CLO_W)
22872e105e12SSong Gao DO_2OP(vclo_d, 64, UD, DO_CLO_D)
22882e105e12SSong Gao DO_2OP(vclz_b, 8, UB, DO_CLZ_B)
22892e105e12SSong Gao DO_2OP(vclz_h, 16, UH, DO_CLZ_H)
22902e105e12SSong Gao DO_2OP(vclz_w, 32, UW, DO_CLZ_W)
22912e105e12SSong Gao DO_2OP(vclz_d, 64, UD, DO_CLZ_D)
2292bb22ee57SSong Gao 
2293bb22ee57SSong Gao #define VPCNT(NAME, BIT, E, FN)                      \
2294ff27e335SSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
2295bb22ee57SSong Gao {                                                    \
2296bb22ee57SSong Gao     int i;                                           \
2297ff27e335SSong Gao     VReg *Vd = (VReg *)vd;                           \
2298ff27e335SSong Gao     VReg *Vj = (VReg *)vj;                           \
2299956dec74SSong Gao     int oprsz = simd_oprsz(desc);                    \
2300bb22ee57SSong Gao                                                      \
2301956dec74SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++)          \
2302bb22ee57SSong Gao     {                                                \
2303bb22ee57SSong Gao         Vd->E(i) = FN(Vj->E(i));                     \
2304bb22ee57SSong Gao     }                                                \
2305bb22ee57SSong Gao }
2306bb22ee57SSong Gao 
2307bb22ee57SSong Gao VPCNT(vpcnt_b, 8, UB, ctpop8)
2308bb22ee57SSong Gao VPCNT(vpcnt_h, 16, UH, ctpop16)
2309bb22ee57SSong Gao VPCNT(vpcnt_w, 32, UW, ctpop32)
2310bb22ee57SSong Gao VPCNT(vpcnt_d, 64, UD, ctpop64)
23110b1e6705SSong Gao 
/*
 * Clear, set or toggle bit number @bit of @a, using the 64-bit mask
 * (1ull << bit).  Both arguments are parenthesised so that compound
 * expressions (e.g. "x | y" or "n % 8") are evaluated as a unit.
 */
#define DO_BITCLR(a, bit) ((a) & ~(1ull << (bit)))
#define DO_BITSET(a, bit) ((a) | (1ull << (bit)))
#define DO_BITREV(a, bit) ((a) ^ (1ull << (bit)))
23150b1e6705SSong Gao 
23160b1e6705SSong Gao #define DO_BIT(NAME, BIT, E, DO_OP)                            \
23171b3e242fSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
23180b1e6705SSong Gao {                                                              \
23190b1e6705SSong Gao     int i;                                                     \
23200b1e6705SSong Gao     VReg *Vd = (VReg *)vd;                                     \
23210b1e6705SSong Gao     VReg *Vj = (VReg *)vj;                                     \
23220b1e6705SSong Gao     VReg *Vk = (VReg *)vk;                                     \
23231b3e242fSSong Gao     int oprsz = simd_oprsz(desc);                              \
23240b1e6705SSong Gao                                                                \
23251b3e242fSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
23260b1e6705SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT);              \
23270b1e6705SSong Gao     }                                                          \
23280b1e6705SSong Gao }
23290b1e6705SSong Gao 
23300b1e6705SSong Gao DO_BIT(vbitclr_b, 8, UB, DO_BITCLR)
23310b1e6705SSong Gao DO_BIT(vbitclr_h, 16, UH, DO_BITCLR)
23320b1e6705SSong Gao DO_BIT(vbitclr_w, 32, UW, DO_BITCLR)
23330b1e6705SSong Gao DO_BIT(vbitclr_d, 64, UD, DO_BITCLR)
23340b1e6705SSong Gao DO_BIT(vbitset_b, 8, UB, DO_BITSET)
23350b1e6705SSong Gao DO_BIT(vbitset_h, 16, UH, DO_BITSET)
23360b1e6705SSong Gao DO_BIT(vbitset_w, 32, UW, DO_BITSET)
23370b1e6705SSong Gao DO_BIT(vbitset_d, 64, UD, DO_BITSET)
23380b1e6705SSong Gao DO_BIT(vbitrev_b, 8, UB, DO_BITREV)
23390b1e6705SSong Gao DO_BIT(vbitrev_h, 16, UH, DO_BITREV)
23400b1e6705SSong Gao DO_BIT(vbitrev_w, 32, UW, DO_BITREV)
23410b1e6705SSong Gao DO_BIT(vbitrev_d, 64, UD, DO_BITREV)
23420b1e6705SSong Gao 
23430b1e6705SSong Gao #define DO_BITI(NAME, BIT, E, DO_OP)                               \
23441b3e242fSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
23450b1e6705SSong Gao {                                                                  \
23460b1e6705SSong Gao     int i;                                                         \
23470b1e6705SSong Gao     VReg *Vd = (VReg *)vd;                                         \
23480b1e6705SSong Gao     VReg *Vj = (VReg *)vj;                                         \
23491b3e242fSSong Gao     int oprsz = simd_oprsz(desc);                                  \
23500b1e6705SSong Gao                                                                    \
23511b3e242fSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
23520b1e6705SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), imm);                           \
23530b1e6705SSong Gao     }                                                              \
23540b1e6705SSong Gao }
23550b1e6705SSong Gao 
23560b1e6705SSong Gao DO_BITI(vbitclri_b, 8, UB, DO_BITCLR)
23570b1e6705SSong Gao DO_BITI(vbitclri_h, 16, UH, DO_BITCLR)
23580b1e6705SSong Gao DO_BITI(vbitclri_w, 32, UW, DO_BITCLR)
23590b1e6705SSong Gao DO_BITI(vbitclri_d, 64, UD, DO_BITCLR)
23600b1e6705SSong Gao DO_BITI(vbitseti_b, 8, UB, DO_BITSET)
23610b1e6705SSong Gao DO_BITI(vbitseti_h, 16, UH, DO_BITSET)
23620b1e6705SSong Gao DO_BITI(vbitseti_w, 32, UW, DO_BITSET)
23630b1e6705SSong Gao DO_BITI(vbitseti_d, 64, UD, DO_BITSET)
23640b1e6705SSong Gao DO_BITI(vbitrevi_b, 8, UB, DO_BITREV)
23650b1e6705SSong Gao DO_BITI(vbitrevi_h, 16, UH, DO_BITREV)
23660b1e6705SSong Gao DO_BITI(vbitrevi_w, 32, UW, DO_BITREV)
23670b1e6705SSong Gao DO_BITI(vbitrevi_d, 64, UD, DO_BITREV)
2368ac95a0b9SSong Gao 
2369ac95a0b9SSong Gao #define VFRSTP(NAME, BIT, MASK, E)                             \
237004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2371ac95a0b9SSong Gao {                                                              \
2372abee168eSSong Gao     int i, j, m, ofs;                                          \
237304711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
237404711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
237504711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
2376abee168eSSong Gao     int oprsz = simd_oprsz(desc);                              \
2377ac95a0b9SSong Gao                                                                \
2378abee168eSSong Gao     ofs = LSX_LEN / BIT;                                       \
2379abee168eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                         \
2380abee168eSSong Gao         m = Vk->E(i * ofs) & MASK;                             \
2381abee168eSSong Gao         for (j = 0; j < ofs; j++) {                            \
2382abee168eSSong Gao             if (Vj->E(j + ofs * i) < 0) {                      \
2383ac95a0b9SSong Gao                 break;                                         \
2384ac95a0b9SSong Gao             }                                                  \
2385ac95a0b9SSong Gao         }                                                      \
2386abee168eSSong Gao         Vd->E(m + i * ofs) = j;                                \
2387abee168eSSong Gao     }                                                          \
2388ac95a0b9SSong Gao }
2389ac95a0b9SSong Gao 
2390ac95a0b9SSong Gao VFRSTP(vfrstp_b, 8, 0xf, B)
2391ac95a0b9SSong Gao VFRSTP(vfrstp_h, 16, 0x7, H)
2392ac95a0b9SSong Gao 
2393ac95a0b9SSong Gao #define VFRSTPI(NAME, BIT, E)                                      \
2394329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2395ac95a0b9SSong Gao {                                                                  \
2396abee168eSSong Gao     int i, j, m, ofs;                                              \
2397329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
2398329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
2399abee168eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
2400ac95a0b9SSong Gao                                                                    \
2401abee168eSSong Gao     ofs = LSX_LEN / BIT;                                           \
2402abee168eSSong Gao     m = imm % ofs;                                                 \
2403abee168eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                             \
2404abee168eSSong Gao         for (j = 0; j < ofs; j++) {                                \
2405abee168eSSong Gao             if (Vj->E(j + ofs * i) < 0) {                          \
2406ac95a0b9SSong Gao                 break;                                             \
2407ac95a0b9SSong Gao             }                                                      \
2408ac95a0b9SSong Gao         }                                                          \
2409abee168eSSong Gao         Vd->E(m + i * ofs) = j;                                    \
2410abee168eSSong Gao     }                                                              \
2411ac95a0b9SSong Gao }
2412ac95a0b9SSong Gao 
2413ac95a0b9SSong Gao VFRSTPI(vfrstpi_b, 8,  B)
2414ac95a0b9SSong Gao VFRSTPI(vfrstpi_h, 16, H)
2415aca67472SSong Gao 
2416aca67472SSong Gao static void vec_update_fcsr0_mask(CPULoongArchState *env,
2417aca67472SSong Gao                                   uintptr_t pc, int mask)
2418aca67472SSong Gao {
2419aca67472SSong Gao     int flags = get_float_exception_flags(&env->fp_status);
2420aca67472SSong Gao 
2421aca67472SSong Gao     set_float_exception_flags(0, &env->fp_status);
2422aca67472SSong Gao 
2423aca67472SSong Gao     flags &= ~mask;
2424aca67472SSong Gao 
2425aca67472SSong Gao     if (flags) {
2426aca67472SSong Gao         flags = ieee_ex_to_loongarch(flags);
2427aca67472SSong Gao         UPDATE_FP_CAUSE(env->fcsr0, flags);
2428aca67472SSong Gao     }
2429aca67472SSong Gao 
2430aca67472SSong Gao     if (GET_FP_ENABLES(env->fcsr0) & flags) {
2431aca67472SSong Gao         do_raise_exception(env, EXCCODE_FPE, pc);
2432aca67472SSong Gao     } else {
2433aca67472SSong Gao         UPDATE_FP_FLAGS(env->fcsr0, flags);
2434aca67472SSong Gao     }
2435aca67472SSong Gao }
2436aca67472SSong Gao 
2437aca67472SSong Gao static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc)
2438aca67472SSong Gao {
2439aca67472SSong Gao     vec_update_fcsr0_mask(env, pc, 0);
2440aca67472SSong Gao }
2441aca67472SSong Gao 
2442aca67472SSong Gao static inline void vec_clear_cause(CPULoongArchState *env)
2443aca67472SSong Gao {
2444aca67472SSong Gao     SET_FP_CAUSE(env->fcsr0, 0);
2445aca67472SSong Gao }
2446aca67472SSong Gao 
2447aca67472SSong Gao #define DO_3OP_F(NAME, BIT, E, FN)                          \
24483b286753SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk,             \
24493b286753SSong Gao                   CPULoongArchState *env, uint32_t desc)    \
2450aca67472SSong Gao {                                                           \
2451aca67472SSong Gao     int i;                                                  \
24523b286753SSong Gao     VReg *Vd = (VReg *)vd;                                  \
24533b286753SSong Gao     VReg *Vj = (VReg *)vj;                                  \
24543b286753SSong Gao     VReg *Vk = (VReg *)vk;                                  \
2455c9caf158SSong Gao     int oprsz = simd_oprsz(desc);                           \
2456aca67472SSong Gao                                                             \
2457aca67472SSong Gao     vec_clear_cause(env);                                   \
2458c9caf158SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {               \
2459aca67472SSong Gao         Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
2460aca67472SSong Gao         vec_update_fcsr0(env, GETPC());                     \
2461aca67472SSong Gao     }                                                       \
2462aca67472SSong Gao }
2463aca67472SSong Gao 
2464aca67472SSong Gao DO_3OP_F(vfadd_s, 32, UW, float32_add)
2465aca67472SSong Gao DO_3OP_F(vfadd_d, 64, UD, float64_add)
2466aca67472SSong Gao DO_3OP_F(vfsub_s, 32, UW, float32_sub)
2467aca67472SSong Gao DO_3OP_F(vfsub_d, 64, UD, float64_sub)
2468aca67472SSong Gao DO_3OP_F(vfmul_s, 32, UW, float32_mul)
2469aca67472SSong Gao DO_3OP_F(vfmul_d, 64, UD, float64_mul)
2470aca67472SSong Gao DO_3OP_F(vfdiv_s, 32, UW, float32_div)
2471aca67472SSong Gao DO_3OP_F(vfdiv_d, 64, UD, float64_div)
2472aca67472SSong Gao DO_3OP_F(vfmax_s, 32, UW, float32_maxnum)
2473aca67472SSong Gao DO_3OP_F(vfmax_d, 64, UD, float64_maxnum)
2474aca67472SSong Gao DO_3OP_F(vfmin_s, 32, UW, float32_minnum)
2475aca67472SSong Gao DO_3OP_F(vfmin_d, 64, UD, float64_minnum)
2476aca67472SSong Gao DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag)
2477aca67472SSong Gao DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag)
2478aca67472SSong Gao DO_3OP_F(vfmina_s, 32, UW, float32_minnummag)
2479aca67472SSong Gao DO_3OP_F(vfmina_d, 64, UD, float64_minnummag)
2480aca67472SSong Gao 
2481aca67472SSong Gao #define DO_4OP_F(NAME, BIT, E, FN, flags)                                    \
2482e2600dadSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, void *va,                    \
2483e2600dadSSong Gao                   CPULoongArchState *env, uint32_t desc)                     \
2484aca67472SSong Gao {                                                                            \
2485aca67472SSong Gao     int i;                                                                   \
2486e2600dadSSong Gao     VReg *Vd = (VReg *)vd;                                                   \
2487e2600dadSSong Gao     VReg *Vj = (VReg *)vj;                                                   \
2488e2600dadSSong Gao     VReg *Vk = (VReg *)vk;                                                   \
2489e2600dadSSong Gao     VReg *Va = (VReg *)va;                                                   \
2490c9caf158SSong Gao     int oprsz = simd_oprsz(desc);                                            \
2491aca67472SSong Gao                                                                              \
2492aca67472SSong Gao     vec_clear_cause(env);                                                    \
2493c9caf158SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                                \
2494aca67472SSong Gao         Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \
2495aca67472SSong Gao         vec_update_fcsr0(env, GETPC());                                      \
2496aca67472SSong Gao     }                                                                        \
2497aca67472SSong Gao }
2498aca67472SSong Gao 
2499aca67472SSong Gao DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0)
2500aca67472SSong Gao DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0)
2501aca67472SSong Gao DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c)
2502aca67472SSong Gao DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c)
2503aca67472SSong Gao DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result)
2504aca67472SSong Gao DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result)
2505aca67472SSong Gao DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd,
2506aca67472SSong Gao          float_muladd_negate_c | float_muladd_negate_result)
2507aca67472SSong Gao DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd,
2508aca67472SSong Gao          float_muladd_negate_c | float_muladd_negate_result)
2509aca67472SSong Gao 
/*
 * DO_2OP_F - define an element-wise single-source floating-point helper.
 *
 * NAME: helper name handed to HELPER()
 * BIT:  element width in bits
 * E:    VReg element accessor matching BIT
 * FN:   per-element routine invoked as FN(env, Vj[n])
 *
 * Only FCSR0.Cause is cleared here; this macro performs no FCSR0 update
 * of its own, so any exception bookkeeping has to happen inside FN.
 */
#define DO_2OP_F(NAME, BIT, E, FN)                           \
void HELPER(NAME)(void *vd, void *vj,                        \
                  CPULoongArchState *env, uint32_t desc)     \
{                                                            \
    VReg *Vd = (VReg *)vd;                                   \
    VReg *Vj = (VReg *)vj;                                   \
    const int nelem = (int)simd_oprsz(desc) / (BIT / 8);     \
    int n;                                                   \
                                                             \
    vec_clear_cause(env);                                    \
    for (n = 0; n < nelem; n++) {                            \
        Vd->E(n) = FN(env, Vj->E(n));                        \
    }                                                        \
}
2524aca67472SSong Gao 
2525aca67472SSong Gao #define FLOGB(BIT, T)                                            \
2526aca67472SSong Gao static T do_flogb_## BIT(CPULoongArchState *env, T fj)           \
2527aca67472SSong Gao {                                                                \
2528aca67472SSong Gao     T fp, fd;                                                    \
2529aca67472SSong Gao     float_status *status = &env->fp_status;                      \
2530aca67472SSong Gao     FloatRoundMode old_mode = get_float_rounding_mode(status);   \
2531aca67472SSong Gao                                                                  \
2532aca67472SSong Gao     set_float_rounding_mode(float_round_down, status);           \
2533aca67472SSong Gao     fp = float ## BIT ##_log2(fj, status);                       \
2534aca67472SSong Gao     fd = float ## BIT ##_round_to_int(fp, status);               \
2535aca67472SSong Gao     set_float_rounding_mode(old_mode, status);                   \
2536aca67472SSong Gao     vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact);     \
2537aca67472SSong Gao     return fd;                                                   \
2538aca67472SSong Gao }
2539aca67472SSong Gao 
2540aca67472SSong Gao FLOGB(32, uint32_t)
2541aca67472SSong Gao FLOGB(64, uint64_t)
2542aca67472SSong Gao 
2543aca67472SSong Gao #define FCLASS(NAME, BIT, E, FN)                         \
2544226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                    \
2545226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc) \
2546aca67472SSong Gao {                                                        \
2547aca67472SSong Gao     int i;                                               \
2548226bf881SSong Gao     VReg *Vd = (VReg *)vd;                               \
2549226bf881SSong Gao     VReg *Vj = (VReg *)vj;                               \
2550c9caf158SSong Gao     int oprsz = simd_oprsz(desc);                        \
2551aca67472SSong Gao                                                          \
2552c9caf158SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {            \
2553aca67472SSong Gao         Vd->E(i) = FN(env, Vj->E(i));                    \
2554aca67472SSong Gao     }                                                    \
2555aca67472SSong Gao }
2556aca67472SSong Gao 
2557aca67472SSong Gao FCLASS(vfclass_s, 32, UW, helper_fclass_s)
2558aca67472SSong Gao FCLASS(vfclass_d, 64, UD, helper_fclass_d)
2559aca67472SSong Gao 
2560aca67472SSong Gao #define FSQRT(BIT, T)                                  \
2561aca67472SSong Gao static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \
2562aca67472SSong Gao {                                                      \
2563aca67472SSong Gao     T fd;                                              \
2564aca67472SSong Gao     fd = float ## BIT ##_sqrt(fj, &env->fp_status);    \
2565aca67472SSong Gao     vec_update_fcsr0(env, GETPC());                    \
2566aca67472SSong Gao     return fd;                                         \
2567aca67472SSong Gao }
2568aca67472SSong Gao 
2569aca67472SSong Gao FSQRT(32, uint32_t)
2570aca67472SSong Gao FSQRT(64, uint64_t)
2571aca67472SSong Gao 
2572aca67472SSong Gao #define FRECIP(BIT, T)                                                  \
2573aca67472SSong Gao static T do_frecip_## BIT(CPULoongArchState *env, T fj)                 \
2574aca67472SSong Gao {                                                                       \
2575aca67472SSong Gao     T fd;                                                               \
2576aca67472SSong Gao     fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \
2577aca67472SSong Gao     vec_update_fcsr0(env, GETPC());                                     \
2578aca67472SSong Gao     return fd;                                                          \
2579aca67472SSong Gao }
2580aca67472SSong Gao 
2581aca67472SSong Gao FRECIP(32, uint32_t)
2582aca67472SSong Gao FRECIP(64, uint64_t)
2583aca67472SSong Gao 
2584aca67472SSong Gao #define FRSQRT(BIT, T)                                                  \
2585aca67472SSong Gao static T do_frsqrt_## BIT(CPULoongArchState *env, T fj)                 \
2586aca67472SSong Gao {                                                                       \
2587aca67472SSong Gao     T fd, fp;                                                           \
2588aca67472SSong Gao     fp = float ## BIT ##_sqrt(fj, &env->fp_status);                     \
2589aca67472SSong Gao     fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \
2590aca67472SSong Gao     vec_update_fcsr0(env, GETPC());                                     \
2591aca67472SSong Gao     return fd;                                                          \
2592aca67472SSong Gao }
2593aca67472SSong Gao 
2594aca67472SSong Gao FRSQRT(32, uint32_t)
2595aca67472SSong Gao FRSQRT(64, uint64_t)
2596aca67472SSong Gao 
2597aca67472SSong Gao DO_2OP_F(vflogb_s, 32, UW, do_flogb_32)
2598aca67472SSong Gao DO_2OP_F(vflogb_d, 64, UD, do_flogb_64)
2599aca67472SSong Gao DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32)
2600aca67472SSong Gao DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64)
2601aca67472SSong Gao DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32)
2602aca67472SSong Gao DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64)
2603aca67472SSong Gao DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32)
2604aca67472SSong Gao DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64)
2605399665d2SSong Gao 
2606399665d2SSong Gao static uint32_t float16_cvt_float32(uint16_t h, float_status *status)
2607399665d2SSong Gao {
2608399665d2SSong Gao     return float16_to_float32(h, true, status);
2609399665d2SSong Gao }
2610399665d2SSong Gao static uint64_t float32_cvt_float64(uint32_t s, float_status *status)
2611399665d2SSong Gao {
2612399665d2SSong Gao     return float32_to_float64(s, status);
2613399665d2SSong Gao }
2614399665d2SSong Gao 
2615399665d2SSong Gao static uint16_t float32_cvt_float16(uint32_t s, float_status *status)
2616399665d2SSong Gao {
2617399665d2SSong Gao     return float32_to_float16(s, true, status);
2618399665d2SSong Gao }
2619399665d2SSong Gao static uint32_t float64_cvt_float32(uint64_t d, float_status *status)
2620399665d2SSong Gao {
2621399665d2SSong Gao     return float64_to_float32(d, status);
2622399665d2SSong Gao }
2623399665d2SSong Gao 
2624226bf881SSong Gao void HELPER(vfcvtl_s_h)(void *vd, void *vj,
2625226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2626399665d2SSong Gao {
262760df31a2SSong Gao     int i, j, ofs;
262860df31a2SSong Gao     VReg temp = {};
2629226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2630226bf881SSong Gao     VReg *Vj = (VReg *)vj;
263160df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2632399665d2SSong Gao 
263360df31a2SSong Gao     ofs = LSX_LEN / 32;
2634399665d2SSong Gao     vec_clear_cause(env);
263560df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
263660df31a2SSong Gao         for (j = 0; j < ofs; j++) {
263760df31a2SSong Gao             temp.UW(j + ofs * i) =float16_cvt_float32(Vj->UH(j + ofs * 2 * i),
263860df31a2SSong Gao                                                       &env->fp_status);
263960df31a2SSong Gao         }
2640399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2641399665d2SSong Gao     }
2642399665d2SSong Gao     *Vd = temp;
2643399665d2SSong Gao }
2644399665d2SSong Gao 
2645226bf881SSong Gao void HELPER(vfcvtl_d_s)(void *vd, void *vj,
2646226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2647399665d2SSong Gao {
264860df31a2SSong Gao     int i, j, ofs;
264960df31a2SSong Gao     VReg temp = {};
2650226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2651226bf881SSong Gao     VReg *Vj = (VReg *)vj;
265260df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2653399665d2SSong Gao 
265460df31a2SSong Gao     ofs = LSX_LEN / 64;
2655399665d2SSong Gao     vec_clear_cause(env);
265660df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
265760df31a2SSong Gao         for (j = 0; j < ofs; j++) {
265860df31a2SSong Gao             temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i),
265960df31a2SSong Gao                                                        &env->fp_status);
266060df31a2SSong Gao         }
2661399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2662399665d2SSong Gao     }
2663399665d2SSong Gao     *Vd = temp;
2664399665d2SSong Gao }
2665399665d2SSong Gao 
2666226bf881SSong Gao void HELPER(vfcvth_s_h)(void *vd, void *vj,
2667226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2668399665d2SSong Gao {
266960df31a2SSong Gao     int i, j, ofs;
267060df31a2SSong Gao     VReg temp = {};
2671226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2672226bf881SSong Gao     VReg *Vj = (VReg *)vj;
267360df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2674399665d2SSong Gao 
267560df31a2SSong Gao     ofs = LSX_LEN / 32;
2676399665d2SSong Gao     vec_clear_cause(env);
267760df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
267860df31a2SSong Gao         for (j = 0; j < ofs; j++) {
267960df31a2SSong Gao             temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)),
268060df31a2SSong Gao                                                        &env->fp_status);
268160df31a2SSong Gao         }
2682399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2683399665d2SSong Gao     }
2684399665d2SSong Gao     *Vd = temp;
2685399665d2SSong Gao }
2686399665d2SSong Gao 
2687226bf881SSong Gao void HELPER(vfcvth_d_s)(void *vd, void *vj,
2688226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2689399665d2SSong Gao {
269060df31a2SSong Gao     int i, j, ofs;
269160df31a2SSong Gao     VReg temp = {};
2692226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2693226bf881SSong Gao     VReg *Vj = (VReg *)vj;
269460df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2695399665d2SSong Gao 
269660df31a2SSong Gao     ofs = LSX_LEN / 64;
2697399665d2SSong Gao     vec_clear_cause(env);
269860df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
269960df31a2SSong Gao         for (j = 0; j < ofs; j++) {
270060df31a2SSong Gao             temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)),
270160df31a2SSong Gao                                                         &env->fp_status);
270260df31a2SSong Gao         }
2703399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2704399665d2SSong Gao     }
2705399665d2SSong Gao     *Vd = temp;
2706399665d2SSong Gao }
2707399665d2SSong Gao 
27083b286753SSong Gao void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
27093b286753SSong Gao                        CPULoongArchState *env, uint32_t desc)
2710399665d2SSong Gao {
271160df31a2SSong Gao     int i, j, ofs;
271260df31a2SSong Gao     VReg temp = {};
27133b286753SSong Gao     VReg *Vd = (VReg *)vd;
27143b286753SSong Gao     VReg *Vj = (VReg *)vj;
27153b286753SSong Gao     VReg *Vk = (VReg *)vk;
271660df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2717399665d2SSong Gao 
271860df31a2SSong Gao     ofs = LSX_LEN / 32;
2719399665d2SSong Gao     vec_clear_cause(env);
272060df31a2SSong Gao     for(i = 0; i < oprsz / 16; i++) {
272160df31a2SSong Gao         for (j = 0; j < ofs; j++) {
272260df31a2SSong Gao             temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i),
272360df31a2SSong Gao                                                                  &env->fp_status);
272460df31a2SSong Gao             temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i),
272560df31a2SSong Gao                                                            &env->fp_status);
272660df31a2SSong Gao         }
2727399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2728399665d2SSong Gao     }
2729399665d2SSong Gao     *Vd = temp;
2730399665d2SSong Gao }
2731399665d2SSong Gao 
27323b286753SSong Gao void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
27333b286753SSong Gao                        CPULoongArchState *env, uint32_t desc)
2734399665d2SSong Gao {
273560df31a2SSong Gao     int i, j, ofs;
273660df31a2SSong Gao     VReg temp = {};
27373b286753SSong Gao     VReg *Vd = (VReg *)vd;
27383b286753SSong Gao     VReg *Vj = (VReg *)vj;
27393b286753SSong Gao     VReg *Vk = (VReg *)vk;
274060df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2741399665d2SSong Gao 
274260df31a2SSong Gao     ofs = LSX_LEN / 64;
2743399665d2SSong Gao     vec_clear_cause(env);
274460df31a2SSong Gao     for(i = 0; i < oprsz / 16; i++) {
274560df31a2SSong Gao         for (j = 0; j < ofs; j++) {
274660df31a2SSong Gao             temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i),
274760df31a2SSong Gao                                                                  &env->fp_status);
274860df31a2SSong Gao             temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i),
274960df31a2SSong Gao                                                            &env->fp_status);
275060df31a2SSong Gao         }
2751399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2752399665d2SSong Gao     }
2753399665d2SSong Gao     *Vd = temp;
2754399665d2SSong Gao }
2755399665d2SSong Gao 
2756226bf881SSong Gao void HELPER(vfrint_s)(void *vd, void *vj,
2757226bf881SSong Gao                       CPULoongArchState *env, uint32_t desc)
2758399665d2SSong Gao {
2759399665d2SSong Gao     int i;
2760226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2761226bf881SSong Gao     VReg *Vj = (VReg *)vj;
276260df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2763399665d2SSong Gao 
2764399665d2SSong Gao     vec_clear_cause(env);
276560df31a2SSong Gao     for (i = 0; i < oprsz / 4; i++) {
2766399665d2SSong Gao         Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
2767399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2768399665d2SSong Gao     }
2769399665d2SSong Gao }
2770399665d2SSong Gao 
2771226bf881SSong Gao void HELPER(vfrint_d)(void *vd, void *vj,
2772226bf881SSong Gao                       CPULoongArchState *env, uint32_t desc)
2773399665d2SSong Gao {
2774399665d2SSong Gao     int i;
2775226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2776226bf881SSong Gao     VReg *Vj = (VReg *)vj;
277760df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2778399665d2SSong Gao 
2779399665d2SSong Gao     vec_clear_cause(env);
278060df31a2SSong Gao     for (i = 0; i < oprsz / 8; i++) {
2781399665d2SSong Gao         Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status);
2782399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2783399665d2SSong Gao     }
2784399665d2SSong Gao }
2785399665d2SSong Gao 
2786399665d2SSong Gao #define FCVT_2OP(NAME, BIT, E, MODE)                                        \
2787226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                                       \
2788226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc)                    \
2789399665d2SSong Gao {                                                                           \
2790399665d2SSong Gao     int i;                                                                  \
2791226bf881SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
2792226bf881SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
279360df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                           \
2794399665d2SSong Gao                                                                             \
2795399665d2SSong Gao     vec_clear_cause(env);                                                   \
279660df31a2SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                               \
2797399665d2SSong Gao         FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2798399665d2SSong Gao         set_float_rounding_mode(MODE, &env->fp_status);                     \
2799399665d2SSong Gao         Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \
2800399665d2SSong Gao         set_float_rounding_mode(old_mode, &env->fp_status);                 \
2801399665d2SSong Gao         vec_update_fcsr0(env, GETPC());                                     \
2802399665d2SSong Gao     }                                                                       \
2803399665d2SSong Gao }
2804399665d2SSong Gao 
2805399665d2SSong Gao FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even)
2806399665d2SSong Gao FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even)
2807399665d2SSong Gao FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero)
2808399665d2SSong Gao FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero)
2809399665d2SSong Gao FCVT_2OP(vfrintrp_s, 32, UW, float_round_up)
2810399665d2SSong Gao FCVT_2OP(vfrintrp_d, 64, UD, float_round_up)
2811399665d2SSong Gao FCVT_2OP(vfrintrm_s, 32, UW, float_round_down)
2812399665d2SSong Gao FCVT_2OP(vfrintrm_d, 64, UD, float_round_down)
2813399665d2SSong Gao 
/*
 * FTINT - define do_ftint<NAME>(): convert one FMT1 floating-point value
 * to the integer format FMT2 under the fixed rounding mode MODE.
 *
 * NAME: suffix of the generated function name
 * FMT1: softfloat source format (float32 / float64)
 * FMT2: softfloat destination format (int32, int64, uint32, uint64)
 * T1:   C type carrying the source value
 * T2:   C type carrying the result
 * MODE: FloatRoundMode applied during the conversion
 *
 * Returns the converted value, or 0 when the conversion raised
 * float_flag_invalid and the input is a NaN.
 *
 * The conversion is done here rather than through do_<FMT1>_to_<FMT2>()
 * so that the caller's rounding mode is restored BEFORE
 * vec_update_fcsr0() runs.  That call may raise EXCCODE_FPE; restoring
 * the mode only afterwards would leave env->fp_status in MODE whenever
 * the exception is taken.  This matches the ordering used by FLOGB and
 * FCVT_2OP in this file.
 *
 * NOTE(review): GETPC() is evaluated in this static function rather than
 * in the HELPER() entry point - confirm it is always inlined there.
 */
#define FTINT(NAME, FMT1, FMT2, T1, T2,  MODE)                              \
static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj)                   \
{                                                                           \
    T2 fd;                                                                  \
    float_status *status = &env->fp_status;                                 \
    FloatRoundMode old_mode = get_float_rounding_mode(status);              \
                                                                            \
    set_float_rounding_mode(MODE, status);                                  \
    fd = FMT1 ##_to_## FMT2(fj, status);                                    \
    /* Restore first: the FCSR0 update below may not return. */             \
    set_float_rounding_mode(old_mode, status);                              \
                                                                            \
    if ((get_float_exception_flags(status) & float_flag_invalid) &&         \
        FMT1 ##_is_any_nan(fj)) {                                           \
        fd = 0;                                                             \
    }                                                                       \
    vec_update_fcsr0(env, GETPC());                                         \
    return fd;                                                              \
}
2825399665d2SSong Gao 
2826399665d2SSong Gao #define DO_FTINT(FMT1, FMT2, T1, T2)                                         \
2827399665d2SSong Gao static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj)            \
2828399665d2SSong Gao {                                                                            \
2829399665d2SSong Gao     T2 fd;                                                                   \
2830399665d2SSong Gao                                                                              \
2831399665d2SSong Gao     fd = FMT1 ##_to_## FMT2(fj, &env->fp_status);                            \
2832399665d2SSong Gao     if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \
2833399665d2SSong Gao         if (FMT1 ##_is_any_nan(fj)) {                                        \
2834399665d2SSong Gao             fd = 0;                                                          \
2835399665d2SSong Gao         }                                                                    \
2836399665d2SSong Gao     }                                                                        \
2837399665d2SSong Gao     vec_update_fcsr0(env, GETPC());                                          \
2838399665d2SSong Gao     return fd;                                                               \
2839399665d2SSong Gao }
2840399665d2SSong Gao 
2841399665d2SSong Gao DO_FTINT(float32, int32, uint32_t, uint32_t)
2842399665d2SSong Gao DO_FTINT(float64, int64, uint64_t, uint64_t)
2843399665d2SSong Gao DO_FTINT(float32, uint32, uint32_t, uint32_t)
2844399665d2SSong Gao DO_FTINT(float64, uint64, uint64_t, uint64_t)
2845399665d2SSong Gao DO_FTINT(float64, int32, uint64_t, uint32_t)
2846399665d2SSong Gao DO_FTINT(float32, int64, uint32_t, uint64_t)
2847399665d2SSong Gao 
2848399665d2SSong Gao FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even)
2849399665d2SSong Gao FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even)
2850399665d2SSong Gao FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up)
2851399665d2SSong Gao FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up)
2852399665d2SSong Gao FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero)
2853399665d2SSong Gao FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero)
2854399665d2SSong Gao FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down)
2855399665d2SSong Gao FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down)
2856399665d2SSong Gao 
2857399665d2SSong Gao DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s)
2858399665d2SSong Gao DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d)
2859399665d2SSong Gao DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s)
2860399665d2SSong Gao DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d)
2861399665d2SSong Gao DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s)
2862399665d2SSong Gao DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d)
2863399665d2SSong Gao DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s)
2864399665d2SSong Gao DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d)
2865399665d2SSong Gao DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32)
2866399665d2SSong Gao DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64)
2867399665d2SSong Gao 
2868399665d2SSong Gao FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero)
2869399665d2SSong Gao FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero)
2870399665d2SSong Gao 
2871399665d2SSong Gao DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s)
2872399665d2SSong Gao DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d)
2873399665d2SSong Gao DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32)
2874399665d2SSong Gao DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64)
2875399665d2SSong Gao 
2876399665d2SSong Gao FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down)
2877399665d2SSong Gao FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up)
2878399665d2SSong Gao FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero)
2879399665d2SSong Gao FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even)
2880399665d2SSong Gao 
2881399665d2SSong Gao #define FTINT_W_D(NAME, FN)                                               \
28823b286753SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk,                           \
28833b286753SSong Gao                   CPULoongArchState *env, uint32_t desc)                  \
2884399665d2SSong Gao {                                                                         \
288560df31a2SSong Gao     int i, j, ofs;                                                        \
288660df31a2SSong Gao     VReg temp = {};                                                       \
28873b286753SSong Gao     VReg *Vd = (VReg *)vd;                                                \
28883b286753SSong Gao     VReg *Vj = (VReg *)vj;                                                \
28893b286753SSong Gao     VReg *Vk = (VReg *)vk;                                                \
289060df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                         \
2891399665d2SSong Gao                                                                           \
289260df31a2SSong Gao     ofs = LSX_LEN / 64;                                                   \
2893399665d2SSong Gao     vec_clear_cause(env);                                                 \
289460df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
289560df31a2SSong Gao         for (j = 0; j < ofs; j++) {                                       \
289660df31a2SSong Gao             temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \
289760df31a2SSong Gao             temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i));       \
289860df31a2SSong Gao         }                                                                 \
2899399665d2SSong Gao     }                                                                     \
2900399665d2SSong Gao     *Vd = temp;                                                           \
2901399665d2SSong Gao }
2902399665d2SSong Gao 
2903399665d2SSong Gao FTINT_W_D(vftint_w_d, do_float64_to_int32)
2904399665d2SSong Gao FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d)
2905399665d2SSong Gao FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d)
2906399665d2SSong Gao FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d)
2907399665d2SSong Gao FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d)
2908399665d2SSong Gao 
2909399665d2SSong Gao FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2910399665d2SSong Gao FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2911399665d2SSong Gao FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2912399665d2SSong Gao FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2913399665d2SSong Gao FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2914399665d2SSong Gao FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2915399665d2SSong Gao FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2916399665d2SSong Gao FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2917399665d2SSong Gao 
2918399665d2SSong Gao #define FTINTL_L_S(NAME, FN)                                        \
2919226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                               \
2920226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc)            \
2921399665d2SSong Gao {                                                                   \
292260df31a2SSong Gao     int i, j, ofs;                                                  \
2923399665d2SSong Gao     VReg temp;                                                      \
2924226bf881SSong Gao     VReg *Vd = (VReg *)vd;                                          \
2925226bf881SSong Gao     VReg *Vj = (VReg *)vj;                                          \
292660df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                   \
2927399665d2SSong Gao                                                                     \
292860df31a2SSong Gao     ofs = LSX_LEN / 64;                                             \
2929399665d2SSong Gao     vec_clear_cause(env);                                           \
293060df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                              \
293160df31a2SSong Gao         for (j = 0; j < ofs; j++) {                                 \
293260df31a2SSong Gao             temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \
293360df31a2SSong Gao         }                                                           \
2934399665d2SSong Gao     }                                                               \
2935399665d2SSong Gao     *Vd = temp;                                                     \
2936399665d2SSong Gao }
2937399665d2SSong Gao 
2938399665d2SSong Gao FTINTL_L_S(vftintl_l_s, do_float32_to_int64)
2939399665d2SSong Gao FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s)
2940399665d2SSong Gao FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s)
2941399665d2SSong Gao FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s)
2942399665d2SSong Gao FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s)
2943399665d2SSong Gao 
2944399665d2SSong Gao #define FTINTH_L_S(NAME, FN)                                              \
2945226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                                     \
2946226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc)                  \
2947399665d2SSong Gao {                                                                         \
294860df31a2SSong Gao     int i, j, ofs;                                                        \
294960df31a2SSong Gao     VReg temp = {};                                                       \
2950226bf881SSong Gao     VReg *Vd = (VReg *)vd;                                                \
2951226bf881SSong Gao     VReg *Vj = (VReg *)vj;                                                \
295260df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                         \
2953399665d2SSong Gao                                                                           \
295460df31a2SSong Gao     ofs = LSX_LEN / 64;                                                   \
2955399665d2SSong Gao     vec_clear_cause(env);                                                 \
295660df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
295760df31a2SSong Gao         for (j = 0; j < ofs; j++) {                                       \
295860df31a2SSong Gao             temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \
295960df31a2SSong Gao         }                                                                 \
2960399665d2SSong Gao     }                                                                     \
2961399665d2SSong Gao     *Vd = temp;                                                           \
2962399665d2SSong Gao }
2963399665d2SSong Gao 
2964399665d2SSong Gao FTINTH_L_S(vftinth_l_s, do_float32_to_int64)
2965399665d2SSong Gao FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s)
2966399665d2SSong Gao FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s)
2967399665d2SSong Gao FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s)
2968399665d2SSong Gao FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s)
2969399665d2SSong Gao 
2970399665d2SSong Gao #define FFINT(NAME, FMT1, FMT2, T1, T2)                    \
2971399665d2SSong Gao static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \
2972399665d2SSong Gao {                                                          \
2973399665d2SSong Gao     T2 fd;                                                 \
2974399665d2SSong Gao                                                            \
2975399665d2SSong Gao     fd = FMT1 ##_to_## FMT2(fj, &env->fp_status);          \
2976399665d2SSong Gao     vec_update_fcsr0(env, GETPC());                        \
2977399665d2SSong Gao     return fd;                                             \
2978399665d2SSong Gao }
2979399665d2SSong Gao 
2980399665d2SSong Gao FFINT(s_w, int32, float32, int32_t, uint32_t)
2981399665d2SSong Gao FFINT(d_l, int64, float64, int64_t, uint64_t)
2982399665d2SSong Gao FFINT(s_wu, uint32, float32, uint32_t, uint32_t)
2983399665d2SSong Gao FFINT(d_lu, uint64, float64, uint64_t, uint64_t)
2984399665d2SSong Gao 
2985399665d2SSong Gao DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w)
2986399665d2SSong Gao DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l)
2987399665d2SSong Gao DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu)
2988399665d2SSong Gao DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu)
2989399665d2SSong Gao 
2990226bf881SSong Gao void HELPER(vffintl_d_w)(void *vd, void *vj,
2991226bf881SSong Gao                          CPULoongArchState *env, uint32_t desc)
2992399665d2SSong Gao {
299360df31a2SSong Gao     int i, j, ofs;
299460df31a2SSong Gao     VReg temp = {};
2995226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2996226bf881SSong Gao     VReg *Vj = (VReg *)vj;
299760df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2998399665d2SSong Gao 
299960df31a2SSong Gao     ofs = LSX_LEN / 64;
3000399665d2SSong Gao     vec_clear_cause(env);
300160df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
300260df31a2SSong Gao         for (j = 0; j < ofs; j++) {
300360df31a2SSong Gao             temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i),
300460df31a2SSong Gao                                                    &env->fp_status);
300560df31a2SSong Gao         }
3006399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
3007399665d2SSong Gao     }
3008399665d2SSong Gao     *Vd = temp;
3009399665d2SSong Gao }
3010399665d2SSong Gao 
3011226bf881SSong Gao void HELPER(vffinth_d_w)(void *vd, void *vj,
3012226bf881SSong Gao                          CPULoongArchState *env, uint32_t desc)
3013399665d2SSong Gao {
301460df31a2SSong Gao     int i, j, ofs;
301560df31a2SSong Gao     VReg temp = {};
3016226bf881SSong Gao     VReg *Vd = (VReg *)vd;
3017226bf881SSong Gao     VReg *Vj = (VReg *)vj;
301860df31a2SSong Gao     int oprsz = simd_oprsz(desc);
3019399665d2SSong Gao 
302060df31a2SSong Gao     ofs = LSX_LEN / 64;
3021399665d2SSong Gao     vec_clear_cause(env);
302260df31a2SSong Gao     for (i = 0; i < oprsz /16; i++) {
302360df31a2SSong Gao         for (j = 0; j < ofs; j++) {
302460df31a2SSong Gao             temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)),
302560df31a2SSong Gao                                                    &env->fp_status);
302660df31a2SSong Gao         }
3027399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
3028399665d2SSong Gao     }
3029399665d2SSong Gao     *Vd = temp;
3030399665d2SSong Gao }
3031399665d2SSong Gao 
30323b286753SSong Gao void HELPER(vffint_s_l)(void *vd, void *vj, void *vk,
30333b286753SSong Gao                         CPULoongArchState *env, uint32_t desc)
3034399665d2SSong Gao {
303560df31a2SSong Gao     int i, j, ofs;
303660df31a2SSong Gao     VReg temp = {};
30373b286753SSong Gao     VReg *Vd = (VReg *)vd;
30383b286753SSong Gao     VReg *Vj = (VReg *)vj;
30393b286753SSong Gao     VReg *Vk = (VReg *)vk;
304060df31a2SSong Gao     int oprsz = simd_oprsz(desc);
3041399665d2SSong Gao 
304260df31a2SSong Gao     ofs = LSX_LEN / 64;
3043399665d2SSong Gao     vec_clear_cause(env);
304460df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
304560df31a2SSong Gao         for (j = 0; j < ofs; j++) {
304660df31a2SSong Gao             temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i),
304760df31a2SSong Gao                                                              &env->fp_status);
304860df31a2SSong Gao             temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i),
304960df31a2SSong Gao                                                        &env->fp_status);
305060df31a2SSong Gao         }
3051399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
3052399665d2SSong Gao     }
3053399665d2SSong Gao     *Vd = temp;
3054399665d2SSong Gao }
3055f435e1e5SSong Gao 
/*
 * Element comparison primitives: each evaluates to -1 (all bits set) when the
 * relation holds and to 0 otherwise.  Both arguments are parenthesised so
 * that operand expressions containing lower-precedence operators (e.g. `&`,
 * `|`, `?:`) are compared as a whole.
 */
#define VSEQ(a, b) ((a) == (b) ? -1 : 0)
#define VSLE(a, b) ((a) <= (b) ? -1 : 0)
#define VSLT(a, b) ((a) < (b) ? -1 : 0)
3059f435e1e5SSong Gao 
3060f435e1e5SSong Gao #define VCMPI(NAME, BIT, E, DO_OP)                                 \
30614da72d43SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3062f435e1e5SSong Gao {                                                                  \
3063f435e1e5SSong Gao     int i;                                                         \
3064f435e1e5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3065f435e1e5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3066f435e1e5SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
30674da72d43SSong Gao     int oprsz = simd_oprsz(desc);                                  \
3068f435e1e5SSong Gao                                                                    \
30694da72d43SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
3070f435e1e5SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), (TD)imm);                       \
3071f435e1e5SSong Gao     }                                                              \
3072f435e1e5SSong Gao }
3073f435e1e5SSong Gao 
3074f435e1e5SSong Gao VCMPI(vseqi_b, 8, B, VSEQ)
3075f435e1e5SSong Gao VCMPI(vseqi_h, 16, H, VSEQ)
3076f435e1e5SSong Gao VCMPI(vseqi_w, 32, W, VSEQ)
3077f435e1e5SSong Gao VCMPI(vseqi_d, 64, D, VSEQ)
3078f435e1e5SSong Gao VCMPI(vslei_b, 8, B, VSLE)
3079f435e1e5SSong Gao VCMPI(vslei_h, 16, H, VSLE)
3080f435e1e5SSong Gao VCMPI(vslei_w, 32, W, VSLE)
3081f435e1e5SSong Gao VCMPI(vslei_d, 64, D, VSLE)
3082f435e1e5SSong Gao VCMPI(vslei_bu, 8, UB, VSLE)
3083f435e1e5SSong Gao VCMPI(vslei_hu, 16, UH, VSLE)
3084f435e1e5SSong Gao VCMPI(vslei_wu, 32, UW, VSLE)
3085f435e1e5SSong Gao VCMPI(vslei_du, 64, UD, VSLE)
3086f435e1e5SSong Gao VCMPI(vslti_b, 8, B, VSLT)
3087f435e1e5SSong Gao VCMPI(vslti_h, 16, H, VSLT)
3088f435e1e5SSong Gao VCMPI(vslti_w, 32, W, VSLT)
3089f435e1e5SSong Gao VCMPI(vslti_d, 64, D, VSLT)
3090f435e1e5SSong Gao VCMPI(vslti_bu, 8, UB, VSLT)
3091f435e1e5SSong Gao VCMPI(vslti_hu, 16, UH, VSLT)
3092f435e1e5SSong Gao VCMPI(vslti_wu, 32, UW, VSLT)
3093f435e1e5SSong Gao VCMPI(vslti_du, 64, UD, VSLT)
3094386c4e86SSong Gao 
3095386c4e86SSong Gao static uint64_t vfcmp_common(CPULoongArchState *env,
3096386c4e86SSong Gao                              FloatRelation cmp, uint32_t flags)
3097386c4e86SSong Gao {
3098386c4e86SSong Gao     uint64_t ret = 0;
3099386c4e86SSong Gao 
3100386c4e86SSong Gao     switch (cmp) {
3101386c4e86SSong Gao     case float_relation_less:
3102386c4e86SSong Gao         ret = (flags & FCMP_LT);
3103386c4e86SSong Gao         break;
3104386c4e86SSong Gao     case float_relation_equal:
3105386c4e86SSong Gao         ret = (flags & FCMP_EQ);
3106386c4e86SSong Gao         break;
3107386c4e86SSong Gao     case float_relation_greater:
3108386c4e86SSong Gao         ret = (flags & FCMP_GT);
3109386c4e86SSong Gao         break;
3110386c4e86SSong Gao     case float_relation_unordered:
3111386c4e86SSong Gao         ret = (flags & FCMP_UN);
3112386c4e86SSong Gao         break;
3113386c4e86SSong Gao     default:
3114386c4e86SSong Gao         g_assert_not_reached();
3115386c4e86SSong Gao     }
3116386c4e86SSong Gao 
3117386c4e86SSong Gao     if (ret) {
3118386c4e86SSong Gao         ret = -1;
3119386c4e86SSong Gao     }
3120386c4e86SSong Gao 
3121386c4e86SSong Gao     return ret;
3122386c4e86SSong Gao }
3123386c4e86SSong Gao 
3124386c4e86SSong Gao #define VFCMP(NAME, BIT, E, FN)                                          \
31253eeda5feSSong Gao void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz,                \
3126386c4e86SSong Gao                   uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \
3127386c4e86SSong Gao {                                                                        \
3128386c4e86SSong Gao     int i;                                                               \
3129386c4e86SSong Gao     VReg t;                                                              \
3130386c4e86SSong Gao     VReg *Vd = &(env->fpr[vd].vreg);                                     \
3131386c4e86SSong Gao     VReg *Vj = &(env->fpr[vj].vreg);                                     \
3132386c4e86SSong Gao     VReg *Vk = &(env->fpr[vk].vreg);                                     \
3133386c4e86SSong Gao                                                                          \
3134386c4e86SSong Gao     vec_clear_cause(env);                                                \
31353eeda5feSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                            \
3136386c4e86SSong Gao         FloatRelation cmp;                                               \
3137386c4e86SSong Gao         cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status);                   \
3138386c4e86SSong Gao         t.E(i) = vfcmp_common(env, cmp, flags);                          \
3139386c4e86SSong Gao         vec_update_fcsr0(env, GETPC());                                  \
3140386c4e86SSong Gao     }                                                                    \
3141386c4e86SSong Gao     *Vd = t;                                                             \
3142386c4e86SSong Gao }
3143386c4e86SSong Gao 
3144386c4e86SSong Gao VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet)
3145386c4e86SSong Gao VFCMP(vfcmp_s_s, 32, UW, float32_compare)
3146386c4e86SSong Gao VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet)
3147386c4e86SSong Gao VFCMP(vfcmp_s_d, 64, UD, float64_compare)
3148d0dfa19aSSong Gao 
3149f3dfcc8bSSong Gao void HELPER(vbitseli_b)(void *vd, void *vj,  uint64_t imm, uint32_t desc)
3150d0dfa19aSSong Gao {
3151d0dfa19aSSong Gao     int i;
3152d0dfa19aSSong Gao     VReg *Vd = (VReg *)vd;
3153d0dfa19aSSong Gao     VReg *Vj = (VReg *)vj;
3154d0dfa19aSSong Gao 
3155f3dfcc8bSSong Gao     for (i = 0; i < simd_oprsz(desc); i++) {
3156d0dfa19aSSong Gao         Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm);
3157d0dfa19aSSong Gao     }
3158d0dfa19aSSong Gao }
3159d0dfa19aSSong Gao 
/*
 * Copy from target/arm/tcg/sve_helper.c
 *
 * Report whether any element of m0 or m1 equals n, treating the two 64-bit
 * words as packed elements of (8 << esz) bits each.
 *
 * n is replicated across a 64-bit word and XORed with each input, so a
 * matching element becomes zero.  The classic (x - 1) & ~x step then leaves
 * the top bit of an element set where that element was zero, and the final
 * mask keeps only those per-element top bits.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    const uint64_t lsb = dup_const(esz, 1);
    const uint64_t msb = lsb << ((8 << esz) - 1);
    const uint64_t pattern = dup_const(esz, n);
    const uint64_t diff0 = pattern ^ m0;
    const uint64_t diff1 = pattern ^ m1;
    const uint64_t zero0 = (diff0 - lsb) & ~diff0;
    const uint64_t zero1 = (diff1 - lsb) & ~diff1;

    return ((zero0 | zero1) & msb) != 0;
}
3175d0dfa19aSSong Gao 
3176d0dfa19aSSong Gao #define SETANYEQZ(NAME, MO)                                       \
3177f3dfcc8bSSong Gao void HELPER(NAME)(CPULoongArchState *env,                         \
3178f3dfcc8bSSong Gao                   uint32_t oprsz, uint32_t cd, uint32_t vj)       \
3179d0dfa19aSSong Gao {                                                                 \
3180d0dfa19aSSong Gao     VReg *Vj = &(env->fpr[vj].vreg);                              \
3181d0dfa19aSSong Gao                                                                   \
3182d0dfa19aSSong Gao     env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO);     \
3183f3dfcc8bSSong Gao     if (oprsz == 32) {                                            \
3184f3dfcc8bSSong Gao         env->cf[cd & 0x7] = env->cf[cd & 0x7] ||                  \
3185f3dfcc8bSSong Gao                             do_match2(0, Vj->D(2), Vj->D(3), MO); \
3186f3dfcc8bSSong Gao     }                                                             \
3187d0dfa19aSSong Gao }
3188f3dfcc8bSSong Gao 
3189d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_b, MO_8)
3190d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_h, MO_16)
3191d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_w, MO_32)
3192d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_d, MO_64)
3193d0dfa19aSSong Gao 
3194d0dfa19aSSong Gao #define SETALLNEZ(NAME, MO)                                        \
3195f3dfcc8bSSong Gao void HELPER(NAME)(CPULoongArchState *env,                          \
3196f3dfcc8bSSong Gao                   uint32_t oprsz, uint32_t cd, uint32_t vj)        \
3197d0dfa19aSSong Gao {                                                                  \
3198d0dfa19aSSong Gao     VReg *Vj = &(env->fpr[vj].vreg);                               \
3199d0dfa19aSSong Gao                                                                    \
3200d0dfa19aSSong Gao     env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO);      \
3201f3dfcc8bSSong Gao     if (oprsz == 32) {                                             \
3202f3dfcc8bSSong Gao         env->cf[cd & 0x7] = env->cf[cd & 0x7] &&                   \
3203f3dfcc8bSSong Gao                             !do_match2(0, Vj->D(2), Vj->D(3), MO); \
3204f3dfcc8bSSong Gao     }                                                              \
3205d0dfa19aSSong Gao }
3206f3dfcc8bSSong Gao 
3207d0dfa19aSSong Gao SETALLNEZ(vsetallnez_b, MO_8)
3208d0dfa19aSSong Gao SETALLNEZ(vsetallnez_h, MO_16)
3209d0dfa19aSSong Gao SETALLNEZ(vsetallnez_w, MO_32)
3210d0dfa19aSSong Gao SETALLNEZ(vsetallnez_d, MO_64)
3211d5e5563cSSong Gao 
3212df97f338SSong Gao #define XVINSVE0(NAME, E, MASK)                                    \
3213df97f338SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3214df97f338SSong Gao {                                                                  \
3215df97f338SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3216df97f338SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3217df97f338SSong Gao     Vd->E(imm & MASK) = Vj->E(0);                                  \
3218df97f338SSong Gao }
3219df97f338SSong Gao 
3220df97f338SSong Gao XVINSVE0(xvinsve0_w, W, 0x7)
3221df97f338SSong Gao XVINSVE0(xvinsve0_d, D, 0x3)
3222df97f338SSong Gao 
3223df97f338SSong Gao #define XVPICKVE(NAME, E, BIT, MASK)                               \
3224df97f338SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3225df97f338SSong Gao {                                                                  \
3226df97f338SSong Gao     int i;                                                         \
3227df97f338SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3228df97f338SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3229df97f338SSong Gao     int oprsz = simd_oprsz(desc);                                  \
3230df97f338SSong Gao                                                                    \
3231df97f338SSong Gao     Vd->E(0) = Vj->E(imm & MASK);                                  \
3232df97f338SSong Gao     for (i = 1; i < oprsz / (BIT / 8); i++) {                      \
3233df97f338SSong Gao         Vd->E(i) = 0;                                              \
3234df97f338SSong Gao     }                                                              \
3235df97f338SSong Gao }
3236df97f338SSong Gao 
3237df97f338SSong Gao XVPICKVE(xvpickve_w, W, 32, 0x7)
3238df97f338SSong Gao XVPICKVE(xvpickve_d, D, 64, 0x3)
3239df97f338SSong Gao 
3240d5e5563cSSong Gao #define VPACKEV(NAME, BIT, E)                                  \
324104711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3242d5e5563cSSong Gao {                                                              \
3243d5e5563cSSong Gao     int i;                                                     \
3244ad292148SSong Gao     VReg temp = {};                                            \
324504711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
324604711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
324704711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
3248ad292148SSong Gao     int oprsz = simd_oprsz(desc);                              \
3249d5e5563cSSong Gao                                                                \
3250ad292148SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
3251d5e5563cSSong Gao         temp.E(2 * i + 1) = Vj->E(2 * i);                      \
3252d5e5563cSSong Gao         temp.E(2 *i) = Vk->E(2 * i);                           \
3253d5e5563cSSong Gao     }                                                          \
3254d5e5563cSSong Gao     *Vd = temp;                                                \
3255d5e5563cSSong Gao }
3256d5e5563cSSong Gao 
3257d5e5563cSSong Gao VPACKEV(vpackev_b, 16, B)
3258d5e5563cSSong Gao VPACKEV(vpackev_h, 32, H)
3259d5e5563cSSong Gao VPACKEV(vpackev_w, 64, W)
3260d5e5563cSSong Gao VPACKEV(vpackev_d, 128, D)
3261d5e5563cSSong Gao 
3262d5e5563cSSong Gao #define VPACKOD(NAME, BIT, E)                                  \
326304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3264d5e5563cSSong Gao {                                                              \
3265d5e5563cSSong Gao     int i;                                                     \
3266ad292148SSong Gao     VReg temp = {};                                            \
326704711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
326804711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
326904711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
3270ad292148SSong Gao     int oprsz = simd_oprsz(desc);                              \
3271d5e5563cSSong Gao                                                                \
3272ad292148SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                 \
3273d5e5563cSSong Gao         temp.E(2 * i + 1) = Vj->E(2 * i + 1);                  \
3274d5e5563cSSong Gao         temp.E(2 * i) = Vk->E(2 * i + 1);                      \
3275d5e5563cSSong Gao     }                                                          \
3276d5e5563cSSong Gao     *Vd = temp;                                                \
3277d5e5563cSSong Gao }
3278d5e5563cSSong Gao 
3279d5e5563cSSong Gao VPACKOD(vpackod_b, 16, B)
3280d5e5563cSSong Gao VPACKOD(vpackod_h, 32, H)
3281d5e5563cSSong Gao VPACKOD(vpackod_w, 64, W)
3282d5e5563cSSong Gao VPACKOD(vpackod_d, 128, D)
3283d5e5563cSSong Gao 
3284d5e5563cSSong Gao #define VPICKEV(NAME, BIT, E)                                         \
328504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)        \
3286d5e5563cSSong Gao {                                                                     \
3287ad292148SSong Gao     int i, j, ofs;                                                    \
3288ad292148SSong Gao     VReg temp = {};                                                   \
328904711da1SSong Gao     VReg *Vd = (VReg *)vd;                                            \
329004711da1SSong Gao     VReg *Vj = (VReg *)vj;                                            \
329104711da1SSong Gao     VReg *Vk = (VReg *)vk;                                            \
3292ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                     \
3293d5e5563cSSong Gao                                                                       \
3294ad292148SSong Gao     ofs = LSX_LEN / BIT;                                              \
3295ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                \
3296ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                   \
3297ad292148SSong Gao             temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \
3298ad292148SSong Gao             temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i));       \
3299ad292148SSong Gao         }                                                             \
3300d5e5563cSSong Gao     }                                                                 \
3301d5e5563cSSong Gao     *Vd = temp;                                                       \
3302d5e5563cSSong Gao }
3303d5e5563cSSong Gao 
3304d5e5563cSSong Gao VPICKEV(vpickev_b, 16, B)
3305d5e5563cSSong Gao VPICKEV(vpickev_h, 32, H)
3306d5e5563cSSong Gao VPICKEV(vpickev_w, 64, W)
3307d5e5563cSSong Gao VPICKEV(vpickev_d, 128, D)
3308d5e5563cSSong Gao 
3309d5e5563cSSong Gao #define VPICKOD(NAME, BIT, E)                                             \
331004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
3311d5e5563cSSong Gao {                                                                         \
3312ad292148SSong Gao     int i, j, ofs;                                                        \
3313ad292148SSong Gao     VReg temp = {};                                                       \
331404711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
331504711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
331604711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
3317ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                         \
3318d5e5563cSSong Gao                                                                           \
3319ad292148SSong Gao     ofs = LSX_LEN / BIT;                                                  \
3320ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
3321ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                       \
3322ad292148SSong Gao             temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \
3323ad292148SSong Gao             temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1);       \
3324ad292148SSong Gao         }                                                                 \
3325d5e5563cSSong Gao     }                                                                     \
3326d5e5563cSSong Gao     *Vd = temp;                                                           \
3327d5e5563cSSong Gao }
3328d5e5563cSSong Gao 
3329d5e5563cSSong Gao VPICKOD(vpickod_b, 16, B)
3330d5e5563cSSong Gao VPICKOD(vpickod_h, 32, H)
3331d5e5563cSSong Gao VPICKOD(vpickod_w, 64, W)
3332d5e5563cSSong Gao VPICKOD(vpickod_d, 128, D)
3333e93dd431SSong Gao 
3334e93dd431SSong Gao #define VILVL(NAME, BIT, E)                                         \
333504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)      \
3336e93dd431SSong Gao {                                                                   \
3337ad292148SSong Gao     int i, j, ofs;                                                  \
3338ad292148SSong Gao     VReg temp = {};                                                 \
333904711da1SSong Gao     VReg *Vd = (VReg *)vd;                                          \
334004711da1SSong Gao     VReg *Vj = (VReg *)vj;                                          \
334104711da1SSong Gao     VReg *Vk = (VReg *)vk;                                          \
3342ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                   \
3343e93dd431SSong Gao                                                                     \
3344ad292148SSong Gao     ofs = LSX_LEN / BIT;                                            \
3345ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                              \
3346ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                 \
3347ad292148SSong Gao             temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \
3348ad292148SSong Gao             temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i);     \
3349ad292148SSong Gao         }                                                           \
3350e93dd431SSong Gao     }                                                               \
3351e93dd431SSong Gao     *Vd = temp;                                                     \
3352e93dd431SSong Gao }
3353e93dd431SSong Gao 
3354e93dd431SSong Gao VILVL(vilvl_b, 16, B)
3355e93dd431SSong Gao VILVL(vilvl_h, 32, H)
3356e93dd431SSong Gao VILVL(vilvl_w, 64, W)
3357e93dd431SSong Gao VILVL(vilvl_d, 128, D)
3358e93dd431SSong Gao 
3359e93dd431SSong Gao #define VILVH(NAME, BIT, E)                                               \
336004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
3361e93dd431SSong Gao {                                                                         \
3362ad292148SSong Gao     int i, j, ofs;                                                        \
3363ad292148SSong Gao     VReg temp = {};                                                       \
336404711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
336504711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
336604711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
3367ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                         \
3368e93dd431SSong Gao                                                                           \
3369ad292148SSong Gao     ofs = LSX_LEN / BIT;                                                  \
3370ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
3371ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                       \
3372ad292148SSong Gao             temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \
3373ad292148SSong Gao             temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1));     \
3374ad292148SSong Gao         }                                                                 \
3375e93dd431SSong Gao     }                                                                     \
3376e93dd431SSong Gao     *Vd = temp;                                                           \
3377e93dd431SSong Gao }
3378e93dd431SSong Gao 
3379e93dd431SSong Gao VILVH(vilvh_b, 16, B)
3380e93dd431SSong Gao VILVH(vilvh_h, 32, H)
3381e93dd431SSong Gao VILVH(vilvh_w, 64, W)
3382e93dd431SSong Gao VILVH(vilvh_d, 128, D)
3383e93dd431SSong Gao 
/*
 * Source position for destination index i under a 4-way shuffle immediate:
 * the bits of i selected by 0xfc (the base of its group of four) are kept,
 * and the low two bits are replaced by the 2-bit field of imm selected by
 * (i & 3).  Note that i is evaluated twice.
 */
#define SHF_POS(i, imm) \
    (((i) & 0xfc) | (((imm) >> (((i) & 0x03) << 1)) & 0x03))
3385513e88a2SSong Gao 
3386eb48ab22SSong Gao void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
3387e93dd431SSong Gao {
3388513e88a2SSong Gao     int i, j, m;
3389513e88a2SSong Gao     VReg temp = {};
3390eb48ab22SSong Gao     VReg *Vd = (VReg *)vd;
3391eb48ab22SSong Gao     VReg *Vj = (VReg *)vj;
3392eb48ab22SSong Gao     VReg *Vk = (VReg *)vk;
3393eb48ab22SSong Gao     VReg *Va = (VReg *)va;
3394513e88a2SSong Gao     int oprsz = simd_oprsz(desc);
3395e93dd431SSong Gao 
3396e93dd431SSong Gao     m = LSX_LEN / 8;
3397513e88a2SSong Gao     for (i = 0; i < (oprsz / 16) * m; i++) {
3398513e88a2SSong Gao         j = i < m ? 0 : 1;
3399e93dd431SSong Gao         uint64_t k = (uint8_t)Va->B(i) % (2 * m);
3400513e88a2SSong Gao         temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m);
3401e93dd431SSong Gao     }
3402e93dd431SSong Gao     *Vd = temp;
3403e93dd431SSong Gao }
3404e93dd431SSong Gao 
3405e93dd431SSong Gao #define VSHUF(NAME, BIT, E)                                            \
340604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
3407e93dd431SSong Gao {                                                                      \
3408513e88a2SSong Gao     int i, j, m;                                                       \
3409513e88a2SSong Gao     VReg temp = {};                                                    \
341004711da1SSong Gao     VReg *Vd = (VReg *)vd;                                             \
341104711da1SSong Gao     VReg *Vj = (VReg *)vj;                                             \
341204711da1SSong Gao     VReg *Vk = (VReg *)vk;                                             \
3413513e88a2SSong Gao     int oprsz = simd_oprsz(desc);                                      \
3414e93dd431SSong Gao                                                                        \
3415e93dd431SSong Gao     m = LSX_LEN / BIT;                                                 \
3416513e88a2SSong Gao     for (i = 0; i < (oprsz / 16) * m; i++) {                           \
3417513e88a2SSong Gao         j = i < m ? 0 : 1;                                             \
3418e93dd431SSong Gao         uint64_t k  = ((uint8_t)Vd->E(i)) % (2 * m);                   \
3419513e88a2SSong Gao         temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \
3420e93dd431SSong Gao     }                                                                  \
3421e93dd431SSong Gao     *Vd = temp;                                                        \
3422e93dd431SSong Gao }
3423e93dd431SSong Gao 
3424e93dd431SSong Gao VSHUF(vshuf_h, 16, H)
3425e93dd431SSong Gao VSHUF(vshuf_w, 32, W)
3426e93dd431SSong Gao VSHUF(vshuf_d, 64, D)
3427e93dd431SSong Gao 
3428e93dd431SSong Gao #define VSHUF4I(NAME, BIT, E)                                               \
3429329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)          \
3430e93dd431SSong Gao {                                                                           \
3431513e88a2SSong Gao     int i, j, max;                                                          \
3432513e88a2SSong Gao     VReg temp = {};                                                         \
3433329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
3434329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
3435513e88a2SSong Gao     int oprsz = simd_oprsz(desc);                                           \
3436e93dd431SSong Gao                                                                             \
3437513e88a2SSong Gao     max = LSX_LEN / BIT;                                                    \
3438513e88a2SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                               \
3439513e88a2SSong Gao         j = i < max ? 1 : 2;                                                \
3440513e88a2SSong Gao         temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \
3441e93dd431SSong Gao     }                                                                       \
3442e93dd431SSong Gao     *Vd = temp;                                                             \
3443e93dd431SSong Gao }
3444e93dd431SSong Gao 
3445e93dd431SSong Gao VSHUF4I(vshuf4i_b, 8, B)
3446e93dd431SSong Gao VSHUF4I(vshuf4i_h, 16, H)
3447e93dd431SSong Gao VSHUF4I(vshuf4i_w, 32, W)
3448e93dd431SSong Gao 
3449329517d5SSong Gao void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3450e93dd431SSong Gao {
3451513e88a2SSong Gao     int i;
3452513e88a2SSong Gao     VReg temp = {};
3453329517d5SSong Gao     VReg *Vd = (VReg *)vd;
3454329517d5SSong Gao     VReg *Vj = (VReg *)vj;
3455513e88a2SSong Gao     int oprsz = simd_oprsz(desc);
3456e93dd431SSong Gao 
3457513e88a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
3458513e88a2SSong Gao         temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i);
3459513e88a2SSong Gao         temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
3460513e88a2SSong Gao     }
3461513e88a2SSong Gao     *Vd = temp;
3462513e88a2SSong Gao }
3463513e88a2SSong Gao 
3464513e88a2SSong Gao void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
3465513e88a2SSong Gao {
3466513e88a2SSong Gao     int i, m;
3467513e88a2SSong Gao     VReg temp = {};
3468513e88a2SSong Gao     VReg *Vd = (VReg *)vd;
3469513e88a2SSong Gao     VReg *Vj = (VReg *)vj;
3470513e88a2SSong Gao     VReg *Vk = (VReg *)vk;
3471513e88a2SSong Gao 
3472513e88a2SSong Gao     m = LASX_LEN / 32;
3473513e88a2SSong Gao     for (i = 0; i < m ; i++) {
3474513e88a2SSong Gao         uint64_t k = (uint8_t)Vk->W(i) % 8;
3475513e88a2SSong Gao         temp.W(i) = Vj->W(k);
3476513e88a2SSong Gao     }
3477e93dd431SSong Gao     *Vd = temp;
3478e93dd431SSong Gao }
3479e93dd431SSong Gao 
3480329517d5SSong Gao void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3481e93dd431SSong Gao {
3482513e88a2SSong Gao     int i;
3483513e88a2SSong Gao     VReg temp = {};
3484513e88a2SSong Gao     VReg *Vd = (VReg *)vd;
3485513e88a2SSong Gao     VReg *Vj = (VReg *)vj;
3486513e88a2SSong Gao     int oprsz = simd_oprsz(desc);
3487513e88a2SSong Gao 
3488513e88a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
3489513e88a2SSong Gao         temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
3490513e88a2SSong Gao         temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
3491513e88a2SSong Gao         temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
3492513e88a2SSong Gao         temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
3493513e88a2SSong Gao     }
3494513e88a2SSong Gao     *Vd = temp;
3495513e88a2SSong Gao }
3496513e88a2SSong Gao 
3497513e88a2SSong Gao void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3498513e88a2SSong Gao {
3499513e88a2SSong Gao     VReg temp = {};
3500513e88a2SSong Gao     VReg *Vd = (VReg *)vd;
3501513e88a2SSong Gao     VReg *Vj = (VReg *)vj;
3502513e88a2SSong Gao 
3503513e88a2SSong Gao     temp.D(0) = Vj->D(imm & 0x3);
3504513e88a2SSong Gao     temp.D(1) = Vj->D((imm >> 2) & 0x3);
3505513e88a2SSong Gao     temp.D(2) = Vj->D((imm >> 4) & 0x3);
3506513e88a2SSong Gao     temp.D(3) = Vj->D((imm >> 6) & 0x3);
3507513e88a2SSong Gao     *Vd = temp;
3508513e88a2SSong Gao }
3509513e88a2SSong Gao 
3510513e88a2SSong Gao void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3511513e88a2SSong Gao {
3512513e88a2SSong Gao     int i;
3513e93dd431SSong Gao     VReg temp;
3514329517d5SSong Gao     VReg *Vd = (VReg *)vd;
3515329517d5SSong Gao     VReg *Vj = (VReg *)vj;
3516e93dd431SSong Gao 
3517513e88a2SSong Gao     for (i = 0; i < 2; i++, imm >>= 4) {
3518513e88a2SSong Gao         temp.Q(i) = (imm & 2 ? Vd: Vj)->Q(imm & 1);
3519513e88a2SSong Gao     }
3520e93dd431SSong Gao     *Vd = temp;
3521e93dd431SSong Gao }
3522e93dd431SSong Gao 
3523e93dd431SSong Gao #define VEXTRINS(NAME, BIT, E, MASK)                               \
3524329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3525e93dd431SSong Gao {                                                                  \
3526513e88a2SSong Gao     int i, ins, extr, max;                                         \
3527329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3528329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3529513e88a2SSong Gao     int oprsz = simd_oprsz(desc);                                  \
3530e93dd431SSong Gao                                                                    \
3531513e88a2SSong Gao     max = LSX_LEN / BIT;                                           \
3532e93dd431SSong Gao     ins = (imm >> 4) & MASK;                                       \
3533e93dd431SSong Gao     extr = imm & MASK;                                             \
3534513e88a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                             \
3535513e88a2SSong Gao         Vd->E(ins + i * max) = Vj->E(extr + i * max);              \
3536513e88a2SSong Gao     }                                                              \
3537e93dd431SSong Gao }
3538e93dd431SSong Gao 
3539e93dd431SSong Gao VEXTRINS(vextrins_b, 8, B, 0xf)
3540e93dd431SSong Gao VEXTRINS(vextrins_h, 16, H, 0x7)
3541e93dd431SSong Gao VEXTRINS(vextrins_w, 32, W, 0x3)
3542e93dd431SSong Gao VEXTRINS(vextrins_d, 64, D, 0x1)
3543