/* Source: qemu/target/loongarch/tcg/vec_helper.c (revision 84307cd6027c4602913177ff09aeefa4743b7234) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * QEMU LoongArch vector helper functions.
 *
 * Copyright (c) 2022-2023 Loongson Technology Corporation Limited
 */
7c037fbc9SSong Gao 
8c037fbc9SSong Gao #include "qemu/osdep.h"
9c037fbc9SSong Gao #include "cpu.h"
10c037fbc9SSong Gao #include "exec/helper-proto.h"
11aca67472SSong Gao #include "fpu/softfloat.h"
12aca67472SSong Gao #include "internals.h"
13d0dfa19aSSong Gao #include "tcg/tcg.h"
14008a3b16SSong Gao #include "vec.h"
1564cf6b99SSong Gao #include "tcg/tcg-gvec-desc.h"
16c037fbc9SSong Gao 
/*
 * DO_ODD_EVEN - define a widening "odd element of vj, even element of vk"
 * helper.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor used for destination elements.
 * E2:    VReg accessor used for source elements.
 * DO_OP: two-operand macro applied to the widened operands.
 *
 * For each destination index i, source element 2 * i + 1 of vj and source
 * element 2 * i of vk are cast to the destination element type (TD) and
 * combined with DO_OP.  The number of destination elements is the
 * operation size read from desc (used as a byte count) divided by BIT / 8.
 */
#define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP)                        \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)       \
{                                                                    \
    int i;                                                           \
    VReg *Vd = (VReg *)vd;                                           \
    VReg *Vj = (VReg *)vj;                                           \
    VReg *Vk = (VReg *)vk;                                           \
    typedef __typeof(Vd->E1(0)) TD;                                  \
    int oprsz = simd_oprsz(desc);                                    \
                                                                     \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                        \
        Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i)); \
    }                                                                \
}
31c037fbc9SSong Gao 
32c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD)
33c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD)
34c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD)
35c037fbc9SSong Gao 
HELPER(vhaddw_q_d)3604711da1SSong Gao void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
37c037fbc9SSong Gao {
3864cf6b99SSong Gao     int i;
3904711da1SSong Gao     VReg *Vd = (VReg *)vd;
4004711da1SSong Gao     VReg *Vj = (VReg *)vj;
4104711da1SSong Gao     VReg *Vk = (VReg *)vk;
4264cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
43c037fbc9SSong Gao 
4464cf6b99SSong Gao     for (i = 0; i < oprsz / 16 ; i++) {
4564cf6b99SSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
4664cf6b99SSong Gao                               int128_makes64(Vk->D(2 * i)));
4764cf6b99SSong Gao     }
48c037fbc9SSong Gao }
49c037fbc9SSong Gao 
50c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB)
51c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB)
52c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB)
53c037fbc9SSong Gao 
HELPER(vhsubw_q_d)5404711da1SSong Gao void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
55c037fbc9SSong Gao {
5664cf6b99SSong Gao     int i;
5704711da1SSong Gao     VReg *Vd = (VReg *)vd;
5804711da1SSong Gao     VReg *Vj = (VReg *)vj;
5904711da1SSong Gao     VReg *Vk = (VReg *)vk;
6064cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
61c037fbc9SSong Gao 
6264cf6b99SSong Gao     for (i = 0; i < oprsz / 16; i++) {
6364cf6b99SSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
6464cf6b99SSong Gao                               int128_makes64(Vk->D(2 * i)));
6564cf6b99SSong Gao     }
66c037fbc9SSong Gao }
67c037fbc9SSong Gao 
68c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD)
69c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD)
70c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD)
71c037fbc9SSong Gao 
HELPER(vhaddw_qu_du)7204711da1SSong Gao void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
73c037fbc9SSong Gao {
7464cf6b99SSong Gao     int i;
7504711da1SSong Gao     VReg *Vd = (VReg *)vd;
7604711da1SSong Gao     VReg *Vj = (VReg *)vj;
7704711da1SSong Gao     VReg *Vk = (VReg *)vk;
7864cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
79c037fbc9SSong Gao 
8064cf6b99SSong Gao     for (i = 0; i < oprsz / 16; i ++) {
8164cf6b99SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
8264cf6b99SSong Gao                               int128_make64(Vk->UD(2 * i)));
8364cf6b99SSong Gao     }
84c037fbc9SSong Gao }
85c037fbc9SSong Gao 
86c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB)
87c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB)
88c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB)
89c037fbc9SSong Gao 
HELPER(vhsubw_qu_du)9004711da1SSong Gao void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
91c037fbc9SSong Gao {
9264cf6b99SSong Gao     int i;
9304711da1SSong Gao     VReg *Vd = (VReg *)vd;
9404711da1SSong Gao     VReg *Vj = (VReg *)vj;
9504711da1SSong Gao     VReg *Vk = (VReg *)vk;
9664cf6b99SSong Gao     int oprsz = simd_oprsz(desc);
97c037fbc9SSong Gao 
9864cf6b99SSong Gao     for (i = 0; i < oprsz / 16; i++) {
9964cf6b99SSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
10064cf6b99SSong Gao                               int128_make64(Vk->UD(2 * i)));
10164cf6b99SSong Gao     }
102c037fbc9SSong Gao }
1032d5f950cSSong Gao 
/*
 * DO_EVEN - define a widening helper operating on even source elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor used for destination elements.
 * E2:    VReg accessor used for source elements.
 * DO_OP: two-operand macro applied to the widened operands.
 *
 * For each destination index i, source element 2 * i of vj and of vk are
 * cast to the destination element type (TD) and combined with DO_OP.  The
 * number of destination elements is the operation size read from desc
 * (used as a byte count) divided by BIT / 8.
 */
#define DO_EVEN(NAME, BIT, E1, E2, DO_OP)                        \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)   \
{                                                                \
    int i;                                                       \
    VReg *Vd = (VReg *)vd;                                       \
    VReg *Vj = (VReg *)vj;                                       \
    VReg *Vk = (VReg *)vk;                                       \
    typedef __typeof(Vd->E1(0)) TD;                              \
    int oprsz = simd_oprsz(desc);                                \
                                                                 \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                    \
        Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
    }                                                            \
}
1182d5f950cSSong Gao 
/*
 * DO_ODD - define a widening helper operating on odd source elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor used for destination elements.
 * E2:    VReg accessor used for source elements.
 * DO_OP: two-operand macro applied to the widened operands.
 *
 * For each destination index i, source element 2 * i + 1 of vj and of vk
 * are cast to the destination element type (TD) and combined with DO_OP.
 * The number of destination elements is the operation size read from desc
 * (used as a byte count) divided by BIT / 8.
 */
#define DO_ODD(NAME, BIT, E1, E2, DO_OP)                                 \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)           \
{                                                                        \
    int i;                                                               \
    VReg *Vd = (VReg *)vd;                                               \
    VReg *Vj = (VReg *)vj;                                               \
    VReg *Vk = (VReg *)vk;                                               \
    typedef __typeof(Vd->E1(0)) TD;                                      \
    int oprsz = simd_oprsz(desc);                                        \
                                                                         \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                            \
        Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i + 1)); \
    }                                                                    \
}
1332d5f950cSSong Gao 
HELPER(vaddwev_q_d)13485995f07SSong Gao void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1352d5f950cSSong Gao {
13685995f07SSong Gao     int i;
1372d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1382d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1392d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
14085995f07SSong Gao     int oprsz = simd_oprsz(desc);
1412d5f950cSSong Gao 
14285995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
14385995f07SSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)),
14485995f07SSong Gao                               int128_makes64(Vk->D(2 * i)));
14585995f07SSong Gao     }
1462d5f950cSSong Gao }
1472d5f950cSSong Gao 
1482d5f950cSSong Gao DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD)
1492d5f950cSSong Gao DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD)
1502d5f950cSSong Gao DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD)
1512d5f950cSSong Gao 
HELPER(vaddwod_q_d)15285995f07SSong Gao void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1532d5f950cSSong Gao {
15485995f07SSong Gao     int i;
1552d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1562d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1572d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
15885995f07SSong Gao     int oprsz = simd_oprsz(desc);
1592d5f950cSSong Gao 
16085995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
16185995f07SSong Gao         Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)),
16285995f07SSong Gao                               int128_makes64(Vk->D(2 * i +1)));
16385995f07SSong Gao     }
1642d5f950cSSong Gao }
1652d5f950cSSong Gao 
1662d5f950cSSong Gao DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD)
1672d5f950cSSong Gao DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD)
1682d5f950cSSong Gao DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD)
1692d5f950cSSong Gao 
HELPER(vsubwev_q_d)17085995f07SSong Gao void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1712d5f950cSSong Gao {
17285995f07SSong Gao     int i;
1732d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1742d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1752d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
17685995f07SSong Gao     int oprsz = simd_oprsz(desc);
1772d5f950cSSong Gao 
17885995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
17985995f07SSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)),
18085995f07SSong Gao                               int128_makes64(Vk->D(2 * i)));
18185995f07SSong Gao     }
1822d5f950cSSong Gao }
1832d5f950cSSong Gao 
1842d5f950cSSong Gao DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB)
1852d5f950cSSong Gao DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB)
1862d5f950cSSong Gao DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB)
1872d5f950cSSong Gao 
HELPER(vsubwod_q_d)18885995f07SSong Gao void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
1892d5f950cSSong Gao {
19085995f07SSong Gao     int i;
1912d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
1922d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
1932d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
19485995f07SSong Gao     int oprsz = simd_oprsz(desc);
1952d5f950cSSong Gao 
19685995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
19785995f07SSong Gao         Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
19885995f07SSong Gao                               int128_makes64(Vk->D(2 * i + 1)));
19985995f07SSong Gao     }
2002d5f950cSSong Gao }
2012d5f950cSSong Gao 
2022d5f950cSSong Gao DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB)
2032d5f950cSSong Gao DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB)
2042d5f950cSSong Gao DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB)
2052d5f950cSSong Gao 
HELPER(vaddwev_q_du)20685995f07SSong Gao void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2072d5f950cSSong Gao {
20885995f07SSong Gao     int i;
2092d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2102d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2112d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
21285995f07SSong Gao     int oprsz = simd_oprsz(desc);
2132d5f950cSSong Gao 
21485995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
21585995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
21685995f07SSong Gao                               int128_make64(Vk->UD(2 * i)));
21785995f07SSong Gao     }
2182d5f950cSSong Gao }
2192d5f950cSSong Gao 
2202d5f950cSSong Gao DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD)
2212d5f950cSSong Gao DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD)
2222d5f950cSSong Gao DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD)
2232d5f950cSSong Gao 
HELPER(vaddwod_q_du)22485995f07SSong Gao void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2252d5f950cSSong Gao {
22685995f07SSong Gao     int i;
2272d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2282d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2292d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
23085995f07SSong Gao     int oprsz = simd_oprsz(desc);
2312d5f950cSSong Gao 
23285995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
23385995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
23485995f07SSong Gao                               int128_make64(Vk->UD(2 * i + 1)));
23585995f07SSong Gao     }
2362d5f950cSSong Gao }
2372d5f950cSSong Gao 
2382d5f950cSSong Gao DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD)
2392d5f950cSSong Gao DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD)
2402d5f950cSSong Gao DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD)
2412d5f950cSSong Gao 
HELPER(vsubwev_q_du)24285995f07SSong Gao void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2432d5f950cSSong Gao {
24485995f07SSong Gao     int i;
2452d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2462d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2472d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
24885995f07SSong Gao     int oprsz = simd_oprsz(desc);
2492d5f950cSSong Gao 
25085995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
25185995f07SSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)),
25285995f07SSong Gao                               int128_make64(Vk->UD(2 * i)));
25385995f07SSong Gao     }
2542d5f950cSSong Gao }
2552d5f950cSSong Gao 
2562d5f950cSSong Gao DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB)
2572d5f950cSSong Gao DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB)
2582d5f950cSSong Gao DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB)
2592d5f950cSSong Gao 
HELPER(vsubwod_q_du)26085995f07SSong Gao void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
2612d5f950cSSong Gao {
26285995f07SSong Gao     int i;
2632d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
2642d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
2652d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
26685995f07SSong Gao     int oprsz = simd_oprsz(desc);
2672d5f950cSSong Gao 
26885995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
26985995f07SSong Gao         Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
27085995f07SSong Gao                               int128_make64(Vk->UD(2 * i + 1)));
27185995f07SSong Gao     }
2722d5f950cSSong Gao }
2732d5f950cSSong Gao 
2742d5f950cSSong Gao DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB)
2752d5f950cSSong Gao DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB)
2762d5f950cSSong Gao DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB)
2772d5f950cSSong Gao 
/*
 * DO_EVEN_U_S - define a mixed-accessor widening helper on even elements.
 *
 * NAME:     helper name handed to HELPER().
 * BIT:      width in bits of one destination element.
 * ES1/EU1:  destination-width accessors; their element types become the
 *           cast types TDS and TDU.  The result is stored through ES1.
 * ES2/EU2:  source accessors; vj is read through EU2, vk through ES2.
 * DO_OP:    two-operand macro applied to the widened operands.
 *
 * For each destination index i, element 2 * i of vj (cast to TDU) and
 * element 2 * i of vk (cast to TDS) are combined with DO_OP.  The number
 * of destination elements is the operation size read from desc (used as a
 * byte count) divided by BIT / 8.
 */
#define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)             \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)        \
{                                                                     \
    int i;                                                            \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg *Vk = (VReg *)vk;                                            \
    typedef __typeof(Vd->ES1(0)) TDS;                                 \
    typedef __typeof(Vd->EU1(0)) TDU;                                 \
    int oprsz = simd_oprsz(desc);                                     \
                                                                      \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                         \
        Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i), (TDS)Vk->ES2(2 * i)); \
    }                                                                 \
}
2932d5f950cSSong Gao 
/*
 * DO_ODD_U_S - define a mixed-accessor widening helper on odd elements.
 *
 * NAME:     helper name handed to HELPER().
 * BIT:      width in bits of one destination element.
 * ES1/EU1:  destination-width accessors; their element types become the
 *           cast types TDS and TDU.  The result is stored through ES1.
 * ES2/EU2:  source accessors; vj is read through EU2, vk through ES2.
 * DO_OP:    two-operand macro applied to the widened operands.
 *
 * For each destination index i, element 2 * i + 1 of vj (cast to TDU) and
 * element 2 * i + 1 of vk (cast to TDS) are combined with DO_OP.  The
 * number of destination elements is the operation size read from desc
 * (used as a byte count) divided by BIT / 8.
 */
#define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)                      \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                \
{                                                                             \
    int i;                                                                    \
    VReg *Vd = (VReg *)vd;                                                    \
    VReg *Vj = (VReg *)vj;                                                    \
    VReg *Vk = (VReg *)vk;                                                    \
    typedef __typeof(Vd->ES1(0)) TDS;                                         \
    typedef __typeof(Vd->EU1(0)) TDU;                                         \
    int oprsz = simd_oprsz(desc);                                             \
                                                                              \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                                 \
        Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i + 1), (TDS)Vk->ES2(2 * i + 1)); \
    }                                                                         \
}
3092d5f950cSSong Gao 
HELPER(vaddwev_q_du_d)31085995f07SSong Gao void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
3112d5f950cSSong Gao {
31285995f07SSong Gao     int i;
3132d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
3142d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
3152d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
31685995f07SSong Gao     int oprsz = simd_oprsz(desc);
3172d5f950cSSong Gao 
31885995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
31985995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
32085995f07SSong Gao                               int128_makes64(Vk->D(2 * i)));
32185995f07SSong Gao     }
3222d5f950cSSong Gao }
3232d5f950cSSong Gao 
3242d5f950cSSong Gao DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD)
3252d5f950cSSong Gao DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD)
3262d5f950cSSong Gao DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD)
3272d5f950cSSong Gao 
HELPER(vaddwod_q_du_d)32885995f07SSong Gao void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
3292d5f950cSSong Gao {
33085995f07SSong Gao     int i;
3312d5f950cSSong Gao     VReg *Vd = (VReg *)vd;
3322d5f950cSSong Gao     VReg *Vj = (VReg *)vj;
3332d5f950cSSong Gao     VReg *Vk = (VReg *)vk;
33485995f07SSong Gao     int oprsz = simd_oprsz(desc);
3352d5f950cSSong Gao 
33685995f07SSong Gao     for (i = 0; i < oprsz / 16; i++) {
33785995f07SSong Gao         Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
33885995f07SSong Gao                               int128_makes64(Vk->D(2 * i + 1)));
33985995f07SSong Gao     }
3402d5f950cSSong Gao }
3412d5f950cSSong Gao 
3422d5f950cSSong Gao DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD)
3432d5f950cSSong Gao DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD)
3442d5f950cSSong Gao DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD)
34539e9b0a7SSong Gao 
/*
 * DO_3OP - define an element-wise three-operand helper.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element.
 * E:     VReg accessor used for all three registers.
 * DO_OP: two-operand macro producing the result element.
 *
 * For each index i, Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)).  The number of
 * elements is the operation size read from desc (used as a byte count)
 * divided by BIT / 8.
 */
#define DO_3OP(NAME, BIT, E, DO_OP)                            \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    int i;                                                     \
    VReg *Vd = (VReg *)vd;                                     \
    VReg *Vj = (VReg *)vj;                                     \
    VReg *Vk = (VReg *)vk;                                     \
    int oprsz = simd_oprsz(desc);                              \
                                                               \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
        Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i));                  \
    }                                                          \
}
35939e9b0a7SSong Gao 
36039e9b0a7SSong Gao DO_3OP(vavg_b, 8, B, DO_VAVG)
36139e9b0a7SSong Gao DO_3OP(vavg_h, 16, H, DO_VAVG)
36239e9b0a7SSong Gao DO_3OP(vavg_w, 32, W, DO_VAVG)
36339e9b0a7SSong Gao DO_3OP(vavg_d, 64, D, DO_VAVG)
36439e9b0a7SSong Gao DO_3OP(vavgr_b, 8, B, DO_VAVGR)
36539e9b0a7SSong Gao DO_3OP(vavgr_h, 16, H, DO_VAVGR)
36639e9b0a7SSong Gao DO_3OP(vavgr_w, 32, W, DO_VAVGR)
36739e9b0a7SSong Gao DO_3OP(vavgr_d, 64, D, DO_VAVGR)
36839e9b0a7SSong Gao DO_3OP(vavg_bu, 8, UB, DO_VAVG)
36939e9b0a7SSong Gao DO_3OP(vavg_hu, 16, UH, DO_VAVG)
37039e9b0a7SSong Gao DO_3OP(vavg_wu, 32, UW, DO_VAVG)
37139e9b0a7SSong Gao DO_3OP(vavg_du, 64, UD, DO_VAVG)
37239e9b0a7SSong Gao DO_3OP(vavgr_bu, 8, UB, DO_VAVGR)
37339e9b0a7SSong Gao DO_3OP(vavgr_hu, 16, UH, DO_VAVGR)
37439e9b0a7SSong Gao DO_3OP(vavgr_wu, 32, UW, DO_VAVGR)
37539e9b0a7SSong Gao DO_3OP(vavgr_du, 64, UD, DO_VAVGR)
37649725659SSong Gao 
37749725659SSong Gao DO_3OP(vabsd_b, 8, B, DO_VABSD)
37849725659SSong Gao DO_3OP(vabsd_h, 16, H, DO_VABSD)
37949725659SSong Gao DO_3OP(vabsd_w, 32, W, DO_VABSD)
38049725659SSong Gao DO_3OP(vabsd_d, 64, D, DO_VABSD)
38149725659SSong Gao DO_3OP(vabsd_bu, 8, UB, DO_VABSD)
38249725659SSong Gao DO_3OP(vabsd_hu, 16, UH, DO_VABSD)
38349725659SSong Gao DO_3OP(vabsd_wu, 32, UW, DO_VABSD)
38449725659SSong Gao DO_3OP(vabsd_du, 64, UD, DO_VABSD)
385af448cb3SSong Gao 
/*
 * DO_VADDA - define an element-wise "sum of DO_VABS values" helper.
 *
 * NAME: helper name handed to HELPER().
 * BIT:  width in bits of one element.
 * E:    VReg accessor used for all three registers.
 *
 * For each index i, Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)).  The
 * number of elements is the operation size read from desc (used as a byte
 * count) divided by BIT / 8.
 */
#define DO_VADDA(NAME, BIT, E)                                 \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    int i;                                                     \
    VReg *Vd = (VReg *)vd;                                     \
    VReg *Vj = (VReg *)vj;                                     \
    VReg *Vk = (VReg *)vk;                                     \
    int oprsz = simd_oprsz(desc);                              \
                                                               \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
        Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i));      \
    }                                                          \
}
399af448cb3SSong Gao 
40027f5485dSSong Gao DO_VADDA(vadda_b, 8, B)
40127f5485dSSong Gao DO_VADDA(vadda_h, 16, H)
40227f5485dSSong Gao DO_VADDA(vadda_w, 32, W)
40327f5485dSSong Gao DO_VADDA(vadda_d, 64, D)
4049ab29520SSong Gao 
/*
 * VMINMAXI - define an element-wise min/max-with-immediate helper.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element.
 * E:     VReg accessor used for source and destination.
 * DO_OP: two-operand macro producing the result element.
 *
 * The 64-bit imm argument is cast to the element type (TD) before use, so
 * for each index i, Vd->E(i) = DO_OP(Vj->E(i), (TD)imm).  The number of
 * elements is the operation size read from desc (used as a byte count)
 * divided by BIT / 8.
 */
#define VMINMAXI(NAME, BIT, E, DO_OP)                              \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{                                                                  \
    int i;                                                         \
    VReg *Vd = (VReg *)vd;                                         \
    VReg *Vj = (VReg *)vj;                                         \
    typedef __typeof(Vd->E(0)) TD;                                 \
    int oprsz = simd_oprsz(desc);                                  \
                                                                   \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
        Vd->E(i) = DO_OP(Vj->E(i), (TD)imm);                       \
    }                                                              \
}
4189ab29520SSong Gao 
4199ab29520SSong Gao VMINMAXI(vmini_b, 8, B, DO_MIN)
4209ab29520SSong Gao VMINMAXI(vmini_h, 16, H, DO_MIN)
4219ab29520SSong Gao VMINMAXI(vmini_w, 32, W, DO_MIN)
4229ab29520SSong Gao VMINMAXI(vmini_d, 64, D, DO_MIN)
4239ab29520SSong Gao VMINMAXI(vmaxi_b, 8, B, DO_MAX)
4249ab29520SSong Gao VMINMAXI(vmaxi_h, 16, H, DO_MAX)
4259ab29520SSong Gao VMINMAXI(vmaxi_w, 32, W, DO_MAX)
4269ab29520SSong Gao VMINMAXI(vmaxi_d, 64, D, DO_MAX)
4279ab29520SSong Gao VMINMAXI(vmini_bu, 8, UB, DO_MIN)
4289ab29520SSong Gao VMINMAXI(vmini_hu, 16, UH, DO_MIN)
4299ab29520SSong Gao VMINMAXI(vmini_wu, 32, UW, DO_MIN)
4309ab29520SSong Gao VMINMAXI(vmini_du, 64, UD, DO_MIN)
4319ab29520SSong Gao VMINMAXI(vmaxi_bu, 8, UB, DO_MAX)
4329ab29520SSong Gao VMINMAXI(vmaxi_hu, 16, UH, DO_MAX)
4339ab29520SSong Gao VMINMAXI(vmaxi_wu, 32, UW, DO_MAX)
4349ab29520SSong Gao VMINMAXI(vmaxi_du, 64, UD, DO_MAX)
435cd1c49adSSong Gao 
/*
 * DO_VMUH - define an element-wise "high part of product" helper.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element; also the right-shift amount.
 * E1:    accessor whose element type (T) is used for the intermediate
 *        product.
 * E2:    VReg accessor used for source and destination elements.
 * DO_OP: accepted for call-site symmetry; not used by the body.
 *
 * For each index i, both source elements are cast to T, multiplied, and
 * the product shifted right by BIT is stored in Vd->E2(i).  The number of
 * elements is the operation size read from desc (used as a byte count)
 * divided by BIT / 8.
 */
#define DO_VMUH(NAME, BIT, E1, E2, DO_OP)                      \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    int i;                                                     \
    VReg *Vd = (VReg *)vd;                                     \
    VReg *Vj = (VReg *)vj;                                     \
    VReg *Vk = (VReg *)vk;                                     \
    typedef __typeof(Vd->E1(0)) T;                             \
    int oprsz = simd_oprsz(desc);                              \
                                                               \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
        Vd->E2(i) = ((T)Vj->E2(i)) * ((T)Vk->E2(i)) >> BIT;    \
    }                                                          \
}
450cd1c49adSSong Gao 
HELPER(vmuh_d)451342dc1cfSSong Gao void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc)
452cd1c49adSSong Gao {
453342dc1cfSSong Gao     int i;
454342dc1cfSSong Gao     uint64_t l, h;
455cd1c49adSSong Gao     VReg *Vd = (VReg *)vd;
456cd1c49adSSong Gao     VReg *Vj = (VReg *)vj;
457cd1c49adSSong Gao     VReg *Vk = (VReg *)vk;
458342dc1cfSSong Gao     int oprsz = simd_oprsz(desc);
459cd1c49adSSong Gao 
460342dc1cfSSong Gao     for (i = 0; i < oprsz / 8; i++) {
461342dc1cfSSong Gao         muls64(&l, &h, Vj->D(i), Vk->D(i));
462342dc1cfSSong Gao         Vd->D(i) = h;
463342dc1cfSSong Gao     }
464cd1c49adSSong Gao }
465cd1c49adSSong Gao 
466cd1c49adSSong Gao DO_VMUH(vmuh_b, 8, H, B, DO_MUH)
467cd1c49adSSong Gao DO_VMUH(vmuh_h, 16, W, H, DO_MUH)
468cd1c49adSSong Gao DO_VMUH(vmuh_w, 32, D, W, DO_MUH)
469cd1c49adSSong Gao 
HELPER(vmuh_du)470342dc1cfSSong Gao void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc)
471cd1c49adSSong Gao {
472342dc1cfSSong Gao     int i;
473342dc1cfSSong Gao     uint64_t l, h;
474cd1c49adSSong Gao     VReg *Vd = (VReg *)vd;
475cd1c49adSSong Gao     VReg *Vj = (VReg *)vj;
476cd1c49adSSong Gao     VReg *Vk = (VReg *)vk;
477342dc1cfSSong Gao     int oprsz = simd_oprsz(desc);
478cd1c49adSSong Gao 
479342dc1cfSSong Gao     for (i = 0; i < oprsz / 8; i++) {
480342dc1cfSSong Gao         mulu64(&l, &h, Vj->D(i), Vk->D(i));
481342dc1cfSSong Gao         Vd->D(i) = h;
482342dc1cfSSong Gao     }
483cd1c49adSSong Gao }
484cd1c49adSSong Gao 
485cd1c49adSSong Gao DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH)
486cd1c49adSSong Gao DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH)
487cd1c49adSSong Gao DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH)
488cd1c49adSSong Gao 
489cd1c49adSSong Gao DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL)
490cd1c49adSSong Gao DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL)
491cd1c49adSSong Gao DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL)
492cd1c49adSSong Gao 
493cd1c49adSSong Gao DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL)
494cd1c49adSSong Gao DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL)
495cd1c49adSSong Gao DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL)
496cd1c49adSSong Gao 
497cd1c49adSSong Gao DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL)
498cd1c49adSSong Gao DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL)
499cd1c49adSSong Gao DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL)
500cd1c49adSSong Gao 
501cd1c49adSSong Gao DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL)
502cd1c49adSSong Gao DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL)
503cd1c49adSSong Gao DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL)
504cd1c49adSSong Gao 
505cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
506cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
507cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
508cd1c49adSSong Gao 
509cd1c49adSSong Gao DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
510cd1c49adSSong Gao DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
511cd1c49adSSong Gao DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
512d3aec65bSSong Gao 
/*
 * VMADDSUB - define an element-wise accumulate helper.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one element.
 * E:     VReg accessor used for all three registers.
 * DO_OP: three-operand macro; receives the current destination element
 *        followed by the two source elements.
 *
 * For each index i, Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i), Vk->E(i)), so the
 * destination is read as well as written.  The number of elements is the
 * operation size read from desc (used as a byte count) divided by BIT / 8.
 */
#define VMADDSUB(NAME, BIT, E, DO_OP)                          \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{                                                              \
    int i;                                                     \
    VReg *Vd = (VReg *)vd;                                     \
    VReg *Vj = (VReg *)vj;                                     \
    VReg *Vk = (VReg *)vk;                                     \
    int oprsz = simd_oprsz(desc);                              \
                                                               \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
        Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i), Vk->E(i));        \
    }                                                          \
}
526d3aec65bSSong Gao 
527d3aec65bSSong Gao VMADDSUB(vmadd_b, 8, B, DO_MADD)
528d3aec65bSSong Gao VMADDSUB(vmadd_h, 16, H, DO_MADD)
529d3aec65bSSong Gao VMADDSUB(vmadd_w, 32, W, DO_MADD)
530d3aec65bSSong Gao VMADDSUB(vmadd_d, 64, D, DO_MADD)
531d3aec65bSSong Gao VMADDSUB(vmsub_b, 8, B, DO_MSUB)
532d3aec65bSSong Gao VMADDSUB(vmsub_h, 16, H, DO_MSUB)
533d3aec65bSSong Gao VMADDSUB(vmsub_w, 32, W, DO_MSUB)
534d3aec65bSSong Gao VMADDSUB(vmsub_d, 64, D, DO_MSUB)
535d3aec65bSSong Gao 
/*
 * VMADDWEV - define a widening accumulate helper on even source elements.
 *
 * NAME:  helper name handed to HELPER().
 * BIT:   width in bits of one destination element.
 * E1:    VReg accessor used for destination elements.
 * E2:    VReg accessor used for source elements.
 * DO_OP: two-operand macro applied to the widened operands.
 *
 * For each destination index i, element 2 * i of vj and of vk are cast to
 * the destination element type (TD), combined with DO_OP, and the result
 * is added to the existing Vd->E1(i).  The number of destination elements
 * is the operation size read from desc (used as a byte count) divided by
 * BIT / 8.
 */
#define VMADDWEV(NAME, BIT, E1, E2, DO_OP)                        \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)    \
{                                                                 \
    int i;                                                        \
    VReg *Vd = (VReg *)vd;                                        \
    VReg *Vj = (VReg *)vj;                                        \
    VReg *Vk = (VReg *)vk;                                        \
    typedef __typeof(Vd->E1(0)) TD;                               \
    int oprsz = simd_oprsz(desc);                                 \
                                                                  \
    for (i = 0; i < oprsz / (BIT / 8); i++) {                     \
        Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
    }                                                             \
}
550d3aec65bSSong Gao 
551d3aec65bSSong Gao VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL)
552d3aec65bSSong Gao VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL)
553d3aec65bSSong Gao VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL)
554d3aec65bSSong Gao VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL)
555d3aec65bSSong Gao VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL)
556d3aec65bSSong Gao VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL)
557d3aec65bSSong Gao 
558d3aec65bSSong Gao #define VMADDWOD(NAME, BIT, E1, E2, DO_OP)                     \
5593f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
560d3aec65bSSong Gao {                                                              \
561d3aec65bSSong Gao     int i;                                                     \
562d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
563d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
564d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
565d3aec65bSSong Gao     typedef __typeof(Vd->E1(0)) TD;                            \
5663f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
567d3aec65bSSong Gao                                                                \
5683f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
569d3aec65bSSong Gao         Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1),              \
570d3aec65bSSong Gao                            (TD)Vk->E2(2 * i + 1));             \
571d3aec65bSSong Gao     }                                                          \
572d3aec65bSSong Gao }
573d3aec65bSSong Gao 
574d3aec65bSSong Gao VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL)
575d3aec65bSSong Gao VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL)
576d3aec65bSSong Gao VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL)
577d3aec65bSSong Gao VMADDWOD(vmaddwod_h_bu, 16,  UH, UB, DO_MUL)
578d3aec65bSSong Gao VMADDWOD(vmaddwod_w_hu, 32,  UW, UH, DO_MUL)
579d3aec65bSSong Gao VMADDWOD(vmaddwod_d_wu, 64,  UD, UW, DO_MUL)
580d3aec65bSSong Gao 
581d3aec65bSSong Gao #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)     \
5823f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
583d3aec65bSSong Gao {                                                              \
584d3aec65bSSong Gao     int i;                                                     \
585d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
586d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
587d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
588d3aec65bSSong Gao     typedef __typeof(Vd->ES1(0)) TS1;                          \
589d3aec65bSSong Gao     typedef __typeof(Vd->EU1(0)) TU1;                          \
5903f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
591d3aec65bSSong Gao                                                                \
5923f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
593d3aec65bSSong Gao         Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i),               \
594d3aec65bSSong Gao                             (TS1)Vk->ES2(2 * i));              \
595d3aec65bSSong Gao     }                                                          \
596d3aec65bSSong Gao }
597d3aec65bSSong Gao 
598d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
599d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
600d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
601d3aec65bSSong Gao 
602d3aec65bSSong Gao #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)     \
6033f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
604d3aec65bSSong Gao {                                                              \
605d3aec65bSSong Gao     int i;                                                     \
606d3aec65bSSong Gao     VReg *Vd = (VReg *)vd;                                     \
607d3aec65bSSong Gao     VReg *Vj = (VReg *)vj;                                     \
608d3aec65bSSong Gao     VReg *Vk = (VReg *)vk;                                     \
609d3aec65bSSong Gao     typedef __typeof(Vd->ES1(0)) TS1;                          \
610d3aec65bSSong Gao     typedef __typeof(Vd->EU1(0)) TU1;                          \
6113f450c17SSong Gao     int oprsz = simd_oprsz(desc);                              \
612d3aec65bSSong Gao                                                                \
6133f450c17SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
614d3aec65bSSong Gao         Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1),           \
615d3aec65bSSong Gao                             (TS1)Vk->ES2(2 * i + 1));          \
616d3aec65bSSong Gao     }                                                          \
617d3aec65bSSong Gao }
618d3aec65bSSong Gao 
619d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
620d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
621d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
6224cc4c0f7SSong Gao 
6234cc4c0f7SSong Gao #define VDIV(NAME, BIT, E, DO_OP)                              \
62404711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
6254cc4c0f7SSong Gao {                                                              \
6264cc4c0f7SSong Gao     int i;                                                     \
62704711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
62804711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
62904711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
630abb693deSSong Gao     int oprsz = simd_oprsz(desc);                              \
631abb693deSSong Gao                                                                \
632abb693deSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
6334cc4c0f7SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i));                  \
6344cc4c0f7SSong Gao     }                                                          \
6354cc4c0f7SSong Gao }
6364cc4c0f7SSong Gao 
6374cc4c0f7SSong Gao VDIV(vdiv_b, 8, B, DO_DIV)
6384cc4c0f7SSong Gao VDIV(vdiv_h, 16, H, DO_DIV)
6394cc4c0f7SSong Gao VDIV(vdiv_w, 32, W, DO_DIV)
6404cc4c0f7SSong Gao VDIV(vdiv_d, 64, D, DO_DIV)
6414cc4c0f7SSong Gao VDIV(vdiv_bu, 8, UB, DO_DIVU)
6424cc4c0f7SSong Gao VDIV(vdiv_hu, 16, UH, DO_DIVU)
6434cc4c0f7SSong Gao VDIV(vdiv_wu, 32, UW, DO_DIVU)
6444cc4c0f7SSong Gao VDIV(vdiv_du, 64, UD, DO_DIVU)
6454cc4c0f7SSong Gao VDIV(vmod_b, 8, B, DO_REM)
6464cc4c0f7SSong Gao VDIV(vmod_h, 16, H, DO_REM)
6474cc4c0f7SSong Gao VDIV(vmod_w, 32, W, DO_REM)
6484cc4c0f7SSong Gao VDIV(vmod_d, 64, D, DO_REM)
6494cc4c0f7SSong Gao VDIV(vmod_bu, 8, UB, DO_REMU)
6504cc4c0f7SSong Gao VDIV(vmod_hu, 16, UH, DO_REMU)
6514cc4c0f7SSong Gao VDIV(vmod_wu, 32, UW, DO_REMU)
6524cc4c0f7SSong Gao VDIV(vmod_du, 64, UD, DO_REMU)
653cbe44190SSong Gao 
654cbe44190SSong Gao #define VSAT_S(NAME, BIT, E)                                       \
655e5c7f031SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
656cbe44190SSong Gao {                                                                  \
657cbe44190SSong Gao     int i;                                                         \
658cbe44190SSong Gao     VReg *Vd = (VReg *)vd;                                         \
659cbe44190SSong Gao     VReg *Vj = (VReg *)vj;                                         \
660cbe44190SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
661e5c7f031SSong Gao     int oprsz = simd_oprsz(desc);                                  \
662cbe44190SSong Gao                                                                    \
663e5c7f031SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
664cbe44190SSong Gao         Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max :                  \
665cbe44190SSong Gao                    Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i);       \
666cbe44190SSong Gao     }                                                              \
667cbe44190SSong Gao }
668cbe44190SSong Gao 
669cbe44190SSong Gao VSAT_S(vsat_b, 8, B)
670cbe44190SSong Gao VSAT_S(vsat_h, 16, H)
671cbe44190SSong Gao VSAT_S(vsat_w, 32, W)
672cbe44190SSong Gao VSAT_S(vsat_d, 64, D)
673cbe44190SSong Gao 
674cbe44190SSong Gao #define VSAT_U(NAME, BIT, E)                                       \
675e5c7f031SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
676cbe44190SSong Gao {                                                                  \
677cbe44190SSong Gao     int i;                                                         \
678cbe44190SSong Gao     VReg *Vd = (VReg *)vd;                                         \
679cbe44190SSong Gao     VReg *Vj = (VReg *)vj;                                         \
680cbe44190SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
681e5c7f031SSong Gao     int oprsz = simd_oprsz(desc);                                  \
682cbe44190SSong Gao                                                                    \
683e5c7f031SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
684cbe44190SSong Gao         Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i);        \
685cbe44190SSong Gao     }                                                              \
686cbe44190SSong Gao }
687cbe44190SSong Gao 
688cbe44190SSong Gao VSAT_U(vsat_bu, 8, UB)
689cbe44190SSong Gao VSAT_U(vsat_hu, 16, UH)
690cbe44190SSong Gao VSAT_U(vsat_wu, 32, UW)
691cbe44190SSong Gao VSAT_U(vsat_du, 64, UD)
6923734ad93SSong Gao 
/*
 * Extend the second half of each 16-byte lane.
 *
 * With ofs = LSX_LEN / BIT, each lane of the source is treated as 2 * ofs
 * elements (accessor E2); the elements at positions ofs .. 2 * ofs - 1 of
 * the lane are converted, by plain assignment, into the lane's ofs
 * destination elements (accessor E1).
 */
#define VEXTH(NAME, BIT, E1, E2)                                 \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc)             \
{                                                                \
    VReg *Vd = (VReg *)vd;                                       \
    VReg *Vj = (VReg *)vj;                                       \
    const int ofs = LSX_LEN / BIT;                               \
    int nlanes = simd_oprsz(desc) / 16;                          \
    int lane, j;                                                 \
                                                                 \
    for (lane = 0; lane < nlanes; lane++) {                      \
        int dst = lane * ofs;                                    \
        int src = ofs + ofs * 2 * lane;                          \
                                                                 \
        for (j = 0; j < ofs; j++) {                              \
            Vd->E1(dst + j) = Vj->E2(src + j);                   \
        }                                                        \
    }                                                            \
}
7083734ad93SSong Gao 
HELPER(vexth_q_d)709ff27e335SSong Gao void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc)
7103734ad93SSong Gao {
711f0db0bebSSong Gao     int i;
712ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
713ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
714f0db0bebSSong Gao     int oprsz = simd_oprsz(desc);
7153734ad93SSong Gao 
716f0db0bebSSong Gao     for (i = 0; i < oprsz / 16; i++) {
717f0db0bebSSong Gao         Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1));
718f0db0bebSSong Gao     }
7193734ad93SSong Gao }
7203734ad93SSong Gao 
HELPER(vexth_qu_du)721ff27e335SSong Gao void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc)
7223734ad93SSong Gao {
723f0db0bebSSong Gao     int i;
724ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
725ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
726f0db0bebSSong Gao     int oprsz = simd_oprsz(desc);
7273734ad93SSong Gao 
728f0db0bebSSong Gao     for (i = 0; i < oprsz / 16; i++) {
729f0db0bebSSong Gao         Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1));
730f0db0bebSSong Gao     }
7313734ad93SSong Gao }
7323734ad93SSong Gao 
7333734ad93SSong Gao VEXTH(vexth_h_b, 16, H, B)
7343734ad93SSong Gao VEXTH(vexth_w_h, 32, W, H)
7353734ad93SSong Gao VEXTH(vexth_d_w, 64, D, W)
7363734ad93SSong Gao VEXTH(vexth_hu_bu, 16, UH, UB)
7373734ad93SSong Gao VEXTH(vexth_wu_hu, 32, UW, UH)
7383734ad93SSong Gao VEXTH(vexth_du_wu, 64, UD, UW)
739f0e395dfSSong Gao 
740790acb2aSSong Gao #define VEXT2XV(NAME, BIT, E1, E2)                   \
741790acb2aSSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
742790acb2aSSong Gao {                                                    \
743790acb2aSSong Gao     int i;                                           \
744790acb2aSSong Gao     VReg temp = {};                                  \
745790acb2aSSong Gao     VReg *Vd = (VReg *)vd;                           \
746790acb2aSSong Gao     VReg *Vj = (VReg *)vj;                           \
747790acb2aSSong Gao     int oprsz = simd_oprsz(desc);                    \
748790acb2aSSong Gao                                                      \
749790acb2aSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {        \
750790acb2aSSong Gao         temp.E1(i) = Vj->E2(i);                      \
751790acb2aSSong Gao     }                                                \
752790acb2aSSong Gao     *Vd = temp;                                      \
753790acb2aSSong Gao }
754790acb2aSSong Gao 
755790acb2aSSong Gao VEXT2XV(vext2xv_h_b, 16, H, B)
756790acb2aSSong Gao VEXT2XV(vext2xv_w_b, 32, W, B)
757790acb2aSSong Gao VEXT2XV(vext2xv_d_b, 64, D, B)
758790acb2aSSong Gao VEXT2XV(vext2xv_w_h, 32, W, H)
759790acb2aSSong Gao VEXT2XV(vext2xv_d_h, 64, D, H)
760790acb2aSSong Gao VEXT2XV(vext2xv_d_w, 64, D, W)
761790acb2aSSong Gao VEXT2XV(vext2xv_hu_bu, 16, UH, UB)
762790acb2aSSong Gao VEXT2XV(vext2xv_wu_bu, 32, UW, UB)
763790acb2aSSong Gao VEXT2XV(vext2xv_du_bu, 64, UD, UB)
764790acb2aSSong Gao VEXT2XV(vext2xv_wu_hu, 32, UW, UH)
765790acb2aSSong Gao VEXT2XV(vext2xv_du_hu, 64, UD, UH)
766790acb2aSSong Gao VEXT2XV(vext2xv_du_wu, 64, UD, UW)
767790acb2aSSong Gao 
768f0e395dfSSong Gao DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV)
769f0e395dfSSong Gao DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV)
770f0e395dfSSong Gao DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV)
771f0e395dfSSong Gao DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV)
772789f4a4cSSong Gao 
/*
 * Gather the sign bit of each of the eight bytes in 'val' into an 8-bit
 * mask: bit k of the result is the top bit of byte k.
 */
static uint64_t do_vmskltz_b(int64_t val)
{
    /* Keep only the top bit of every byte. */
    uint64_t bits = (uint64_t)val & 0x8080808080808080ULL;

    /* Fold the eight isolated bits together so they land in bits 56..63. */
    bits |= bits << 7;
    bits |= bits << 14;
    bits |= bits << 28;

    return bits >> 56;
}
782789f4a4cSSong Gao 
HELPER(vmskltz_b)783ff27e335SSong Gao void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc)
784789f4a4cSSong Gao {
78597074674SSong Gao     int i;
786789f4a4cSSong Gao     uint16_t temp = 0;
787ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
788ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
78997074674SSong Gao     int oprsz = simd_oprsz(desc);
790789f4a4cSSong Gao 
79197074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
79297074674SSong Gao         temp = 0;
79397074674SSong Gao         temp = do_vmskltz_b(Vj->D(2 * i));
79497074674SSong Gao         temp |= (do_vmskltz_b(Vj->D(2 * i  + 1)) << 8);
79597074674SSong Gao         Vd->D(2 * i) = temp;
79697074674SSong Gao         Vd->D(2 * i + 1) = 0;
79797074674SSong Gao     }
798789f4a4cSSong Gao }
799789f4a4cSSong Gao 
/*
 * Gather the sign bit of each of the four 16-bit fields in 'val' into a
 * 4-bit mask: bit k of the result is the top bit of halfword k.
 */
static uint64_t do_vmskltz_h(int64_t val)
{
    /* Keep only the top bit of every halfword. */
    uint64_t bits = (uint64_t)val & 0x8000800080008000ULL;

    /* Fold the four isolated bits together so they land in bits 60..63. */
    bits |= bits << 15;
    bits |= bits << 30;

    return bits >> 60;
}
808789f4a4cSSong Gao 
HELPER(vmskltz_h)809ff27e335SSong Gao void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc)
810789f4a4cSSong Gao {
81197074674SSong Gao     int i;
812789f4a4cSSong Gao     uint16_t temp = 0;
813ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
814ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
81597074674SSong Gao     int oprsz = simd_oprsz(desc);
816789f4a4cSSong Gao 
81797074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
81897074674SSong Gao         temp = 0;
81997074674SSong Gao         temp = do_vmskltz_h(Vj->D(2 * i));
82097074674SSong Gao         temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4);
82197074674SSong Gao         Vd->D(2 * i) = temp;
82297074674SSong Gao         Vd->D(2 * i + 1) = 0;
82397074674SSong Gao     }
824789f4a4cSSong Gao }
825789f4a4cSSong Gao 
/*
 * Gather the sign bit of each of the two 32-bit fields in 'val' into a
 * 2-bit mask: bit 0 is the top bit of the low word, bit 1 that of the high
 * word.
 */
static uint64_t do_vmskltz_w(int64_t val)
{
    /* Keep only the top bit of every word. */
    uint64_t bits = (uint64_t)val & 0x8000000080000000ULL;

    /* Move the low word's bit next to the high word's bit (bits 62..63). */
    bits |= bits << 31;

    return bits >> 62;
}
833789f4a4cSSong Gao 
HELPER(vmskltz_w)834ff27e335SSong Gao void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc)
835789f4a4cSSong Gao {
83697074674SSong Gao     int i;
837789f4a4cSSong Gao     uint16_t temp = 0;
838ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
839ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
84097074674SSong Gao     int oprsz = simd_oprsz(desc);
841789f4a4cSSong Gao 
84297074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
84397074674SSong Gao         temp = 0;
84497074674SSong Gao         temp = do_vmskltz_w(Vj->D(2 * i));
84597074674SSong Gao         temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2);
84697074674SSong Gao         Vd->D(2 * i) = temp;
84797074674SSong Gao         Vd->D(2 * i + 1) = 0;
84897074674SSong Gao     }
849789f4a4cSSong Gao }
850789f4a4cSSong Gao 
/* Return 1 when the 64-bit value is negative (top bit set), otherwise 0. */
static uint64_t do_vmskltz_d(int64_t val)
{
    return val < 0 ? 1 : 0;
}
HELPER(vmskltz_d)855ff27e335SSong Gao void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc)
856789f4a4cSSong Gao {
85797074674SSong Gao     int i;
858789f4a4cSSong Gao     uint16_t temp = 0;
859ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
860ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
86197074674SSong Gao     int oprsz = simd_oprsz(desc);
862789f4a4cSSong Gao 
86397074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
86497074674SSong Gao         temp = 0;
86597074674SSong Gao         temp = do_vmskltz_d(Vj->D(2 * i));
86697074674SSong Gao         temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1);
86797074674SSong Gao         Vd->D(2 * i) = temp;
86897074674SSong Gao         Vd->D(2 * i + 1) = 0;
86997074674SSong Gao     }
870789f4a4cSSong Gao }
871789f4a4cSSong Gao 
HELPER(vmskgez_b)872ff27e335SSong Gao void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc)
873789f4a4cSSong Gao {
87497074674SSong Gao     int i;
875789f4a4cSSong Gao     uint16_t temp = 0;
876ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
877ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
87897074674SSong Gao     int oprsz = simd_oprsz(desc);
879789f4a4cSSong Gao 
88097074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
88197074674SSong Gao         temp = 0;
88297074674SSong Gao         temp =  do_vmskltz_b(Vj->D(2 * i));
88397074674SSong Gao         temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8);
88497074674SSong Gao         Vd->D(2 * i) = (uint16_t)(~temp);
88597074674SSong Gao         Vd->D(2 * i + 1) = 0;
88697074674SSong Gao     }
887789f4a4cSSong Gao }
888789f4a4cSSong Gao 
/*
 * Build an 8-bit mask of the zero bytes in 'a': bit k of the result is set
 * when byte k of 'a' is 0x00.
 */
static uint64_t do_vmskez_b(uint64_t a)
{
    const uint64_t low7 = 0x7f7f7f7f7f7f7f7fULL;
    /* Top bit of each byte becomes set if any of its low seven bits is. */
    uint64_t carry = (a & low7) + low7;
    /* After OR-ing, a byte is 0xff unless the original byte was zero. */
    uint64_t nonzero = carry | a | low7;
    /* Leaves 0x80 in exactly the bytes that were zero. */
    uint64_t bits = ~nonzero;

    /* Fold the per-byte flags together so they land in bits 56..63. */
    bits |= bits << 7;
    bits |= bits << 14;
    bits |= bits << 28;

    return bits >> 56;
}
898789f4a4cSSong Gao 
HELPER(vmsknz_b)899ff27e335SSong Gao void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc)
900789f4a4cSSong Gao {
90197074674SSong Gao     int i;
902789f4a4cSSong Gao     uint16_t temp = 0;
903ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
904ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
90597074674SSong Gao     int oprsz = simd_oprsz(desc);
906789f4a4cSSong Gao 
90797074674SSong Gao     for (i = 0; i < oprsz / 16; i++) {
90897074674SSong Gao         temp = 0;
90997074674SSong Gao         temp = do_vmskez_b(Vj->D(2 * i));
91097074674SSong Gao         temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8);
91197074674SSong Gao         Vd->D(2 * i) = (uint16_t)(~temp);
91297074674SSong Gao         Vd->D(2 * i + 1) = 0;
91397074674SSong Gao     }
914789f4a4cSSong Gao }
915f205a539SSong Gao 
HELPER(vnori_b)9164472a45aSSong Gao void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc)
917f205a539SSong Gao {
918f205a539SSong Gao     int i;
919f205a539SSong Gao     VReg *Vd = (VReg *)vd;
920f205a539SSong Gao     VReg *Vj = (VReg *)vj;
921f205a539SSong Gao 
9224472a45aSSong Gao     for (i = 0; i < simd_oprsz(desc); i++) {
923f205a539SSong Gao         Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm);
924f205a539SSong Gao     }
925f205a539SSong Gao }
9269b21a7a5SSong Gao 
/*
 * Convert-and-shift-left on the first half of each 16-byte lane.
 *
 * With ofs = LSX_LEN / BIT, each source lane holds 2 * ofs elements
 * (accessor E2); the first ofs of them are converted to the destination
 * element type, shifted left by imm % BIT and stored as the lane's ofs
 * destination elements (accessor E1).  The result is built in a
 * zero-initialised scratch register and copied to *Vd at the end.
 */
#define VSLLWIL(NAME, BIT, E1, E2)                                             \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)             \
{                                                                              \
    VReg *Vd = (VReg *)vd;                                                     \
    VReg *Vj = (VReg *)vj;                                                     \
    VReg result = {};                                                          \
    typedef __typeof(result.E1(0)) TD;                                         \
    const int ofs = LSX_LEN / BIT;                                             \
    const uint64_t shift = imm % BIT;                                          \
    int nlanes = simd_oprsz(desc) / 16;                                        \
    int lane, j;                                                               \
                                                                               \
    for (lane = 0; lane < nlanes; lane++) {                                    \
        for (j = 0; j < ofs; j++) {                                            \
            TD src = (TD)Vj->E2(j + ofs * 2 * lane);                           \
                                                                               \
            result.E1(j + ofs * lane) = src << shift;                          \
        }                                                                      \
    }                                                                          \
    *Vd = result;                                                              \
}

9459b21a7a5SSong Gao 
9466567eac7SSong Gao 
HELPER(vextl_q_d)947ff27e335SSong Gao void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc)
9489b21a7a5SSong Gao {
9496567eac7SSong Gao     int i;
950ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
951ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
9526567eac7SSong Gao     int oprsz = simd_oprsz(desc);
9539b21a7a5SSong Gao 
9546567eac7SSong Gao     for (i = 0; i < oprsz / 16; i++) {
9556567eac7SSong Gao         Vd->Q(i) = int128_makes64(Vj->D(2 * i));
9566567eac7SSong Gao     }
9579b21a7a5SSong Gao }
9589b21a7a5SSong Gao 
HELPER(vextl_qu_du)959ff27e335SSong Gao void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc)
9609b21a7a5SSong Gao {
9616567eac7SSong Gao     int i;
962ff27e335SSong Gao     VReg *Vd = (VReg *)vd;
963ff27e335SSong Gao     VReg *Vj = (VReg *)vj;
9646567eac7SSong Gao     int oprsz = simd_oprsz(desc);
9659b21a7a5SSong Gao 
9666567eac7SSong Gao     for (i = 0; i < oprsz / 16; i++) {
9676567eac7SSong Gao         Vd->Q(i) = int128_make64(Vj->UD(2 * i));
9686567eac7SSong Gao     }
9699b21a7a5SSong Gao }
9709b21a7a5SSong Gao 
9719b21a7a5SSong Gao VSLLWIL(vsllwil_h_b, 16, H, B)
9729b21a7a5SSong Gao VSLLWIL(vsllwil_w_h, 32, W, H)
9739b21a7a5SSong Gao VSLLWIL(vsllwil_d_w, 64, D, W)
9749b21a7a5SSong Gao VSLLWIL(vsllwil_hu_bu, 16, UH, UB)
9759b21a7a5SSong Gao VSLLWIL(vsllwil_wu_hu, 32, UW, UH)
9769b21a7a5SSong Gao VSLLWIL(vsllwil_du_wu, 64, UD, UW)
977ecb93716SSong Gao 
/*
 * Define do_vsrlr_<E>(): right shift of an unsigned value with rounding.
 * A shift count of zero returns the input unchanged; otherwise the last bit
 * shifted out (bit sh - 1 of the input) is added to the shifted value.
 */
#define do_vsrlr(E, T)                              \
static T do_vsrlr_ ## E(T s1, int sh)               \
{                                                   \
    T round;                                        \
                                                    \
    if (sh == 0) {                                  \
        return s1;                                  \
    }                                               \
    round = (s1 >> (sh - 1)) & 0x1;                 \
    return (s1 >> sh) + round;                      \
}

do_vsrlr(B, uint8_t)
do_vsrlr(H, uint16_t)
do_vsrlr(W, uint32_t)
do_vsrlr(D, uint64_t)
992ecb93716SSong Gao 
993ecb93716SSong Gao #define VSRLR(NAME, BIT, T, E)                                  \
99404711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
995ecb93716SSong Gao {                                                               \
996ecb93716SSong Gao     int i;                                                      \
99704711da1SSong Gao     VReg *Vd = (VReg *)vd;                                      \
99804711da1SSong Gao     VReg *Vj = (VReg *)vj;                                      \
99904711da1SSong Gao     VReg *Vk = (VReg *)vk;                                      \
10008c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                               \
1001ecb93716SSong Gao                                                                 \
10028c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                   \
1003ecb93716SSong Gao         Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
1004ecb93716SSong Gao     }                                                           \
1005ecb93716SSong Gao }
1006ecb93716SSong Gao 
1007ecb93716SSong Gao VSRLR(vsrlr_b, 8,  uint8_t, B)
1008ecb93716SSong Gao VSRLR(vsrlr_h, 16, uint16_t, H)
1009ecb93716SSong Gao VSRLR(vsrlr_w, 32, uint32_t, W)
1010ecb93716SSong Gao VSRLR(vsrlr_d, 64, uint64_t, D)
1011ecb93716SSong Gao 
1012ecb93716SSong Gao #define VSRLRI(NAME, BIT, E)                                       \
1013329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1014ecb93716SSong Gao {                                                                  \
1015ecb93716SSong Gao     int i;                                                         \
1016329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
1017329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
10188c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                                  \
1019ecb93716SSong Gao                                                                    \
10208c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
1021ecb93716SSong Gao         Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm);                  \
1022ecb93716SSong Gao     }                                                              \
1023ecb93716SSong Gao }
1024ecb93716SSong Gao 
1025ecb93716SSong Gao VSRLRI(vsrlri_b, 8, B)
1026ecb93716SSong Gao VSRLRI(vsrlri_h, 16, H)
1027ecb93716SSong Gao VSRLRI(vsrlri_w, 32, W)
1028ecb93716SSong Gao VSRLRI(vsrlri_d, 64, D)
1029ecb93716SSong Gao 
/*
 * Define do_vsrar_<E>(): right shift of a signed value with rounding.
 * A shift count of zero returns the input unchanged; otherwise the last bit
 * shifted out (bit sh - 1 of the input) is added to the shifted value.
 */
#define do_vsrar(E, T)                              \
static T do_vsrar_ ## E(T s1, int sh)               \
{                                                   \
    T round;                                        \
                                                    \
    if (sh == 0) {                                  \
        return s1;                                  \
    }                                               \
    round = (s1 >> (sh - 1)) & 0x1;                 \
    return (s1 >> sh) + round;                      \
}

do_vsrar(B, int8_t)
do_vsrar(H, int16_t)
do_vsrar(W, int32_t)
do_vsrar(D, int64_t)
1044ecb93716SSong Gao 
1045ecb93716SSong Gao #define VSRAR(NAME, BIT, T, E)                                  \
104604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)  \
1047ecb93716SSong Gao {                                                               \
1048ecb93716SSong Gao     int i;                                                      \
104904711da1SSong Gao     VReg *Vd = (VReg *)vd;                                      \
105004711da1SSong Gao     VReg *Vj = (VReg *)vj;                                      \
105104711da1SSong Gao     VReg *Vk = (VReg *)vk;                                      \
10528c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                               \
1053ecb93716SSong Gao                                                                 \
10548c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                   \
1055ecb93716SSong Gao         Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \
1056ecb93716SSong Gao     }                                                           \
1057ecb93716SSong Gao }
1058ecb93716SSong Gao 
1059ecb93716SSong Gao VSRAR(vsrar_b, 8,  uint8_t, B)
1060ecb93716SSong Gao VSRAR(vsrar_h, 16, uint16_t, H)
1061ecb93716SSong Gao VSRAR(vsrar_w, 32, uint32_t, W)
1062ecb93716SSong Gao VSRAR(vsrar_d, 64, uint64_t, D)
1063ecb93716SSong Gao 
1064ecb93716SSong Gao #define VSRARI(NAME, BIT, E)                                       \
1065329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1066ecb93716SSong Gao {                                                                  \
1067ecb93716SSong Gao     int i;                                                         \
1068329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
1069329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
10708c272fe8SSong Gao     int oprsz = simd_oprsz(desc);                                  \
1071ecb93716SSong Gao                                                                    \
10728c272fe8SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
1073ecb93716SSong Gao         Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm);                  \
1074ecb93716SSong Gao     }                                                              \
1075ecb93716SSong Gao }
1076ecb93716SSong Gao 
1077ecb93716SSong Gao VSRARI(vsrari_b, 8, B)
1078ecb93716SSong Gao VSRARI(vsrari_h, 16, H)
1079ecb93716SSong Gao VSRARI(vsrari_w, 32, W)
1080ecb93716SSong Gao VSRARI(vsrari_d, 64, D)
1081d79fb8ddSSong Gao 
108240c7674eSSong Gao #define VSRLN(NAME, BIT, E1, E2)                                          \
108304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
1084d79fb8ddSSong Gao {                                                                         \
108540c7674eSSong Gao     int i, j, ofs;                                                        \
108604711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
108704711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
108804711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
108940c7674eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
1090d79fb8ddSSong Gao                                                                           \
109140c7674eSSong Gao     ofs = LSX_LEN / BIT;                                                  \
109240c7674eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
109340c7674eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
109440c7674eSSong Gao             Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i),        \
109540c7674eSSong Gao                                               Vk->E2(j + ofs * i) % BIT); \
1096d79fb8ddSSong Gao         }                                                                 \
109740c7674eSSong Gao         Vd->D(2 * i + 1) = 0;                                             \
109840c7674eSSong Gao     }                                                                     \
1099d79fb8ddSSong Gao }
1100d79fb8ddSSong Gao 
110140c7674eSSong Gao VSRLN(vsrln_b_h, 16, B, UH)
110240c7674eSSong Gao VSRLN(vsrln_h_w, 32, H, UW)
110340c7674eSSong Gao VSRLN(vsrln_w_d, 64, W, UD)
1104d79fb8ddSSong Gao 
110540c7674eSSong Gao #define VSRAN(NAME, BIT, E1, E2, E3)                                      \
110604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
1107d79fb8ddSSong Gao {                                                                         \
110840c7674eSSong Gao     int i, j, ofs;                                                        \
110904711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
111004711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
111104711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
111240c7674eSSong Gao     int oprsz = simd_oprsz(desc);                                         \
1113d79fb8ddSSong Gao                                                                           \
111440c7674eSSong Gao     ofs = LSX_LEN / BIT;                                                  \
111540c7674eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
111640c7674eSSong Gao         for (j = 0; j < ofs; j++) {                                       \
111740c7674eSSong Gao             Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i),        \
111840c7674eSSong Gao                                               Vk->E3(j + ofs * i) % BIT); \
1119d79fb8ddSSong Gao         }                                                                 \
112040c7674eSSong Gao         Vd->D(2 * i + 1) = 0;                                             \
112140c7674eSSong Gao     }                                                                     \
1122d79fb8ddSSong Gao }
1123d79fb8ddSSong Gao 
112440c7674eSSong Gao VSRAN(vsran_b_h, 16, B, H, UH)
112540c7674eSSong Gao VSRAN(vsran_h_w, 32, H, W, UW)
112640c7674eSSong Gao VSRAN(vsran_w_d, 64, W, D, UD)
1127d79fb8ddSSong Gao 
/*
 * VSRLNI(NAME, BIT, E1, E2): define HELPER(NAME), a narrowing right shift by
 * an immediate that packs results from both Vj and the old value of Vd.
 *
 * The operation size from desc is walked in 16-byte chunks.  Inside a chunk,
 * for each of the LSX_LEN / BIT source positions, Vj->E2 shifted by imm goes
 * to the first group of E1 slots and Vd->E2 shifted by imm goes to the
 * following group.  The result is assembled in a zero-initialised temporary
 * because Vd is read as a source, and the whole temporary is copied to Vd at
 * the end.
 */
#define VSRLNI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)        \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    int nbytes = simd_oprsz(desc);                                        \
    int per_chunk = LSX_LEN / BIT;                                        \
    int chunk, n;                                                         \
                                                                          \
    for (chunk = 0; chunk < nbytes / 16; chunk++) {                       \
        for (n = 0; n < per_chunk; n++) {                                 \
            int src = n + per_chunk * chunk;                              \
            int from_j = n + per_chunk * 2 * chunk;                       \
            int from_d = n + per_chunk * (2 * chunk + 1);                 \
                                                                          \
            result.E1(from_j) = R_SHIFT(Vj->E2(src), imm);                \
            result.E1(from_d) = R_SHIFT(Vd->E2(src), imm);                \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
1147d79fb8ddSSong Gao 
1148329517d5SSong Gao void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1149d79fb8ddSSong Gao {
115040c7674eSSong Gao     int i;
115140c7674eSSong Gao     VReg temp = {};
1152329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1153329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1154d79fb8ddSSong Gao 
115540c7674eSSong Gao     for (i = 0; i < 2; i++) {
115640c7674eSSong Gao         temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128));
115740c7674eSSong Gao         temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128));
115840c7674eSSong Gao     }
1159d79fb8ddSSong Gao     *Vd = temp;
1160d79fb8ddSSong Gao }
1161d79fb8ddSSong Gao 
/*
 * Generate the vsrlni helpers for 16-, 32- and 64-bit source elements; the
 * 128-bit source case is the hand-written vsrlni_d_q helper.
 */
116240c7674eSSong Gao VSRLNI(vsrlni_b_h, 16, B, UH)
116340c7674eSSong Gao VSRLNI(vsrlni_h_w, 32, H, UW)
116440c7674eSSong Gao VSRLNI(vsrlni_w_d, 64, W, UD)
1165d79fb8ddSSong Gao 
/*
 * VSRANI(NAME, BIT, E1, E2): define HELPER(NAME), a narrowing right shift by
 * an immediate that packs results from both Vj and the old value of Vd.
 * The instantiations pass signed E2 accessors.
 *
 * The operation size from desc is walked in 16-byte chunks.  Inside a chunk,
 * for each of the LSX_LEN / BIT source positions, Vj->E2 shifted by imm goes
 * to the first group of E1 slots and Vd->E2 shifted by imm goes to the
 * following group.  The result is assembled in a zero-initialised temporary
 * because Vd is read as a source, and the whole temporary is copied to Vd at
 * the end.
 */
#define VSRANI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)        \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    int nbytes = simd_oprsz(desc);                                        \
    int per_chunk = LSX_LEN / BIT;                                        \
    int chunk, n;                                                         \
                                                                          \
    for (chunk = 0; chunk < nbytes / 16; chunk++) {                       \
        for (n = 0; n < per_chunk; n++) {                                 \
            int src = n + per_chunk * chunk;                              \
            int from_j = n + per_chunk * 2 * chunk;                       \
            int from_d = n + per_chunk * (2 * chunk + 1);                 \
                                                                          \
            result.E1(from_j) = R_SHIFT(Vj->E2(src), imm);                \
            result.E1(from_d) = R_SHIFT(Vd->E2(src), imm);                \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
1185d79fb8ddSSong Gao 
HELPER(vsrani_d_q)1186329517d5SSong Gao void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1187d79fb8ddSSong Gao {
118840c7674eSSong Gao     int i;
118940c7674eSSong Gao     VReg temp = {};
1190329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1191329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1192d79fb8ddSSong Gao 
119340c7674eSSong Gao     for (i = 0; i < 2; i++) {
119440c7674eSSong Gao         temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128));
119540c7674eSSong Gao         temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128));
119640c7674eSSong Gao     }
1197d79fb8ddSSong Gao     *Vd = temp;
1198d79fb8ddSSong Gao }
1199d79fb8ddSSong Gao 
/*
 * Generate the vsrani helpers for 16-, 32- and 64-bit source elements; the
 * 128-bit source case is the hand-written vsrani_d_q helper.
 */
1200d79fb8ddSSong Gao VSRANI(vsrani_b_h, 16, B, H)
1201d79fb8ddSSong Gao VSRANI(vsrani_h_w, 32, H, W)
1202d79fb8ddSSong Gao VSRANI(vsrani_w_d, 64, W, D)
1203a5200a17SSong Gao 
1204c50ce38aSSong Gao #define VSRLRN(NAME, BIT, E1, E2, E3)                                      \
120504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)             \
1206a5200a17SSong Gao {                                                                          \
1207c50ce38aSSong Gao     int i, j, ofs;                                                         \
120804711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                 \
120904711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                 \
121004711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                 \
1211c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);                                          \
1212a5200a17SSong Gao                                                                            \
1213c50ce38aSSong Gao     ofs = LSX_LEN / BIT;                                                   \
1214c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                     \
1215c50ce38aSSong Gao         for (j = 0; j < ofs; j++) {                                        \
1216c50ce38aSSong Gao             Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i),  \
1217c50ce38aSSong Gao                                                Vk->E3(j + ofs * i) % BIT); \
1218a5200a17SSong Gao         }                                                                  \
1219c50ce38aSSong Gao         Vd->D(2 * i + 1) = 0;                                              \
1220c50ce38aSSong Gao     }                                                                      \
1221a5200a17SSong Gao }
1222a5200a17SSong Gao 
1223c50ce38aSSong Gao VSRLRN(vsrlrn_b_h, 16, B, H, UH)
1224c50ce38aSSong Gao VSRLRN(vsrlrn_h_w, 32, H, W, UW)
1225c50ce38aSSong Gao VSRLRN(vsrlrn_w_d, 64, W, D, UD)
1226a5200a17SSong Gao 
1227c50ce38aSSong Gao #define VSRARN(NAME, BIT, E1, E2, E3)                                       \
122804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
1229a5200a17SSong Gao {                                                                           \
1230c50ce38aSSong Gao     int i, j, ofs;                                                          \
123104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
123204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
123304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
1234c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);                                           \
1235a5200a17SSong Gao                                                                             \
1236c50ce38aSSong Gao     ofs = LSX_LEN / BIT;                                                    \
1237c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
1238c50ce38aSSong Gao         for (j = 0; j < ofs; j++) {                                         \
1239c50ce38aSSong Gao             Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i),  \
1240c50ce38aSSong Gao                                                 Vk->E3(j + ofs * i) % BIT); \
1241a5200a17SSong Gao         }                                                                   \
1242c50ce38aSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
1243c50ce38aSSong Gao     }                                                                       \
1244a5200a17SSong Gao }
1245a5200a17SSong Gao 
1246c50ce38aSSong Gao VSRARN(vsrarn_b_h, 16, B, H, UH)
1247c50ce38aSSong Gao VSRARN(vsrarn_h_w, 32, H, W, UW)
1248c50ce38aSSong Gao VSRARN(vsrarn_w_d, 64, W, D, UD)
1249a5200a17SSong Gao 
/*
 * VSRLRNI(NAME, BIT, E1, E2): define HELPER(NAME), a narrowing shift by an
 * immediate that delegates the per-element work to do_vsrlr_<E2> and packs
 * results from both Vj and the old value of Vd.
 *
 * The operation size from desc is walked in 16-byte chunks.  Inside a chunk,
 * for each of the LSX_LEN / BIT source positions, the result for Vj->E2 goes
 * to the first group of E1 slots and the result for Vd->E2 goes to the
 * following group.  The result is assembled in a zero-initialised temporary
 * because Vd is read as a source, and the whole temporary is copied to Vd at
 * the end.
 */
#define VSRLRNI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)         \
{                                                                          \
    VReg *Vd = (VReg *)vd;                                                 \
    VReg *Vj = (VReg *)vj;                                                 \
    VReg result = {};                                                      \
    int nbytes = simd_oprsz(desc);                                         \
    int per_chunk = LSX_LEN / BIT;                                         \
    int chunk, n;                                                          \
                                                                           \
    for (chunk = 0; chunk < nbytes / 16; chunk++) {                        \
        for (n = 0; n < per_chunk; n++) {                                  \
            int src = n + per_chunk * chunk;                               \
            int from_j = n + per_chunk * 2 * chunk;                        \
            int from_d = n + per_chunk * (2 * chunk + 1);                  \
                                                                           \
            result.E1(from_j) = do_vsrlr_ ## E2(Vj->E2(src), imm);         \
            result.E1(from_d) = do_vsrlr_ ## E2(Vd->E2(src), imm);         \
        }                                                                  \
    }                                                                      \
    *Vd = result;                                                          \
}
1269a5200a17SSong Gao 
HELPER(vsrlrni_d_q)1270329517d5SSong Gao void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1271a5200a17SSong Gao {
1272c50ce38aSSong Gao     int i;
1273c50ce38aSSong Gao     VReg temp = {};
1274329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1275329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1276c50ce38aSSong Gao     Int128 r[4];
1277c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);
1278a5200a17SSong Gao 
1279c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1280a5200a17SSong Gao         if (imm == 0) {
1281c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(Vj->Q(i));
1282c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1283a5200a17SSong Gao         } else {
1284c50ce38aSSong Gao             r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)),
1285c50ce38aSSong Gao                                   int128_one());
1286c50ce38aSSong Gao             r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)),
1287c50ce38aSSong Gao                                       int128_one());
1288c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i),
1289c50ce38aSSong Gao                                                     imm), r[2 * i]));
1290c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i),
1291c50ce38aSSong Gao                                                         imm), r[ 2 * i + 1]));
1292c50ce38aSSong Gao         }
1293a5200a17SSong Gao     }
1294a5200a17SSong Gao     *Vd = temp;
1295a5200a17SSong Gao }
1296a5200a17SSong Gao 
/*
 * Generate the vsrlrni helpers for 16-, 32- and 64-bit source elements; the
 * 128-bit source case is the hand-written vsrlrni_d_q helper.
 */
1297a5200a17SSong Gao VSRLRNI(vsrlrni_b_h, 16, B, H)
1298a5200a17SSong Gao VSRLRNI(vsrlrni_h_w, 32, H, W)
1299a5200a17SSong Gao VSRLRNI(vsrlrni_w_d, 64, W, D)
1300a5200a17SSong Gao 
/*
 * VSRARNI(NAME, BIT, E1, E2): define HELPER(NAME), a narrowing shift by an
 * immediate that delegates the per-element work to do_vsrar_<E2> and packs
 * results from both Vj and the old value of Vd.
 *
 * The operation size from desc is walked in 16-byte chunks.  Inside a chunk,
 * for each of the LSX_LEN / BIT source positions, the result for Vj->E2 goes
 * to the first group of E1 slots and the result for Vd->E2 goes to the
 * following group.  The result is assembled in a zero-initialised temporary
 * because Vd is read as a source, and the whole temporary is copied to Vd at
 * the end.
 */
#define VSRARNI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)         \
{                                                                          \
    VReg *Vd = (VReg *)vd;                                                 \
    VReg *Vj = (VReg *)vj;                                                 \
    VReg result = {};                                                      \
    int nbytes = simd_oprsz(desc);                                         \
    int per_chunk = LSX_LEN / BIT;                                         \
    int chunk, n;                                                          \
                                                                           \
    for (chunk = 0; chunk < nbytes / 16; chunk++) {                        \
        for (n = 0; n < per_chunk; n++) {                                  \
            int src = n + per_chunk * chunk;                               \
            int from_j = n + per_chunk * 2 * chunk;                        \
            int from_d = n + per_chunk * (2 * chunk + 1);                  \
                                                                           \
            result.E1(from_j) = do_vsrar_ ## E2(Vj->E2(src), imm);         \
            result.E1(from_d) = do_vsrar_ ## E2(Vd->E2(src), imm);         \
        }                                                                  \
    }                                                                      \
    *Vd = result;                                                          \
}
1320a5200a17SSong Gao 
HELPER(vsrarni_d_q)1321329517d5SSong Gao void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1322a5200a17SSong Gao {
1323c50ce38aSSong Gao     int i;
1324c50ce38aSSong Gao     VReg temp = {};
1325329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1326329517d5SSong Gao     VReg *Vj = (VReg *)vj;
1327c50ce38aSSong Gao     Int128 r[4];
1328c50ce38aSSong Gao     int oprsz = simd_oprsz(desc);
1329a5200a17SSong Gao 
1330c50ce38aSSong Gao     for (i = 0; i < oprsz / 16; i++) {
1331a5200a17SSong Gao         if (imm == 0) {
1332c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(Vj->Q(i));
1333c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1334a5200a17SSong Gao         } else {
1335c50ce38aSSong Gao             r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)),
1336c50ce38aSSong Gao                                   int128_one());
1337c50ce38aSSong Gao             r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)),
1338c50ce38aSSong Gao                                       int128_one());
1339c50ce38aSSong Gao             temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i),
1340c50ce38aSSong Gao                                                     imm), r[2 * i]));
1341c50ce38aSSong Gao             temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i),
1342c50ce38aSSong Gao                                                         imm), r[2 * i + 1]));
1343c50ce38aSSong Gao         }
1344a5200a17SSong Gao     }
1345a5200a17SSong Gao     *Vd = temp;
1346a5200a17SSong Gao }
1347a5200a17SSong Gao 
/*
 * Generate the vsrarni helpers for 16-, 32- and 64-bit source elements; the
 * 128-bit source case is the hand-written vsrarni_d_q helper.
 */
1348a5200a17SSong Gao VSRARNI(vsrarni_b_h, 16, B, H)
1349a5200a17SSong Gao VSRARNI(vsrarni_h_w, 32, H, W)
1350a5200a17SSong Gao VSRARNI(vsrarni_w_d, 64, W, D)
135183b3815dSSong Gao 
/*
 * SSRLNS(NAME, T1, T2, T3): define do_ssrlns_<NAME>(e2, sa, sh).
 *
 * The signed input e2 (type T2) is reinterpreted as the unsigned type T1,
 * shifted right logically by sa, and clamped to the upper bound
 * (1 << sh) - 1, which is held in the narrow unsigned type T3.
 * The value is returned as T1.
 */
#define SSRLNS(NAME, T1, T2, T3)                    \
static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \
{                                                   \
    const T3 limit = (1ull << sh) - 1;              \
    T1 shifted = (T1)e2;                            \
                                                    \
    if (sa != 0) {                                  \
        shifted = shifted >> sa;                    \
    }                                               \
    return shifted > limit ? limit : shifted;       \
}

SSRLNS(B, uint16_t, int16_t, uint8_t)
SSRLNS(H, uint32_t, int32_t, uint16_t)
SSRLNS(W, uint64_t, int64_t, uint32_t)
137383b3815dSSong Gao 
13746256c8caSSong Gao #define VSSRLN(NAME, BIT, E1, E2, E3)                                       \
137504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
137683b3815dSSong Gao {                                                                           \
13776256c8caSSong Gao     int i, j, ofs;                                                          \
137804711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
137904711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
138004711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
13816256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                           \
138283b3815dSSong Gao                                                                             \
13836256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                    \
13846256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
13856256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                         \
13866256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \
13876256c8caSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
13886256c8caSSong Gao                                                 BIT / 2 - 1);               \
138983b3815dSSong Gao         }                                                                   \
13906256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
13916256c8caSSong Gao     }                                                                       \
139283b3815dSSong Gao }
139383b3815dSSong Gao 
13946256c8caSSong Gao VSSRLN(vssrln_b_h, 16, B, H, UH)
13956256c8caSSong Gao VSSRLN(vssrln_h_w, 32, H, W, UW)
13966256c8caSSong Gao VSSRLN(vssrln_w_d, 64, W, D, UD)
139783b3815dSSong Gao 
/*
 * SSRANS(E, T1, T2): define do_ssrans_<E>(e2, sa, sh).
 *
 * The signed input e2 (type T1) is shifted right by sa and then saturated to
 * the signed range [-(1 << sh), (1 << sh) - 1].  The upper bound is held in
 * the narrow signed type T2; the value is returned as T1.
 *
 * The lower bound is written as ~mask rather than -(mask + 1): both denote
 * the same value, but mask + 1 overflows int when T2 is int32_t and mask is
 * INT32_MAX, which is undefined behaviour unless the build uses -fwrapv.
 */
#define SSRANS(E, T1, T2)                        \
static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \
{                                                \
        T1 shft_res;                             \
        T2 mask;                                 \
                                                 \
        if (sa == 0) {                           \
            shft_res = e2;                       \
        } else {                                 \
            shft_res = e2 >> sa;                 \
        }                                        \
        mask = (1ll << sh) - 1;                  \
        if (shft_res > mask) {                   \
            return mask;                         \
        } else if (shft_res < ~mask) {           \
            return ~mask;                        \
        } else {                                 \
            return shft_res;                     \
        }                                        \
}

SSRANS(B, int16_t, int8_t)
SSRANS(H, int32_t, int16_t)
SSRANS(W, int64_t, int32_t)
142183b3815dSSong Gao 
14226256c8caSSong Gao #define VSSRAN(NAME, BIT, E1, E2, E3)                                       \
142304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
142483b3815dSSong Gao {                                                                           \
14256256c8caSSong Gao     int i, j, ofs;                                                          \
142604711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
142704711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
142804711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
14296256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                           \
143083b3815dSSong Gao                                                                             \
14316256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                    \
14326256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
14336256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                         \
14346256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \
14356256c8caSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
14366256c8caSSong Gao                                                 BIT / 2 - 1);               \
143783b3815dSSong Gao         }                                                                   \
14386256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
14396256c8caSSong Gao     }                                                                       \
144083b3815dSSong Gao }
144183b3815dSSong Gao 
14426256c8caSSong Gao VSSRAN(vssran_b_h, 16, B, H, UH)
14436256c8caSSong Gao VSSRAN(vssran_h_w, 32, H, W, UW)
14446256c8caSSong Gao VSSRAN(vssran_w_d, 64, W, D, UD)
144583b3815dSSong Gao 
/*
 * SSRLNU(E, T1, T2, T3): define do_ssrlnu_<E>(e2, sa, sh).
 *
 * The signed input e2 (type T3) is reinterpreted as the unsigned type T1,
 * shifted right logically by sa, and clamped to the upper bound
 * (1 << sh) - 1, which is held in the narrow unsigned type T2.
 * The value is returned as T1.
 */
#define SSRLNU(E, T1, T2, T3)                    \
static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \
{                                                \
    T1 val = (T1)e2;                             \
    T2 ceiling = (1ull << sh) - 1;               \
                                                 \
    if (sa != 0) {                               \
        val = val >> sa;                         \
    }                                            \
    if (val <= ceiling) {                        \
        return val;                              \
    }                                            \
    return ceiling;                              \
}

SSRLNU(B, uint16_t, uint8_t,  int16_t)
SSRLNU(H, uint32_t, uint16_t, int32_t)
SSRLNU(W, uint64_t, uint32_t, int64_t)
146783b3815dSSong Gao 
14686256c8caSSong Gao #define VSSRLNU(NAME, BIT, E1, E2, E3)                                      \
146904711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)              \
147083b3815dSSong Gao {                                                                           \
14716256c8caSSong Gao     int i, j, ofs;                                                          \
147204711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
147304711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
147404711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                  \
14756256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                           \
147683b3815dSSong Gao                                                                             \
14776256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                    \
14786256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                      \
14796256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                         \
14806256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \
14816256c8caSSong Gao                                                 Vk->E3(j + ofs * i) % BIT,  \
14826256c8caSSong Gao                                                 BIT / 2);                   \
148383b3815dSSong Gao         }                                                                   \
14846256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                               \
14856256c8caSSong Gao     }                                                                       \
148683b3815dSSong Gao }
148783b3815dSSong Gao 
14886256c8caSSong Gao VSSRLNU(vssrln_bu_h, 16, B, H, UH)
14896256c8caSSong Gao VSSRLNU(vssrln_hu_w, 32, H, W, UW)
14906256c8caSSong Gao VSSRLNU(vssrln_wu_d, 64, W, D, UD)
149183b3815dSSong Gao 
/*
 * SSRANU(E, T1, T2, T3): define do_ssranu_<E>(e2, sa, sh).
 *
 * The signed input e2 (type T3) yields 0 when it is negative.  Otherwise it
 * is shifted right by sa and clamped to the upper bound (1 << sh) - 1, which
 * is held in the narrow unsigned type T2.  The value is returned as the
 * unsigned type T1.
 */
#define SSRANU(E, T1, T2, T3)                    \
static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \
{                                                \
    const T2 ceiling = (1ull << sh) - 1;         \
    T1 val;                                      \
                                                 \
    if (e2 < 0) {                                \
        return 0;                                \
    }                                            \
    val = e2;                                    \
    if (sa != 0) {                               \
        val = e2 >> sa;                          \
    }                                            \
    return val > ceiling ? ceiling : val;        \
}

SSRANU(B, uint16_t, uint8_t,  int16_t)
SSRANU(H, uint32_t, uint16_t, int32_t)
SSRANU(W, uint64_t, uint32_t, int64_t)
151683b3815dSSong Gao 
15176256c8caSSong Gao #define VSSRANU(NAME, BIT, E1, E2, E3)                                         \
151804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
151983b3815dSSong Gao {                                                                              \
15206256c8caSSong Gao     int i, j, ofs;                                                             \
152104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
152204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
152304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
15246256c8caSSong Gao     int oprsz = simd_oprsz(desc);                                              \
152583b3815dSSong Gao                                                                                \
15266256c8caSSong Gao     ofs = LSX_LEN / BIT;                                                       \
15276256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
15286256c8caSSong Gao         for (j = 0; j < ofs; j++) {                                            \
15296256c8caSSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i),    \
15306256c8caSSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
15316256c8caSSong Gao                                                     BIT / 2);                  \
153283b3815dSSong Gao         }                                                                      \
15336256c8caSSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
15346256c8caSSong Gao     }                                                                          \
153583b3815dSSong Gao }
153683b3815dSSong Gao 
15376256c8caSSong Gao VSSRANU(vssran_bu_h, 16, B, H, UH)
15386256c8caSSong Gao VSSRANU(vssran_hu_w, 32, H, W, UW)
15396256c8caSSong Gao VSSRANU(vssran_wu_d, 64, W, D, UD)
154083b3815dSSong Gao 
/*
 * VSSRLNI: define HELPER(NAME), an immediate-shift helper built on
 * do_ssrlns_<E1>() (defined outside this macro).
 *
 * The outer loop runs simd_oprsz(desc) / 16 times, each pass covering
 * LSX_LEN / BIT source elements read through E2.  Results computed from
 * Vj fill the first LSX_LEN / BIT E1 slots of the pass, results computed
 * from the old contents of Vd fill the following LSX_LEN / BIT slots.
 * Every call receives imm and BIT / 2 - 1.  Output is collected in a
 * zero-initialised temporary and copied to Vd once all inputs are read.
 */
#define VSSRLNI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)         \
{                                                                          \
    VReg *Vd = (VReg *)vd;                                                 \
    VReg *Vj = (VReg *)vj;                                                 \
    VReg result = {};                                                      \
    int oprsz = simd_oprsz(desc);                                          \
    int per_grp = LSX_LEN / BIT;                                           \
    int grp, n;                                                            \
                                                                           \
    for (grp = 0; grp < oprsz / 16; grp++) {                               \
        int src = per_grp * grp;                                           \
        int dst_j = per_grp * 2 * grp;                                     \
        int dst_d = per_grp * (2 * grp + 1);                               \
                                                                           \
        for (n = 0; n < per_grp; n++) {                                    \
            result.E1(dst_j + n) = do_ssrlns_ ## E1(Vj->E2(src + n),       \
                                                    imm, BIT / 2 - 1);     \
            result.E1(dst_d + n) = do_ssrlns_ ## E1(Vd->E2(src + n),       \
                                                    imm, BIT / 2 - 1);     \
        }                                                                  \
    }                                                                      \
    *Vd = result;                                                          \
}
156183b3815dSSong Gao 
15626256c8caSSong Gao static void do_vssrlni_q(VReg *Vd, VReg *Vj,
15636256c8caSSong Gao                          uint64_t imm, int idx, Int128 mask)
156483b3815dSSong Gao {
15656256c8caSSong Gao     Int128 shft_res1, shft_res2;
156683b3815dSSong Gao 
156783b3815dSSong Gao     if (imm == 0) {
15686256c8caSSong Gao         shft_res1 = Vj->Q(idx);
15696256c8caSSong Gao         shft_res2 = Vd->Q(idx);
157083b3815dSSong Gao     } else {
15716256c8caSSong Gao         shft_res1 = int128_urshift(Vj->Q(idx), imm);
15726256c8caSSong Gao         shft_res2 = int128_urshift(Vd->Q(idx), imm);
157383b3815dSSong Gao     }
157483b3815dSSong Gao 
157583b3815dSSong Gao     if (int128_ult(mask, shft_res1)) {
15766256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
157783b3815dSSong Gao     }else {
15786256c8caSSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
157983b3815dSSong Gao     }
158083b3815dSSong Gao 
158183b3815dSSong Gao     if (int128_ult(mask, shft_res2)) {
15826256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
158383b3815dSSong Gao     }else {
15846256c8caSSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
15856256c8caSSong Gao     }
15866256c8caSSong Gao }
15876256c8caSSong Gao 
HELPER(vssrlni_d_q)15886256c8caSSong Gao void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
15896256c8caSSong Gao {
15906256c8caSSong Gao     int i;
15916256c8caSSong Gao     Int128 mask;
15926256c8caSSong Gao     VReg *Vd = (VReg *)vd;
15936256c8caSSong Gao     VReg *Vj = (VReg *)vj;
15946256c8caSSong Gao     int oprsz = simd_oprsz(desc);
15956256c8caSSong Gao 
15966256c8caSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
15976256c8caSSong Gao 
15986256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
15996256c8caSSong Gao         do_vssrlni_q(Vd, Vj, imm, i, mask);
160083b3815dSSong Gao     }
160183b3815dSSong Gao }
160283b3815dSSong Gao 
/* Emit the VSSRLNI helpers for BIT = 16, 32 and 64 (E2 source, E1 result). */
VSSRLNI(vssrlni_b_h, 16, B, H)
VSSRLNI(vssrlni_h_w, 32, H, W)
VSSRLNI(vssrlni_w_d, 64, W, D)
160683b3815dSSong Gao 
/*
 * VSSRANI: define HELPER(NAME), an immediate-shift helper built on
 * do_ssrans_<E1>() (defined outside this macro).
 *
 * The outer loop runs simd_oprsz(desc) / 16 times, each pass covering
 * LSX_LEN / BIT source elements read through E2.  Results computed from
 * Vj fill the first LSX_LEN / BIT E1 slots of the pass, results computed
 * from the old contents of Vd fill the following LSX_LEN / BIT slots.
 * Every call receives imm and BIT / 2 - 1.  Output is collected in a
 * zero-initialised temporary and copied to Vd once all inputs are read.
 */
#define VSSRANI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)         \
{                                                                          \
    VReg *Vd = (VReg *)vd;                                                 \
    VReg *Vj = (VReg *)vj;                                                 \
    VReg result = {};                                                      \
    int oprsz = simd_oprsz(desc);                                          \
    int per_grp = LSX_LEN / BIT;                                           \
    int grp, n;                                                            \
                                                                           \
    for (grp = 0; grp < oprsz / 16; grp++) {                               \
        int src = per_grp * grp;                                           \
        int dst_j = per_grp * 2 * grp;                                     \
        int dst_d = per_grp * (2 * grp + 1);                               \
                                                                           \
        for (n = 0; n < per_grp; n++) {                                    \
            result.E1(dst_j + n) = do_ssrans_ ## E1(Vj->E2(src + n),       \
                                                    imm, BIT / 2 - 1);     \
            result.E1(dst_d + n) = do_ssrans_ ## E1(Vd->E2(src + n),       \
                                                    imm, BIT / 2 - 1);     \
        }                                                                  \
    }                                                                      \
    *Vd = result;                                                          \
}
162783b3815dSSong Gao 
/*
 * Process one Q element for the vssrani_d_q helper.
 *
 * Q element idx of Vj and of Vd are each shifted right with
 * int128_rshift() by imm (left untouched when imm is 0).  A result above
 * mask is replaced by the low 64 bits of mask, a result below -min by the
 * low 64 bits of min; anything else keeps its own low 64 bits.  The Vj
 * result is stored to D element 2 * idx of Vd and the Vd result to D
 * element 2 * idx + 1.  Both inputs are fetched before Vd is written.
 */
static void do_vssrani_d_q(VReg *Vd, VReg *Vj,
                           uint64_t imm, int idx, Int128 mask, Int128 min)
{
    Int128 lowest = int128_neg(min);
    Int128 from_j = Vj->Q(idx);
    Int128 from_d = Vd->Q(idx);
    uint64_t out_j, out_d;

    if (imm != 0) {
        from_j = int128_rshift(from_j, imm);
        from_d = int128_rshift(from_d, imm);
    }

    if (int128_gt(from_j, mask)) {
        out_j = int128_getlo(mask);
    } else if (int128_lt(from_j, lowest)) {
        out_j = int128_getlo(min);
    } else {
        out_j = int128_getlo(from_j);
    }

    if (int128_gt(from_d, mask)) {
        out_d = int128_getlo(mask);
    } else if (int128_lt(from_d, lowest)) {
        out_d = int128_getlo(min);
    } else {
        out_d = int128_getlo(from_d);
    }

    Vd->D(idx * 2) = out_j;
    Vd->D(idx * 2 + 1) = out_d;
}
165783b3815dSSong Gao 
HELPER(vssrani_d_q)16586256c8caSSong Gao void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
16596256c8caSSong Gao {
16606256c8caSSong Gao     int i;
16616256c8caSSong Gao     Int128 mask, min;
16626256c8caSSong Gao     VReg *Vd = (VReg *)vd;
16636256c8caSSong Gao     VReg *Vj = (VReg *)vj;
16646256c8caSSong Gao     int oprsz = simd_oprsz(desc);
16656256c8caSSong Gao 
16666256c8caSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
16676256c8caSSong Gao     min  = int128_lshift(int128_one(), 63);
16686256c8caSSong Gao 
16696256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
16706256c8caSSong Gao         do_vssrani_d_q(Vd, Vj, imm, i, mask, min);
16716256c8caSSong Gao     }
16726256c8caSSong Gao }
16736256c8caSSong Gao 
16746256c8caSSong Gao 
/* Emit the VSSRANI helpers for BIT = 16, 32 and 64 (E2 source, E1 result). */
VSSRANI(vssrani_b_h, 16, B, H)
VSSRANI(vssrani_h_w, 32, H, W)
VSSRANI(vssrani_w_d, 64, W, D)
167883b3815dSSong Gao 
/*
 * VSSRLNUI: define HELPER(NAME), an immediate-shift helper built on
 * do_ssrlnu_<E1>() (defined outside this macro).
 *
 * The outer loop runs simd_oprsz(desc) / 16 times, each pass covering
 * LSX_LEN / BIT source elements read through E2.  Results computed from
 * Vj fill the first LSX_LEN / BIT E1 slots of the pass, results computed
 * from the old contents of Vd fill the following LSX_LEN / BIT slots.
 * Every call receives imm and BIT / 2.  Output is collected in a
 * zero-initialised temporary and copied to Vd once all inputs are read.
 */
#define VSSRLNUI(NAME, BIT, E1, E2)                                        \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)         \
{                                                                          \
    VReg *Vd = (VReg *)vd;                                                 \
    VReg *Vj = (VReg *)vj;                                                 \
    VReg result = {};                                                      \
    int oprsz = simd_oprsz(desc);                                          \
    int per_grp = LSX_LEN / BIT;                                           \
    int grp, n;                                                            \
                                                                           \
    for (grp = 0; grp < oprsz / 16; grp++) {                               \
        int src = per_grp * grp;                                           \
        int dst_j = per_grp * 2 * grp;                                     \
        int dst_d = per_grp * (2 * grp + 1);                               \
                                                                           \
        for (n = 0; n < per_grp; n++) {                                    \
            result.E1(dst_j + n) = do_ssrlnu_ ## E1(Vj->E2(src + n),       \
                                                    imm, BIT / 2);         \
            result.E1(dst_d + n) = do_ssrlnu_ ## E1(Vd->E2(src + n),       \
                                                    imm, BIT / 2);         \
        }                                                                  \
    }                                                                      \
    *Vd = result;                                                          \
}
169983b3815dSSong Gao 
HELPER(vssrlni_du_q)1700329517d5SSong Gao void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
170183b3815dSSong Gao {
17026256c8caSSong Gao     int i;
17036256c8caSSong Gao     Int128 mask;
1704329517d5SSong Gao     VReg *Vd = (VReg *)vd;
1705329517d5SSong Gao     VReg *Vj = (VReg *)vj;
17066256c8caSSong Gao     int oprsz = simd_oprsz(desc);
170783b3815dSSong Gao 
170883b3815dSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
170983b3815dSSong Gao 
17106256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
17116256c8caSSong Gao         do_vssrlni_q(Vd, Vj, imm, i, mask);
171283b3815dSSong Gao     }
171383b3815dSSong Gao }
171483b3815dSSong Gao 
/* Emit the VSSRLNUI helpers for BIT = 16, 32 and 64 (E2 source, E1 result). */
VSSRLNUI(vssrlni_bu_h, 16, B, H)
VSSRLNUI(vssrlni_hu_w, 32, H, W)
VSSRLNUI(vssrlni_wu_d, 64, W, D)
171883b3815dSSong Gao 
/*
 * VSSRANUI: define HELPER(NAME), an immediate-shift helper built on
 * do_ssranu_<E1>().
 *
 * The outer loop runs simd_oprsz(desc) / 16 times, each pass covering
 * LSX_LEN / BIT source elements read through E2.  Results computed from
 * Vj fill the first LSX_LEN / BIT E1 slots of the pass, results computed
 * from the old contents of Vd fill the following LSX_LEN / BIT slots.
 * Every call receives imm and BIT / 2.  Output is collected in a
 * zero-initialised temporary and copied to Vd once all inputs are read.
 */
#define VSSRANUI(NAME, BIT, E1, E2)                                        \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)         \
{                                                                          \
    VReg *Vd = (VReg *)vd;                                                 \
    VReg *Vj = (VReg *)vj;                                                 \
    VReg result = {};                                                      \
    int oprsz = simd_oprsz(desc);                                          \
    int per_grp = LSX_LEN / BIT;                                           \
    int grp, n;                                                            \
                                                                           \
    for (grp = 0; grp < oprsz / 16; grp++) {                               \
        int src = per_grp * grp;                                           \
        int dst_j = per_grp * 2 * grp;                                     \
        int dst_d = per_grp * (2 * grp + 1);                               \
                                                                           \
        for (n = 0; n < per_grp; n++) {                                    \
            result.E1(dst_j + n) = do_ssranu_ ## E1(Vj->E2(src + n),       \
                                                    imm, BIT / 2);         \
            result.E1(dst_d + n) = do_ssranu_ ## E1(Vd->E2(src + n),       \
                                                    imm, BIT / 2);         \
        }                                                                  \
    }                                                                      \
    *Vd = result;                                                          \
}
173983b3815dSSong Gao 
/*
 * Process one Q element for the vssrani_du_q helper.
 *
 * For Q element idx of Vj and of Vd: a negative input yields zero;
 * otherwise the input is shifted right with int128_rshift() by imm (left
 * untouched when imm is 0).  The result is then capped at mask using an
 * unsigned comparison.  The low 64 bits of the Vj result are stored to D
 * element 2 * idx of Vd, those of the Vd result to D element 2 * idx + 1.
 * Both inputs are fetched before Vd is written.
 */
static void do_vssrani_du_q(VReg *Vd, VReg *Vj,
                            uint64_t imm, int idx, Int128 mask)
{
    Int128 raw_j = Vj->Q(idx);
    Int128 raw_d = Vd->Q(idx);
    Int128 res_j, res_d;

    if (int128_lt(raw_j, int128_zero())) {
        res_j = int128_zero();
    } else if (imm == 0) {
        res_j = raw_j;
    } else {
        res_j = int128_rshift(raw_j, imm);
    }

    if (int128_lt(raw_d, int128_zero())) {
        res_d = int128_zero();
    } else if (imm == 0) {
        res_d = raw_d;
    } else {
        res_d = int128_rshift(raw_d, imm);
    }

    if (int128_ult(mask, res_j)) {
        res_j = mask;
    }
    if (int128_ult(mask, res_d)) {
        res_d = mask;
    }

    Vd->D(idx * 2) = int128_getlo(res_j);
    Vd->D(idx * 2 + 1) = int128_getlo(res_d);
}
17736256c8caSSong Gao 
HELPER(vssrani_du_q)17746256c8caSSong Gao void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
17756256c8caSSong Gao {
17766256c8caSSong Gao     int i;
17776256c8caSSong Gao     Int128 mask;
17786256c8caSSong Gao     VReg *Vd = (VReg *)vd;
17796256c8caSSong Gao     VReg *Vj = (VReg *)vj;
17806256c8caSSong Gao     int oprsz = simd_oprsz(desc);
17816256c8caSSong Gao 
17826256c8caSSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
17836256c8caSSong Gao 
17846256c8caSSong Gao     for (i = 0; i < oprsz / 16; i++) {
17856256c8caSSong Gao         do_vssrani_du_q(Vd, Vj, imm, i, mask);
178683b3815dSSong Gao     }
178783b3815dSSong Gao }
178883b3815dSSong Gao 
/* Emit the VSSRANUI helpers for BIT = 16, 32 and 64 (E2 source, E1 result). */
VSSRANUI(vssrani_bu_h, 16, B, H)
VSSRANUI(vssrani_hu_w, 32, H, W)
VSSRANUI(vssrani_wu_d, 64, W, D)
1792162cd32cSSong Gao 
/*
 * SSRLRNS: generate do_ssrlrns_<E1>(), the per-element step called by the
 * VSSRLRN helper macro.
 *
 *   T1 - return type, also the type of the intermediate result and limit
 *   T2 - type of the input element e2
 *   T3 - accepted but not referenced by the body
 *
 * e2 is passed to do_vsrlr_<E2>() (defined outside this macro) with shift
 * amount sa, and the result is capped at (1 << sh) - 1.
 */
#define SSRLRNS(E1, E2, T1, T2, T3)                \
static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \
{                                                  \
    T1 shifted = do_vsrlr_ ## E2(e2, sa);          \
    const T1 limit = (1ull << sh) - 1;             \
                                                   \
    return shifted > limit ? limit : shifted;      \
}

SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1811162cd32cSSong Gao 
181277fca794SSong Gao #define VSSRLRN(NAME, BIT, E1, E2, E3)                                         \
181304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1814162cd32cSSong Gao {                                                                              \
181577fca794SSong Gao     int i, j, ofs;                                                             \
181604711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
181704711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
181804711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
181977fca794SSong Gao     int oprsz = simd_oprsz(desc);                                              \
1820162cd32cSSong Gao                                                                                \
182177fca794SSong Gao     ofs = LSX_LEN / BIT;                                                       \
182277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
182377fca794SSong Gao         for (j = 0; j < ofs; j++) {                                            \
182477fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i),   \
182577fca794SSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
182677fca794SSong Gao                                                     BIT / 2 - 1);              \
1827162cd32cSSong Gao         }                                                                      \
182877fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
182977fca794SSong Gao     }                                                                          \
1830162cd32cSSong Gao }
1831162cd32cSSong Gao 
183277fca794SSong Gao VSSRLRN(vssrlrn_b_h, 16, B, H, UH)
183377fca794SSong Gao VSSRLRN(vssrlrn_h_w, 32, H, W, UW)
183477fca794SSong Gao VSSRLRN(vssrlrn_w_d, 64, W, D, UD)
1835162cd32cSSong Gao 
/*
 * SSRARNS: generate do_ssrarns_<E1>(), the per-element step called by the
 * VSSRARN helper macro.
 *
 *   T1 - signed type of the input element e2 and of the return value
 *   T2 - signed type holding the upper limit (1 << sh) - 1
 *
 * e2 is passed to do_vsrar_<E2>() (defined outside this macro) with shift
 * amount sa.  The result is clamped to the range [~mask, mask], i.e.
 * [-(1 << sh), (1 << sh) - 1].
 *
 * The lower bound is written as ~mask rather than -(mask + 1): when T2 is
 * int32_t and mask is INT32_MAX, "mask + 1" would be evaluated in int and
 * overflow, whereas ~mask yields the same value without any arithmetic
 * that can overflow.
 */
#define SSRARNS(E1, E2, T1, T2)                    \
static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \
{                                                  \
    T1 shft_res;                                   \
    T2 mask;                                       \
                                                   \
    shft_res = do_vsrar_ ## E2(e2, sa);            \
    mask = (1ll << sh) - 1;                        \
    if (shft_res > mask) {                         \
        return mask;                               \
    } else if (shft_res < ~mask) {                 \
        return ~mask;                              \
    } else {                                       \
        return shft_res;                           \
    }                                              \
}

SSRARNS(B, H, int16_t, int8_t)
SSRARNS(H, W, int32_t, int16_t)
SSRARNS(W, D, int64_t, int32_t)
1856162cd32cSSong Gao 
185777fca794SSong Gao #define VSSRARN(NAME, BIT, E1, E2, E3)                                         \
185804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1859162cd32cSSong Gao {                                                                              \
186077fca794SSong Gao     int i, j, ofs;                                                             \
186104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
186204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
186304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
186477fca794SSong Gao     int oprsz = simd_oprsz(desc);                                              \
1865162cd32cSSong Gao                                                                                \
186677fca794SSong Gao     ofs = LSX_LEN / BIT;                                                       \
186777fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
186877fca794SSong Gao         for (j = 0; j < ofs; j++) {                                            \
186977fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i),   \
187077fca794SSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
187177fca794SSong Gao                                                     BIT/ 2 - 1);               \
1872162cd32cSSong Gao         }                                                                      \
187377fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
187477fca794SSong Gao     }                                                                          \
1875162cd32cSSong Gao }
1876162cd32cSSong Gao 
187777fca794SSong Gao VSSRARN(vssrarn_b_h, 16, B, H, UH)
187877fca794SSong Gao VSSRARN(vssrarn_h_w, 32, H, W, UW)
187977fca794SSong Gao VSSRARN(vssrarn_w_d, 64, W, D, UD)
1880162cd32cSSong Gao 
/*
 * SSRLRNU: generate do_ssrlrnu_<E1>(), the per-element step called by the
 * VSSRLRNU helper macro.
 *
 *   T1 - return type, also the type of the intermediate result
 *   T2 - type holding the saturation limit (1 << sh) - 1
 *   T3 - type of the input element e2
 *
 * e2 is passed to do_vsrlr_<E2>() (defined outside this macro) with shift
 * amount sa, and the result is capped at the limit.
 */
#define SSRLRNU(E1, E2, T1, T2, T3)                \
static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \
{                                                  \
    T1 shifted = do_vsrlr_ ## E2(e2, sa);          \
    const T2 limit = (1ull << sh) - 1;             \
                                                   \
    return shifted > limit ? limit : shifted;      \
}

SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1900162cd32cSSong Gao 
190177fca794SSong Gao #define VSSRLRNU(NAME, BIT, E1, E2, E3)                                        \
190204711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)                 \
1903162cd32cSSong Gao {                                                                              \
190477fca794SSong Gao     int i, j, ofs;                                                             \
190504711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                     \
190604711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                     \
190704711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                     \
190877fca794SSong Gao     int oprsz = simd_oprsz(desc);                                              \
1909162cd32cSSong Gao                                                                                \
191077fca794SSong Gao     ofs = LSX_LEN / BIT;                                                       \
191177fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                         \
191277fca794SSong Gao         for (j = 0; j < ofs; j++) {                                            \
191377fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i),   \
191477fca794SSong Gao                                                     Vk->E3(j + ofs * i) % BIT, \
191577fca794SSong Gao                                                     BIT / 2);                  \
1916162cd32cSSong Gao         }                                                                      \
191777fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                  \
191877fca794SSong Gao     }                                                                          \
1919162cd32cSSong Gao }
1920162cd32cSSong Gao 
192177fca794SSong Gao VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH)
192277fca794SSong Gao VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW)
192377fca794SSong Gao VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD)
1924162cd32cSSong Gao 
/*
 * SSRARNU: generate do_ssrarnu_<E1>(), the per-element step called by the
 * VSSRARNU helper macro.
 *
 *   T1 - return type, also the type of the intermediate result
 *   T2 - type holding the saturation limit (1 << sh) - 1
 *   T3 - signed type of the input element e2
 *
 * A negative e2 yields 0 without calling do_vsrar_<E2>().  Otherwise e2 is
 * passed to do_vsrar_<E2>() (defined outside this macro) with shift amount
 * sa, and the result is capped at the limit.
 */
#define SSRARNU(E1, E2, T1, T2, T3)                \
static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \
{                                                  \
    const T2 limit = (1ull << sh) - 1;             \
    T1 val = 0;                                    \
                                                   \
    if (e2 >= 0) {                                 \
        val = do_vsrar_ ## E2(e2, sa);             \
    }                                              \
    return val > limit ? limit : val;              \
}

SSRARNU(B, H, uint16_t, uint8_t, int16_t)
SSRARNU(H, W, uint32_t, uint16_t, int32_t)
SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1947162cd32cSSong Gao 
194877fca794SSong Gao #define VSSRARNU(NAME, BIT, E1, E2, E3)                                      \
194904711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void  *vk, uint32_t desc)              \
1950162cd32cSSong Gao {                                                                            \
195177fca794SSong Gao     int i, j, ofs;                                                           \
195204711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                   \
195304711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                   \
195404711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                   \
195577fca794SSong Gao     int oprsz = simd_oprsz(desc);                                            \
1956162cd32cSSong Gao                                                                              \
195777fca794SSong Gao     ofs = LSX_LEN / BIT;                                                     \
195877fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                       \
195977fca794SSong Gao         for (j = 0; j < ofs; j++) {                                          \
196077fca794SSong Gao             Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \
196177fca794SSong Gao                                                 Vk->E3(j + ofs * i) % BIT,   \
196277fca794SSong Gao                                                 BIT / 2);                    \
1963162cd32cSSong Gao         }                                                                    \
196477fca794SSong Gao         Vd->D(2 * i + 1) = 0;                                                \
196577fca794SSong Gao     }                                                                        \
1966162cd32cSSong Gao }
1967162cd32cSSong Gao 
196877fca794SSong Gao VSSRARNU(vssrarn_bu_h, 16, B, H, UH)
196977fca794SSong Gao VSSRARNU(vssrarn_hu_w, 32, H, W, UW)
197077fca794SSong Gao VSSRARNU(vssrarn_wu_d, 64, W, D, UD)
1971162cd32cSSong Gao 
/*
 * Generate the helper for an immediate-count narrowing shift built on
 * do_ssrlrns_<E1>() with a saturation width of BIT / 2 - 1.
 *
 * Per 16-byte lane, the E2 elements of Vj are narrowed into the first
 * LSX_LEN / BIT E1 slots of the lane and the E2 elements of the old Vd into
 * the following LSX_LEN / BIT slots.  Results are collected in a
 * zero-initialised scratch register that is copied over Vd at the end, so
 * Vd can be read as a source for the whole loop.
 */
#define VSSRLRNI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg packed = {};                                                 \
    const int lanes = simd_oprsz(desc) / 16;                          \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < lanes; lane++) {                            \
        const int src = per_lane * lane;                              \
        const int lo = per_lane * 2 * lane;                           \
        const int hi = lo + per_lane;                                 \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            packed.E1(lo + n) = do_ssrlrns_ ## E1(Vj->E2(src + n),    \
                                                  imm, BIT / 2 - 1);  \
            packed.E1(hi + n) = do_ssrlrns_ ## E1(Vd->E2(src + n),    \
                                                  imm, BIT / 2 - 1);  \
        }                                                             \
    }                                                                 \
    *Vd = packed;                                                     \
}
1992162cd32cSSong Gao 
199377fca794SSong Gao static void do_vssrlrni_q(VReg *Vd, VReg * Vj,
199477fca794SSong Gao                           uint64_t imm, int idx, Int128 mask)
199577fca794SSong Gao {
199677fca794SSong Gao     Int128 shft_res1, shft_res2, r1, r2;
199777fca794SSong Gao     if (imm == 0) {
199877fca794SSong Gao         shft_res1 = Vj->Q(idx);
199977fca794SSong Gao         shft_res2 = Vd->Q(idx);
200077fca794SSong Gao     } else {
200177fca794SSong Gao         r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one());
200277fca794SSong Gao         r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one());
200377fca794SSong Gao         shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1));
200477fca794SSong Gao         shft_res2 = (int128_add(int128_urshift(Vd->Q(idx), imm), r2));
200577fca794SSong Gao     }
200677fca794SSong Gao 
200777fca794SSong Gao     if (int128_ult(mask, shft_res1)) {
200877fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(mask);
200977fca794SSong Gao     }else {
201077fca794SSong Gao         Vd->D(idx * 2) = int128_getlo(shft_res1);
201177fca794SSong Gao     }
201277fca794SSong Gao 
201377fca794SSong Gao     if (int128_ult(mask, shft_res2)) {
201477fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(mask);
201577fca794SSong Gao     }else {
201677fca794SSong Gao         Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
201777fca794SSong Gao     }
201877fca794SSong Gao }
201977fca794SSong Gao 
HELPER(vssrlrni_d_q)202077fca794SSong Gao void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
202177fca794SSong Gao {
202277fca794SSong Gao     int i;
202377fca794SSong Gao     Int128 mask;
202477fca794SSong Gao     VReg *Vd = (VReg *)vd;
202577fca794SSong Gao     VReg *Vj = (VReg *)vj;
202677fca794SSong Gao     int oprsz = simd_oprsz(desc);
202777fca794SSong Gao 
202877fca794SSong Gao     mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
202977fca794SSong Gao 
203077fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
203177fca794SSong Gao         do_vssrlrni_q(Vd, Vj, imm, i, mask);
203277fca794SSong Gao     }
2033162cd32cSSong Gao }
2034162cd32cSSong Gao 
2035162cd32cSSong Gao VSSRLRNI(vssrlrni_b_h, 16, B, H)
2036162cd32cSSong Gao VSSRLRNI(vssrlrni_h_w, 32, H, W)
2037162cd32cSSong Gao VSSRLRNI(vssrlrni_w_d, 64, W, D)
2038162cd32cSSong Gao 
/*
 * Generate the helper for an immediate-count narrowing shift built on
 * do_ssrarns_<E1>() with a saturation width of BIT / 2 - 1.
 *
 * Per 16-byte lane, the E2 elements of Vj are narrowed into the first
 * LSX_LEN / BIT E1 slots of the lane and the E2 elements of the old Vd into
 * the following LSX_LEN / BIT slots.  Results are collected in a
 * zero-initialised scratch register that is copied over Vd at the end, so
 * Vd can be read as a source for the whole loop.
 */
#define VSSRARNI(NAME, BIT, E1, E2)                                   \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg packed = {};                                                 \
    const int lanes = simd_oprsz(desc) / 16;                          \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < lanes; lane++) {                            \
        const int src = per_lane * lane;                              \
        const int lo = per_lane * 2 * lane;                           \
        const int hi = lo + per_lane;                                 \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            packed.E1(lo + n) = do_ssrarns_ ## E1(Vj->E2(src + n),    \
                                                  imm, BIT / 2 - 1);  \
            packed.E1(hi + n) = do_ssrarns_ ## E1(Vd->E2(src + n),    \
                                                  imm, BIT / 2 - 1);  \
        }                                                             \
    }                                                                 \
    *Vd = packed;                                                     \
}
2059162cd32cSSong Gao 
/*
 * Narrow one 128-bit lane of Vj and of Vd to 64 bits each, using an
 * arithmetic right shift by imm with rounding and signed saturation.
 *
 * Rounding adds bit (imm - 1) of the source, i.e. the last bit shifted out;
 * with imm == 0 the source is used unchanged.  A result greater than mask1
 * stores the low 64 bits of mask1, a result less than -mask2 stores the low
 * 64 bits of mask2, anything else stores its own low 64 bits.
 *
 * The value derived from Vj->Q(idx) is stored in Vd->D(idx * 2) and the one
 * derived from Vd->Q(idx) in Vd->D(idx * 2 + 1).  Both sources are read
 * before either store.
 */
static void do_vssrarni_d_q(VReg *Vd, VReg *Vj,
                            uint64_t imm, int idx, Int128 mask1, Int128 mask2)
{
    const Int128 lowest = int128_neg(mask2);
    Int128 res_j = Vj->Q(idx);
    Int128 res_d = Vd->Q(idx);
    uint64_t out_j, out_d;

    if (imm != 0) {
        Int128 round_j = int128_and(int128_rshift(res_j, imm - 1),
                                    int128_one());
        Int128 round_d = int128_and(int128_rshift(res_d, imm - 1),
                                    int128_one());

        res_j = int128_add(int128_rshift(res_j, imm), round_j);
        res_d = int128_add(int128_rshift(res_d, imm), round_d);
    }

    if (int128_gt(res_j, mask1)) {
        out_j = int128_getlo(mask1);
    } else if (int128_lt(res_j, lowest)) {
        out_j = int128_getlo(mask2);
    } else {
        out_j = int128_getlo(res_j);
    }

    if (int128_gt(res_d, mask1)) {
        out_d = int128_getlo(mask1);
    } else if (int128_lt(res_d, lowest)) {
        out_d = int128_getlo(mask2);
    } else {
        out_d = int128_getlo(res_d);
    }

    Vd->D(idx * 2) = out_j;
    Vd->D(idx * 2 + 1) = out_d;
}
209077fca794SSong Gao 
HELPER(vssrarni_d_q)209177fca794SSong Gao void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
209277fca794SSong Gao {
209377fca794SSong Gao     int i;
209477fca794SSong Gao     Int128 mask1, mask2;
209577fca794SSong Gao     VReg *Vd = (VReg *)vd;
209677fca794SSong Gao     VReg *Vj = (VReg *)vj;
209777fca794SSong Gao     int oprsz = simd_oprsz(desc);
2098162cd32cSSong Gao 
2099162cd32cSSong Gao     mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one());
2100162cd32cSSong Gao     mask2  = int128_lshift(int128_one(), 63);
2101162cd32cSSong Gao 
210277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
210377fca794SSong Gao         do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2);
2104162cd32cSSong Gao     }
2105162cd32cSSong Gao }
2106162cd32cSSong Gao 
2107162cd32cSSong Gao VSSRARNI(vssrarni_b_h, 16, B, H)
2108162cd32cSSong Gao VSSRARNI(vssrarni_h_w, 32, H, W)
2109162cd32cSSong Gao VSSRARNI(vssrarni_w_d, 64, W, D)
2110162cd32cSSong Gao 
/*
 * Generate the helper for an immediate-count narrowing shift built on
 * do_ssrlrnu_<E1>() with a saturation width of BIT / 2.
 *
 * Per 16-byte lane, the E2 elements of Vj are narrowed into the first
 * LSX_LEN / BIT E1 slots of the lane and the E2 elements of the old Vd into
 * the following LSX_LEN / BIT slots.  Results are collected in a
 * zero-initialised scratch register that is copied over Vd at the end, so
 * Vd can be read as a source for the whole loop.
 */
#define VSSRLRNUI(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg packed = {};                                                 \
    const int lanes = simd_oprsz(desc) / 16;                          \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < lanes; lane++) {                            \
        const int src = per_lane * lane;                              \
        const int lo = per_lane * 2 * lane;                           \
        const int hi = lo + per_lane;                                 \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            packed.E1(lo + n) = do_ssrlrnu_ ## E1(Vj->E2(src + n),    \
                                                  imm, BIT / 2);      \
            packed.E1(hi + n) = do_ssrlrnu_ ## E1(Vd->E2(src + n),    \
                                                  imm, BIT / 2);      \
        }                                                             \
    }                                                                 \
    *Vd = packed;                                                     \
}
2131162cd32cSSong Gao 
HELPER(vssrlrni_du_q)213277fca794SSong Gao void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
213377fca794SSong Gao {
213477fca794SSong Gao     int i;
213577fca794SSong Gao     Int128 mask;
213677fca794SSong Gao     VReg *Vd = (VReg *)vd;
213777fca794SSong Gao     VReg *Vj = (VReg *)vj;
213877fca794SSong Gao     int oprsz = simd_oprsz(desc);
213977fca794SSong Gao 
214077fca794SSong Gao     mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
214177fca794SSong Gao 
214277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
214377fca794SSong Gao         do_vssrlrni_q(Vd, Vj, imm, i, mask);
214477fca794SSong Gao     }
214577fca794SSong Gao }
214677fca794SSong Gao 
2147162cd32cSSong Gao VSSRLRNUI(vssrlrni_bu_h, 16, B, H)
2148162cd32cSSong Gao VSSRLRNUI(vssrlrni_hu_w, 32, H, W)
2149162cd32cSSong Gao VSSRLRNUI(vssrlrni_wu_d, 64, W, D)
2150162cd32cSSong Gao 
/*
 * Generate the helper for an immediate-count narrowing shift built on
 * do_ssrarnu_<E1>() with a saturation width of BIT / 2.
 *
 * Per 16-byte lane, the E2 elements of Vj are narrowed into the first
 * LSX_LEN / BIT E1 slots of the lane and the E2 elements of the old Vd into
 * the following LSX_LEN / BIT slots.  Results are collected in a
 * zero-initialised scratch register that is copied over Vd at the end, so
 * Vd can be read as a source for the whole loop.
 */
#define VSSRARNUI(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)    \
{                                                                     \
    VReg *Vd = (VReg *)vd;                                            \
    VReg *Vj = (VReg *)vj;                                            \
    VReg packed = {};                                                 \
    const int lanes = simd_oprsz(desc) / 16;                          \
    const int per_lane = LSX_LEN / BIT;                               \
    int lane, n;                                                      \
                                                                      \
    for (lane = 0; lane < lanes; lane++) {                            \
        const int src = per_lane * lane;                              \
        const int lo = per_lane * 2 * lane;                           \
        const int hi = lo + per_lane;                                 \
                                                                      \
        for (n = 0; n < per_lane; n++) {                              \
            packed.E1(lo + n) = do_ssrarnu_ ## E1(Vj->E2(src + n),    \
                                                  imm, BIT / 2);      \
            packed.E1(hi + n) = do_ssrarnu_ ## E1(Vd->E2(src + n),    \
                                                  imm, BIT / 2);      \
        }                                                             \
    }                                                                 \
    *Vd = packed;                                                     \
}
2171162cd32cSSong Gao 
/*
 * Narrow one 128-bit lane of Vj and of Vd to 64 bits each: arithmetic right
 * shift by imm with rounding, then saturation to an unsigned range.
 *
 * Rounding adds bit (imm - 1) of the source, i.e. the last bit shifted out;
 * with imm == 0 the source is used unchanged.  A negative source yields 0,
 * and a result greater than mask1 yields the low 64 bits of mask1.
 *
 * The value derived from Vj->Q(idx) is stored in Vd->D(idx * 2) and the one
 * derived from Vd->Q(idx) in Vd->D(idx * 2 + 1).  Both sources are read
 * before either store.
 *
 * mask2 is kept only so the signature matches the existing caller.  The
 * former "result < -mask2" branches could never be taken: negative sources
 * are forced to 0 first, and a non-negative source shifted right plus a
 * 0/1 rounding bit is never negative.
 */
static void do_vssrarni_du_q(VReg *Vd, VReg *Vj,
                             uint64_t imm, int idx, Int128 mask1, Int128 mask2)
{
    const Int128 src_j = Vj->Q(idx);
    const Int128 src_d = Vd->Q(idx);
    Int128 res_j = src_j;
    Int128 res_d = src_d;

    if (imm != 0) {
        Int128 round_j = int128_and(int128_rshift(src_j, imm - 1),
                                    int128_one());
        Int128 round_d = int128_and(int128_rshift(src_d, imm - 1),
                                    int128_one());

        res_j = int128_add(int128_rshift(src_j, imm), round_j);
        res_d = int128_add(int128_rshift(src_d, imm), round_d);
    }

    /* Unsigned destination: negative inputs clamp to zero. */
    if (int128_lt(src_j, int128_zero())) {
        res_j = int128_zero();
    }
    if (int128_lt(src_d, int128_zero())) {
        res_d = int128_zero();
    }

    /* Results are non-negative here, so only the upper bound can trip. */
    if (int128_gt(res_j, mask1)) {
        res_j = mask1;
    }
    if (int128_gt(res_d, mask1)) {
        res_d = mask1;
    }

    Vd->D(idx * 2) = int128_getlo(res_j);
    Vd->D(idx * 2 + 1) = int128_getlo(res_d);
}
221077fca794SSong Gao 
HELPER(vssrarni_du_q)221177fca794SSong Gao void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
221277fca794SSong Gao {
221377fca794SSong Gao     int i;
221477fca794SSong Gao     Int128 mask1, mask2;
221577fca794SSong Gao     VReg *Vd = (VReg *)vd;
221677fca794SSong Gao     VReg *Vj = (VReg *)vj;
221777fca794SSong Gao     int oprsz = simd_oprsz(desc);
221877fca794SSong Gao 
2219162cd32cSSong Gao     mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one());
2220162cd32cSSong Gao     mask2  = int128_lshift(int128_one(), 64);
2221162cd32cSSong Gao 
222277fca794SSong Gao     for (i = 0; i < oprsz / 16; i++) {
222377fca794SSong Gao         do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2);
2224162cd32cSSong Gao     }
2225162cd32cSSong Gao }
2226162cd32cSSong Gao 
2227162cd32cSSong Gao VSSRARNUI(vssrarni_bu_h, 16, B, H)
2228162cd32cSSong Gao VSSRARNUI(vssrarni_hu_w, 32, H, W)
2229162cd32cSSong Gao VSSRARNUI(vssrarni_wu_d, 64, W, D)
22302e105e12SSong Gao 
22312e105e12SSong Gao #define DO_2OP(NAME, BIT, E, DO_OP)                  \
2232ff27e335SSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
22332e105e12SSong Gao {                                                    \
22342e105e12SSong Gao     int i;                                           \
2235ff27e335SSong Gao     VReg *Vd = (VReg *)vd;                           \
2236ff27e335SSong Gao     VReg *Vj = (VReg *)vj;                           \
223712ad133fSSong Gao     int oprsz = simd_oprsz(desc);                    \
22382e105e12SSong Gao                                                      \
223912ad133fSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++)          \
22402e105e12SSong Gao     {                                                \
22412e105e12SSong Gao         Vd->E(i) = DO_OP(Vj->E(i));                  \
22422e105e12SSong Gao     }                                                \
22432e105e12SSong Gao }
22442e105e12SSong Gao 
22452e105e12SSong Gao DO_2OP(vclo_b, 8, UB, DO_CLO_B)
22462e105e12SSong Gao DO_2OP(vclo_h, 16, UH, DO_CLO_H)
22472e105e12SSong Gao DO_2OP(vclo_w, 32, UW, DO_CLO_W)
22482e105e12SSong Gao DO_2OP(vclo_d, 64, UD, DO_CLO_D)
22492e105e12SSong Gao DO_2OP(vclz_b, 8, UB, DO_CLZ_B)
22502e105e12SSong Gao DO_2OP(vclz_h, 16, UH, DO_CLZ_H)
22512e105e12SSong Gao DO_2OP(vclz_w, 32, UW, DO_CLZ_W)
22522e105e12SSong Gao DO_2OP(vclz_d, 64, UD, DO_CLZ_D)
2253bb22ee57SSong Gao 
2254bb22ee57SSong Gao #define VPCNT(NAME, BIT, E, FN)                      \
2255ff27e335SSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
2256bb22ee57SSong Gao {                                                    \
2257bb22ee57SSong Gao     int i;                                           \
2258ff27e335SSong Gao     VReg *Vd = (VReg *)vd;                           \
2259ff27e335SSong Gao     VReg *Vj = (VReg *)vj;                           \
2260956dec74SSong Gao     int oprsz = simd_oprsz(desc);                    \
2261bb22ee57SSong Gao                                                      \
2262956dec74SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++)          \
2263bb22ee57SSong Gao     {                                                \
2264bb22ee57SSong Gao         Vd->E(i) = FN(Vj->E(i));                     \
2265bb22ee57SSong Gao     }                                                \
2266bb22ee57SSong Gao }
2267bb22ee57SSong Gao 
2268bb22ee57SSong Gao VPCNT(vpcnt_b, 8, UB, ctpop8)
2269bb22ee57SSong Gao VPCNT(vpcnt_h, 16, UH, ctpop16)
2270bb22ee57SSong Gao VPCNT(vpcnt_w, 32, UW, ctpop32)
2271bb22ee57SSong Gao VPCNT(vpcnt_d, 64, UD, ctpop64)
22720b1e6705SSong Gao 
22730b1e6705SSong Gao #define DO_BIT(NAME, BIT, E, DO_OP)                            \
22741b3e242fSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
22750b1e6705SSong Gao {                                                              \
22760b1e6705SSong Gao     int i;                                                     \
22770b1e6705SSong Gao     VReg *Vd = (VReg *)vd;                                     \
22780b1e6705SSong Gao     VReg *Vj = (VReg *)vj;                                     \
22790b1e6705SSong Gao     VReg *Vk = (VReg *)vk;                                     \
22801b3e242fSSong Gao     int oprsz = simd_oprsz(desc);                              \
22810b1e6705SSong Gao                                                                \
22821b3e242fSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
22830b1e6705SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT);              \
22840b1e6705SSong Gao     }                                                          \
22850b1e6705SSong Gao }
22860b1e6705SSong Gao 
22870b1e6705SSong Gao DO_BIT(vbitclr_b, 8, UB, DO_BITCLR)
22880b1e6705SSong Gao DO_BIT(vbitclr_h, 16, UH, DO_BITCLR)
22890b1e6705SSong Gao DO_BIT(vbitclr_w, 32, UW, DO_BITCLR)
22900b1e6705SSong Gao DO_BIT(vbitclr_d, 64, UD, DO_BITCLR)
22910b1e6705SSong Gao DO_BIT(vbitset_b, 8, UB, DO_BITSET)
22920b1e6705SSong Gao DO_BIT(vbitset_h, 16, UH, DO_BITSET)
22930b1e6705SSong Gao DO_BIT(vbitset_w, 32, UW, DO_BITSET)
22940b1e6705SSong Gao DO_BIT(vbitset_d, 64, UD, DO_BITSET)
22950b1e6705SSong Gao DO_BIT(vbitrev_b, 8, UB, DO_BITREV)
22960b1e6705SSong Gao DO_BIT(vbitrev_h, 16, UH, DO_BITREV)
22970b1e6705SSong Gao DO_BIT(vbitrev_w, 32, UW, DO_BITREV)
22980b1e6705SSong Gao DO_BIT(vbitrev_d, 64, UD, DO_BITREV)
22990b1e6705SSong Gao 
23000b1e6705SSong Gao #define DO_BITI(NAME, BIT, E, DO_OP)                               \
23011b3e242fSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
23020b1e6705SSong Gao {                                                                  \
23030b1e6705SSong Gao     int i;                                                         \
23040b1e6705SSong Gao     VReg *Vd = (VReg *)vd;                                         \
23050b1e6705SSong Gao     VReg *Vj = (VReg *)vj;                                         \
23061b3e242fSSong Gao     int oprsz = simd_oprsz(desc);                                  \
23070b1e6705SSong Gao                                                                    \
23081b3e242fSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
23090b1e6705SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), imm);                           \
23100b1e6705SSong Gao     }                                                              \
23110b1e6705SSong Gao }
23120b1e6705SSong Gao 
23130b1e6705SSong Gao DO_BITI(vbitclri_b, 8, UB, DO_BITCLR)
23140b1e6705SSong Gao DO_BITI(vbitclri_h, 16, UH, DO_BITCLR)
23150b1e6705SSong Gao DO_BITI(vbitclri_w, 32, UW, DO_BITCLR)
23160b1e6705SSong Gao DO_BITI(vbitclri_d, 64, UD, DO_BITCLR)
23170b1e6705SSong Gao DO_BITI(vbitseti_b, 8, UB, DO_BITSET)
23180b1e6705SSong Gao DO_BITI(vbitseti_h, 16, UH, DO_BITSET)
23190b1e6705SSong Gao DO_BITI(vbitseti_w, 32, UW, DO_BITSET)
23200b1e6705SSong Gao DO_BITI(vbitseti_d, 64, UD, DO_BITSET)
23210b1e6705SSong Gao DO_BITI(vbitrevi_b, 8, UB, DO_BITREV)
23220b1e6705SSong Gao DO_BITI(vbitrevi_h, 16, UH, DO_BITREV)
23230b1e6705SSong Gao DO_BITI(vbitrevi_w, 32, UW, DO_BITREV)
23240b1e6705SSong Gao DO_BITI(vbitrevi_d, 64, UD, DO_BITREV)
2325ac95a0b9SSong Gao 
2326ac95a0b9SSong Gao #define VFRSTP(NAME, BIT, MASK, E)                             \
232704711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2328ac95a0b9SSong Gao {                                                              \
2329abee168eSSong Gao     int i, j, m, ofs;                                          \
233004711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
233104711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
233204711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
2333abee168eSSong Gao     int oprsz = simd_oprsz(desc);                              \
2334ac95a0b9SSong Gao                                                                \
2335abee168eSSong Gao     ofs = LSX_LEN / BIT;                                       \
2336abee168eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                         \
2337abee168eSSong Gao         m = Vk->E(i * ofs) & MASK;                             \
2338abee168eSSong Gao         for (j = 0; j < ofs; j++) {                            \
2339abee168eSSong Gao             if (Vj->E(j + ofs * i) < 0) {                      \
2340ac95a0b9SSong Gao                 break;                                         \
2341ac95a0b9SSong Gao             }                                                  \
2342ac95a0b9SSong Gao         }                                                      \
2343abee168eSSong Gao         Vd->E(m + i * ofs) = j;                                \
2344abee168eSSong Gao     }                                                          \
2345ac95a0b9SSong Gao }
2346ac95a0b9SSong Gao 
2347ac95a0b9SSong Gao VFRSTP(vfrstp_b, 8, 0xf, B)
2348ac95a0b9SSong Gao VFRSTP(vfrstp_h, 16, 0x7, H)
2349ac95a0b9SSong Gao 
2350ac95a0b9SSong Gao #define VFRSTPI(NAME, BIT, E)                                      \
2351329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2352ac95a0b9SSong Gao {                                                                  \
2353abee168eSSong Gao     int i, j, m, ofs;                                              \
2354329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
2355329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
2356abee168eSSong Gao     int oprsz = simd_oprsz(desc);                                  \
2357ac95a0b9SSong Gao                                                                    \
2358abee168eSSong Gao     ofs = LSX_LEN / BIT;                                           \
2359abee168eSSong Gao     m = imm % ofs;                                                 \
2360abee168eSSong Gao     for (i = 0; i < oprsz / 16; i++) {                             \
2361abee168eSSong Gao         for (j = 0; j < ofs; j++) {                                \
2362abee168eSSong Gao             if (Vj->E(j + ofs * i) < 0) {                          \
2363ac95a0b9SSong Gao                 break;                                             \
2364ac95a0b9SSong Gao             }                                                      \
2365ac95a0b9SSong Gao         }                                                          \
2366abee168eSSong Gao         Vd->E(m + i * ofs) = j;                                    \
2367abee168eSSong Gao     }                                                              \
2368ac95a0b9SSong Gao }
2369ac95a0b9SSong Gao 
2370ac95a0b9SSong Gao VFRSTPI(vfrstpi_b, 8,  B)
2371ac95a0b9SSong Gao VFRSTPI(vfrstpi_h, 16, H)
2372aca67472SSong Gao 
/*
 * Fold the softfloat exception flags accumulated in env->fp_status into
 * env->fcsr0, ignoring the flags listed in mask.
 *
 * The softfloat flags are always reset to 0.  Any remaining flags are
 * converted with ieee_ex_to_loongarch() and recorded via UPDATE_FP_CAUSE().
 * If one of them is enabled according to GET_FP_ENABLES(), EXCCODE_FPE is
 * raised with pc as the return address; otherwise the flags are merged in
 * via UPDATE_FP_FLAGS().
 */
static void vec_update_fcsr0_mask(CPULoongArchState *env,
                                  uintptr_t pc, int mask)
{
    int raised = get_float_exception_flags(&env->fp_status) & ~mask;

    set_float_exception_flags(0, &env->fp_status);

    if (raised != 0) {
        raised = ieee_ex_to_loongarch(raised);
        UPDATE_FP_CAUSE(env->fcsr0, raised);
    }

    if ((GET_FP_ENABLES(env->fcsr0) & raised) == 0) {
        UPDATE_FP_FLAGS(env->fcsr0, raised);
    } else {
        do_raise_exception(env, EXCCODE_FPE, pc);
    }
}
2393aca67472SSong Gao 
/* Same as vec_update_fcsr0_mask() with an empty mask: no flag is ignored. */
static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc)
{
    const int ignore_none = 0;

    vec_update_fcsr0_mask(env, pc, ignore_none);
}
2398aca67472SSong Gao 
/*
 * Set the FP cause field of env->fcsr0 to 0 via SET_FP_CAUSE().  The
 * floating-point helpers call this once before their element loop.
 */
static inline void vec_clear_cause(CPULoongArchState *env)
{
    SET_FP_CAUSE(env->fcsr0, 0);
}
2403aca67472SSong Gao 
2404aca67472SSong Gao #define DO_3OP_F(NAME, BIT, E, FN)                          \
24053b286753SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk,             \
24063b286753SSong Gao                   CPULoongArchState *env, uint32_t desc)    \
2407aca67472SSong Gao {                                                           \
2408aca67472SSong Gao     int i;                                                  \
24093b286753SSong Gao     VReg *Vd = (VReg *)vd;                                  \
24103b286753SSong Gao     VReg *Vj = (VReg *)vj;                                  \
24113b286753SSong Gao     VReg *Vk = (VReg *)vk;                                  \
2412c9caf158SSong Gao     int oprsz = simd_oprsz(desc);                           \
2413aca67472SSong Gao                                                             \
2414aca67472SSong Gao     vec_clear_cause(env);                                   \
2415c9caf158SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {               \
2416aca67472SSong Gao         Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
2417aca67472SSong Gao         vec_update_fcsr0(env, GETPC());                     \
2418aca67472SSong Gao     }                                                       \
2419aca67472SSong Gao }
2420aca67472SSong Gao 
2421aca67472SSong Gao DO_3OP_F(vfadd_s, 32, UW, float32_add)
2422aca67472SSong Gao DO_3OP_F(vfadd_d, 64, UD, float64_add)
2423aca67472SSong Gao DO_3OP_F(vfsub_s, 32, UW, float32_sub)
2424aca67472SSong Gao DO_3OP_F(vfsub_d, 64, UD, float64_sub)
2425aca67472SSong Gao DO_3OP_F(vfmul_s, 32, UW, float32_mul)
2426aca67472SSong Gao DO_3OP_F(vfmul_d, 64, UD, float64_mul)
2427aca67472SSong Gao DO_3OP_F(vfdiv_s, 32, UW, float32_div)
2428aca67472SSong Gao DO_3OP_F(vfdiv_d, 64, UD, float64_div)
2429aca67472SSong Gao DO_3OP_F(vfmax_s, 32, UW, float32_maxnum)
2430aca67472SSong Gao DO_3OP_F(vfmax_d, 64, UD, float64_maxnum)
2431aca67472SSong Gao DO_3OP_F(vfmin_s, 32, UW, float32_minnum)
2432aca67472SSong Gao DO_3OP_F(vfmin_d, 64, UD, float64_minnum)
2433aca67472SSong Gao DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag)
2434aca67472SSong Gao DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag)
2435aca67472SSong Gao DO_3OP_F(vfmina_s, 32, UW, float32_minnummag)
2436aca67472SSong Gao DO_3OP_F(vfmina_d, 64, UD, float64_minnummag)
2437aca67472SSong Gao 
2438aca67472SSong Gao #define DO_4OP_F(NAME, BIT, E, FN, flags)                                    \
2439e2600dadSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, void *va,                    \
2440e2600dadSSong Gao                   CPULoongArchState *env, uint32_t desc)                     \
2441aca67472SSong Gao {                                                                            \
2442aca67472SSong Gao     int i;                                                                   \
2443e2600dadSSong Gao     VReg *Vd = (VReg *)vd;                                                   \
2444e2600dadSSong Gao     VReg *Vj = (VReg *)vj;                                                   \
2445e2600dadSSong Gao     VReg *Vk = (VReg *)vk;                                                   \
2446e2600dadSSong Gao     VReg *Va = (VReg *)va;                                                   \
2447c9caf158SSong Gao     int oprsz = simd_oprsz(desc);                                            \
2448aca67472SSong Gao                                                                              \
2449aca67472SSong Gao     vec_clear_cause(env);                                                    \
2450c9caf158SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                                \
2451aca67472SSong Gao         Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \
2452aca67472SSong Gao         vec_update_fcsr0(env, GETPC());                                      \
2453aca67472SSong Gao     }                                                                        \
2454aca67472SSong Gao }
2455aca67472SSong Gao 
2456aca67472SSong Gao DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0)
2457aca67472SSong Gao DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0)
2458aca67472SSong Gao DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c)
2459aca67472SSong Gao DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c)
2460aca67472SSong Gao DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result)
2461aca67472SSong Gao DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result)
2462aca67472SSong Gao DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd,
2463aca67472SSong Gao          float_muladd_negate_c | float_muladd_negate_result)
2464aca67472SSong Gao DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd,
2465aca67472SSong Gao          float_muladd_negate_c | float_muladd_negate_result)
2466aca67472SSong Gao 
/*
 * DO_2OP_F - define an element-wise, single-source floating-point helper.
 *
 *   NAME  helper name handed to HELPER()
 *   BIT   element width in bits; oprsz / (BIT / 8) elements are processed
 *   E     VReg element accessor used for the destination and the source
 *   FN    called as FN(env, vj_elem) for each element
 *
 * vec_clear_cause() runs once before the loop.  This macro does not call
 * vec_update_fcsr0() itself; the FN callbacks defined in this file do so.
 */
#define DO_2OP_F(NAME, BIT, E, FN)                          \
void HELPER(NAME)(void *vd, void *vj,                       \
                  CPULoongArchState *env, uint32_t desc)    \
{                                                           \
    VReg *dst = (VReg *)vd;                                 \
    VReg *src = (VReg *)vj;                                 \
    const int nelem = simd_oprsz(desc) / (BIT / 8);         \
    int n;                                                  \
                                                            \
    vec_clear_cause(env);                                   \
    for (n = 0; n < nelem; n++) {                           \
        dst->E(n) = FN(env, src->E(n));                     \
    }                                                       \
}
2481aca67472SSong Gao 
2482aca67472SSong Gao #define FLOGB(BIT, T)                                            \
2483aca67472SSong Gao static T do_flogb_## BIT(CPULoongArchState *env, T fj)           \
2484aca67472SSong Gao {                                                                \
2485aca67472SSong Gao     T fp, fd;                                                    \
2486aca67472SSong Gao     float_status *status = &env->fp_status;                      \
2487aca67472SSong Gao     FloatRoundMode old_mode = get_float_rounding_mode(status);   \
2488aca67472SSong Gao                                                                  \
2489aca67472SSong Gao     set_float_rounding_mode(float_round_down, status);           \
2490aca67472SSong Gao     fp = float ## BIT ##_log2(fj, status);                       \
2491aca67472SSong Gao     fd = float ## BIT ##_round_to_int(fp, status);               \
2492aca67472SSong Gao     set_float_rounding_mode(old_mode, status);                   \
2493aca67472SSong Gao     vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact);     \
2494aca67472SSong Gao     return fd;                                                   \
2495aca67472SSong Gao }
2496aca67472SSong Gao 
2497aca67472SSong Gao FLOGB(32, uint32_t)
2498aca67472SSong Gao FLOGB(64, uint64_t)
2499aca67472SSong Gao 
2500aca67472SSong Gao #define FCLASS(NAME, BIT, E, FN)                         \
2501226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                    \
2502226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc) \
2503aca67472SSong Gao {                                                        \
2504aca67472SSong Gao     int i;                                               \
2505226bf881SSong Gao     VReg *Vd = (VReg *)vd;                               \
2506226bf881SSong Gao     VReg *Vj = (VReg *)vj;                               \
2507c9caf158SSong Gao     int oprsz = simd_oprsz(desc);                        \
2508aca67472SSong Gao                                                          \
2509c9caf158SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {            \
2510aca67472SSong Gao         Vd->E(i) = FN(env, Vj->E(i));                    \
2511aca67472SSong Gao     }                                                    \
2512aca67472SSong Gao }
2513aca67472SSong Gao 
2514aca67472SSong Gao FCLASS(vfclass_s, 32, UW, helper_fclass_s)
2515aca67472SSong Gao FCLASS(vfclass_d, 64, UD, helper_fclass_d)
2516aca67472SSong Gao 
2517aca67472SSong Gao #define FSQRT(BIT, T)                                  \
2518aca67472SSong Gao static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \
2519aca67472SSong Gao {                                                      \
2520aca67472SSong Gao     T fd;                                              \
2521aca67472SSong Gao     fd = float ## BIT ##_sqrt(fj, &env->fp_status);    \
2522aca67472SSong Gao     vec_update_fcsr0(env, GETPC());                    \
2523aca67472SSong Gao     return fd;                                         \
2524aca67472SSong Gao }
2525aca67472SSong Gao 
2526aca67472SSong Gao FSQRT(32, uint32_t)
2527aca67472SSong Gao FSQRT(64, uint64_t)
2528aca67472SSong Gao 
2529aca67472SSong Gao #define FRECIP(BIT, T)                                                  \
2530aca67472SSong Gao static T do_frecip_## BIT(CPULoongArchState *env, T fj)                 \
2531aca67472SSong Gao {                                                                       \
2532aca67472SSong Gao     T fd;                                                               \
2533aca67472SSong Gao     fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \
2534aca67472SSong Gao     vec_update_fcsr0(env, GETPC());                                     \
2535aca67472SSong Gao     return fd;                                                          \
2536aca67472SSong Gao }
2537aca67472SSong Gao 
2538aca67472SSong Gao FRECIP(32, uint32_t)
2539aca67472SSong Gao FRECIP(64, uint64_t)
2540aca67472SSong Gao 
2541aca67472SSong Gao #define FRSQRT(BIT, T)                                                  \
2542aca67472SSong Gao static T do_frsqrt_## BIT(CPULoongArchState *env, T fj)                 \
2543aca67472SSong Gao {                                                                       \
2544aca67472SSong Gao     T fd, fp;                                                           \
2545aca67472SSong Gao     fp = float ## BIT ##_sqrt(fj, &env->fp_status);                     \
2546aca67472SSong Gao     fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \
2547aca67472SSong Gao     vec_update_fcsr0(env, GETPC());                                     \
2548aca67472SSong Gao     return fd;                                                          \
2549aca67472SSong Gao }
2550aca67472SSong Gao 
2551aca67472SSong Gao FRSQRT(32, uint32_t)
2552aca67472SSong Gao FRSQRT(64, uint64_t)
2553aca67472SSong Gao 
2554aca67472SSong Gao DO_2OP_F(vflogb_s, 32, UW, do_flogb_32)
2555aca67472SSong Gao DO_2OP_F(vflogb_d, 64, UD, do_flogb_64)
2556aca67472SSong Gao DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32)
2557aca67472SSong Gao DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64)
2558aca67472SSong Gao DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32)
2559aca67472SSong Gao DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64)
2560aca67472SSong Gao DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32)
2561aca67472SSong Gao DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64)
2562399665d2SSong Gao 
/*
 * Thin wrapper around float16_to_float32() with its second argument fixed
 * to true, so the conversion has a (value, status) signature.
 * NOTE(review): the second argument presumably selects the IEEE half
 * format - confirm against the softfloat API.
 */
static uint32_t float16_cvt_float32(uint16_t h, float_status *status)
{
    const uint32_t widened = float16_to_float32(h, true, status);

    return widened;
}
/* Thin wrapper around float32_to_float64(); returns its result unchanged. */
static uint64_t float32_cvt_float64(uint32_t s, float_status *status)
{
    const uint64_t widened = float32_to_float64(s, status);

    return widened;
}
2571399665d2SSong Gao 
/*
 * Thin wrapper around float32_to_float16() with its second argument fixed
 * to true, so the conversion has a (value, status) signature.
 * NOTE(review): the second argument presumably selects the IEEE half
 * format - confirm against the softfloat API.
 */
static uint16_t float32_cvt_float16(uint32_t s, float_status *status)
{
    const uint16_t narrowed = float32_to_float16(s, true, status);

    return narrowed;
}
/* Thin wrapper around float64_to_float32(); returns its result unchanged. */
static uint32_t float64_cvt_float32(uint64_t d, float_status *status)
{
    const uint32_t narrowed = float64_to_float32(d, status);

    return narrowed;
}
2580399665d2SSong Gao 
HELPER(vfcvtl_s_h)2581226bf881SSong Gao void HELPER(vfcvtl_s_h)(void *vd, void *vj,
2582226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2583399665d2SSong Gao {
258460df31a2SSong Gao     int i, j, ofs;
258560df31a2SSong Gao     VReg temp = {};
2586226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2587226bf881SSong Gao     VReg *Vj = (VReg *)vj;
258860df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2589399665d2SSong Gao 
259060df31a2SSong Gao     ofs = LSX_LEN / 32;
2591399665d2SSong Gao     vec_clear_cause(env);
259260df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
259360df31a2SSong Gao         for (j = 0; j < ofs; j++) {
259460df31a2SSong Gao             temp.UW(j + ofs * i) =float16_cvt_float32(Vj->UH(j + ofs * 2 * i),
259560df31a2SSong Gao                                                       &env->fp_status);
259660df31a2SSong Gao         }
2597399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2598399665d2SSong Gao     }
2599399665d2SSong Gao     *Vd = temp;
2600399665d2SSong Gao }
2601399665d2SSong Gao 
HELPER(vfcvtl_d_s)2602226bf881SSong Gao void HELPER(vfcvtl_d_s)(void *vd, void *vj,
2603226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2604399665d2SSong Gao {
260560df31a2SSong Gao     int i, j, ofs;
260660df31a2SSong Gao     VReg temp = {};
2607226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2608226bf881SSong Gao     VReg *Vj = (VReg *)vj;
260960df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2610399665d2SSong Gao 
261160df31a2SSong Gao     ofs = LSX_LEN / 64;
2612399665d2SSong Gao     vec_clear_cause(env);
261360df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
261460df31a2SSong Gao         for (j = 0; j < ofs; j++) {
261560df31a2SSong Gao             temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i),
261660df31a2SSong Gao                                                        &env->fp_status);
261760df31a2SSong Gao         }
2618399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2619399665d2SSong Gao     }
2620399665d2SSong Gao     *Vd = temp;
2621399665d2SSong Gao }
2622399665d2SSong Gao 
HELPER(vfcvth_s_h)2623226bf881SSong Gao void HELPER(vfcvth_s_h)(void *vd, void *vj,
2624226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2625399665d2SSong Gao {
262660df31a2SSong Gao     int i, j, ofs;
262760df31a2SSong Gao     VReg temp = {};
2628226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2629226bf881SSong Gao     VReg *Vj = (VReg *)vj;
263060df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2631399665d2SSong Gao 
263260df31a2SSong Gao     ofs = LSX_LEN / 32;
2633399665d2SSong Gao     vec_clear_cause(env);
263460df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
263560df31a2SSong Gao         for (j = 0; j < ofs; j++) {
263660df31a2SSong Gao             temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)),
263760df31a2SSong Gao                                                        &env->fp_status);
263860df31a2SSong Gao         }
2639399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2640399665d2SSong Gao     }
2641399665d2SSong Gao     *Vd = temp;
2642399665d2SSong Gao }
2643399665d2SSong Gao 
HELPER(vfcvth_d_s)2644226bf881SSong Gao void HELPER(vfcvth_d_s)(void *vd, void *vj,
2645226bf881SSong Gao                         CPULoongArchState *env, uint32_t desc)
2646399665d2SSong Gao {
264760df31a2SSong Gao     int i, j, ofs;
264860df31a2SSong Gao     VReg temp = {};
2649226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2650226bf881SSong Gao     VReg *Vj = (VReg *)vj;
265160df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2652399665d2SSong Gao 
265360df31a2SSong Gao     ofs = LSX_LEN / 64;
2654399665d2SSong Gao     vec_clear_cause(env);
265560df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
265660df31a2SSong Gao         for (j = 0; j < ofs; j++) {
265760df31a2SSong Gao             temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)),
265860df31a2SSong Gao                                                         &env->fp_status);
265960df31a2SSong Gao         }
2660399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2661399665d2SSong Gao     }
2662399665d2SSong Gao     *Vd = temp;
2663399665d2SSong Gao }
2664399665d2SSong Gao 
HELPER(vfcvt_h_s)26653b286753SSong Gao void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
26663b286753SSong Gao                        CPULoongArchState *env, uint32_t desc)
2667399665d2SSong Gao {
266860df31a2SSong Gao     int i, j, ofs;
266960df31a2SSong Gao     VReg temp = {};
26703b286753SSong Gao     VReg *Vd = (VReg *)vd;
26713b286753SSong Gao     VReg *Vj = (VReg *)vj;
26723b286753SSong Gao     VReg *Vk = (VReg *)vk;
267360df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2674399665d2SSong Gao 
267560df31a2SSong Gao     ofs = LSX_LEN / 32;
2676399665d2SSong Gao     vec_clear_cause(env);
267760df31a2SSong Gao     for(i = 0; i < oprsz / 16; i++) {
267860df31a2SSong Gao         for (j = 0; j < ofs; j++) {
267960df31a2SSong Gao             temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i),
268060df31a2SSong Gao                                                                  &env->fp_status);
268160df31a2SSong Gao             temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i),
268260df31a2SSong Gao                                                            &env->fp_status);
268360df31a2SSong Gao         }
2684399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2685399665d2SSong Gao     }
2686399665d2SSong Gao     *Vd = temp;
2687399665d2SSong Gao }
2688399665d2SSong Gao 
HELPER(vfcvt_s_d)26893b286753SSong Gao void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
26903b286753SSong Gao                        CPULoongArchState *env, uint32_t desc)
2691399665d2SSong Gao {
269260df31a2SSong Gao     int i, j, ofs;
269360df31a2SSong Gao     VReg temp = {};
26943b286753SSong Gao     VReg *Vd = (VReg *)vd;
26953b286753SSong Gao     VReg *Vj = (VReg *)vj;
26963b286753SSong Gao     VReg *Vk = (VReg *)vk;
269760df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2698399665d2SSong Gao 
269960df31a2SSong Gao     ofs = LSX_LEN / 64;
2700399665d2SSong Gao     vec_clear_cause(env);
270160df31a2SSong Gao     for(i = 0; i < oprsz / 16; i++) {
270260df31a2SSong Gao         for (j = 0; j < ofs; j++) {
270360df31a2SSong Gao             temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i),
270460df31a2SSong Gao                                                                  &env->fp_status);
270560df31a2SSong Gao             temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i),
270660df31a2SSong Gao                                                            &env->fp_status);
270760df31a2SSong Gao         }
2708399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2709399665d2SSong Gao     }
2710399665d2SSong Gao     *Vd = temp;
2711399665d2SSong Gao }
2712399665d2SSong Gao 
HELPER(vfrint_s)2713226bf881SSong Gao void HELPER(vfrint_s)(void *vd, void *vj,
2714226bf881SSong Gao                       CPULoongArchState *env, uint32_t desc)
2715399665d2SSong Gao {
2716399665d2SSong Gao     int i;
2717226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2718226bf881SSong Gao     VReg *Vj = (VReg *)vj;
271960df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2720399665d2SSong Gao 
2721399665d2SSong Gao     vec_clear_cause(env);
272260df31a2SSong Gao     for (i = 0; i < oprsz / 4; i++) {
2723399665d2SSong Gao         Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
2724399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2725399665d2SSong Gao     }
2726399665d2SSong Gao }
2727399665d2SSong Gao 
HELPER(vfrint_d)2728226bf881SSong Gao void HELPER(vfrint_d)(void *vd, void *vj,
2729226bf881SSong Gao                       CPULoongArchState *env, uint32_t desc)
2730399665d2SSong Gao {
2731399665d2SSong Gao     int i;
2732226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2733226bf881SSong Gao     VReg *Vj = (VReg *)vj;
273460df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2735399665d2SSong Gao 
2736399665d2SSong Gao     vec_clear_cause(env);
273760df31a2SSong Gao     for (i = 0; i < oprsz / 8; i++) {
2738399665d2SSong Gao         Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status);
2739399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2740399665d2SSong Gao     }
2741399665d2SSong Gao }
2742399665d2SSong Gao 
2743399665d2SSong Gao #define FCVT_2OP(NAME, BIT, E, MODE)                                        \
2744226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                                       \
2745226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc)                    \
2746399665d2SSong Gao {                                                                           \
2747399665d2SSong Gao     int i;                                                                  \
2748226bf881SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
2749226bf881SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
275060df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                           \
2751399665d2SSong Gao                                                                             \
2752399665d2SSong Gao     vec_clear_cause(env);                                                   \
275360df31a2SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                               \
2754399665d2SSong Gao         FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2755399665d2SSong Gao         set_float_rounding_mode(MODE, &env->fp_status);                     \
2756399665d2SSong Gao         Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \
2757399665d2SSong Gao         set_float_rounding_mode(old_mode, &env->fp_status);                 \
2758399665d2SSong Gao         vec_update_fcsr0(env, GETPC());                                     \
2759399665d2SSong Gao     }                                                                       \
2760399665d2SSong Gao }
2761399665d2SSong Gao 
2762399665d2SSong Gao FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even)
2763399665d2SSong Gao FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even)
2764399665d2SSong Gao FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero)
2765399665d2SSong Gao FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero)
2766399665d2SSong Gao FCVT_2OP(vfrintrp_s, 32, UW, float_round_up)
2767399665d2SSong Gao FCVT_2OP(vfrintrp_d, 64, UD, float_round_up)
2768399665d2SSong Gao FCVT_2OP(vfrintrm_s, 32, UW, float_round_down)
2769399665d2SSong Gao FCVT_2OP(vfrintrm_d, 64, UD, float_round_down)
2770399665d2SSong Gao 
/*
 * FTINT - define do_ftint<NAME>(), which runs do_<FMT1>_to_<FMT2>() with
 * the rounding mode of env->fp_status temporarily set to MODE.
 *
 *   NAME        suffix of the generated function name
 *   FMT1, FMT2  select the do_<FMT1>_to_<FMT2>() conversion to wrap
 *   T1, T2      C types of the input and of the result
 *   MODE        rounding mode installed for the duration of the conversion
 *
 * NOTE(review): the saved rounding mode is restored only when the wrapped
 * conversion returns - confirm what happens if it exits non-locally.
 */
#define FTINT(NAME, FMT1, FMT2, T1, T2, MODE)                           \
static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj)               \
{                                                                       \
    float_status *fst = &env->fp_status;                                \
    const FloatRoundMode saved = get_float_rounding_mode(fst);          \
    T2 converted;                                                       \
                                                                        \
    set_float_rounding_mode(MODE, fst);                                 \
    converted = do_## FMT1 ##_to_## FMT2(env, fj);                      \
    set_float_rounding_mode(saved, fst);                                \
    return converted;                                                   \
}
2782399665d2SSong Gao 
/*
 * DO_FTINT - define do_<FMT1>_to_<FMT2>(), a float-to-integer conversion
 * that uses the rounding mode currently held in env->fp_status.
 *
 * The value comes from <FMT1>_to_<FMT2>().  If the invalid flag is set in
 * env->fp_status afterwards and the input is any kind of NaN, the result
 * is replaced by 0.  vec_update_fcsr0() is called before returning.
 */
#define DO_FTINT(FMT1, FMT2, T1, T2)                                        \
static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj)           \
{                                                                           \
    float_status *fst = &env->fp_status;                                    \
    T2 converted = FMT1 ##_to_## FMT2(fj, fst);                             \
                                                                            \
    if ((get_float_exception_flags(fst) & float_flag_invalid) &&            \
        FMT1 ##_is_any_nan(fj)) {                                           \
        converted = 0;                                                      \
    }                                                                       \
    vec_update_fcsr0(env, GETPC());                                         \
    return converted;                                                       \
}
2797399665d2SSong Gao 
/* Conversions that use the rounding mode already set in env->fp_status. */
DO_FTINT(float32, int32, uint32_t, uint32_t)
DO_FTINT(float64, int64, uint64_t, uint64_t)
DO_FTINT(float32, uint32, uint32_t, uint32_t)
DO_FTINT(float64, uint64, uint64_t, uint64_t)
DO_FTINT(float64, int32, uint64_t, uint32_t)
DO_FTINT(float32, int64, uint32_t, uint64_t)

/* Same-width signed conversions with a fixed rounding mode. */
FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even)
FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even)
FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up)
FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up)
FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero)
FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero)
FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down)
FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down)

/* Element-wise signed helpers built from the callbacks above. */
DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s)
DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d)
DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s)
DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d)
DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s)
DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d)
DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s)
DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d)
DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32)
DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64)

/* Same-width unsigned conversions with float_round_to_zero. */
FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero)
FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero)

/* Element-wise unsigned helpers. */
DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s)
DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d)
DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32)
DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64)

/* float64 -> int32 conversions with a fixed rounding mode. */
FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down)
FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up)
FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero)
FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even)
2837399665d2SSong Gao 
2838399665d2SSong Gao #define FTINT_W_D(NAME, FN)                                               \
28393b286753SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk,                           \
28403b286753SSong Gao                   CPULoongArchState *env, uint32_t desc)                  \
2841399665d2SSong Gao {                                                                         \
284260df31a2SSong Gao     int i, j, ofs;                                                        \
284360df31a2SSong Gao     VReg temp = {};                                                       \
28443b286753SSong Gao     VReg *Vd = (VReg *)vd;                                                \
28453b286753SSong Gao     VReg *Vj = (VReg *)vj;                                                \
28463b286753SSong Gao     VReg *Vk = (VReg *)vk;                                                \
284760df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                         \
2848399665d2SSong Gao                                                                           \
284960df31a2SSong Gao     ofs = LSX_LEN / 64;                                                   \
2850399665d2SSong Gao     vec_clear_cause(env);                                                 \
285160df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
285260df31a2SSong Gao         for (j = 0; j < ofs; j++) {                                       \
285360df31a2SSong Gao             temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \
285460df31a2SSong Gao             temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i));       \
285560df31a2SSong Gao         }                                                                 \
2856399665d2SSong Gao     }                                                                     \
2857399665d2SSong Gao     *Vd = temp;                                                           \
2858399665d2SSong Gao }
2859399665d2SSong Gao 
2860399665d2SSong Gao FTINT_W_D(vftint_w_d, do_float64_to_int32)
2861399665d2SSong Gao FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d)
2862399665d2SSong Gao FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d)
2863399665d2SSong Gao FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d)
2864399665d2SSong Gao FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d)
2865399665d2SSong Gao 
2866399665d2SSong Gao FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2867399665d2SSong Gao FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2868399665d2SSong Gao FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2869399665d2SSong Gao FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2870399665d2SSong Gao FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2871399665d2SSong Gao FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2872399665d2SSong Gao FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2873399665d2SSong Gao FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2874399665d2SSong Gao 
2875399665d2SSong Gao #define FTINTL_L_S(NAME, FN)                                        \
2876226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                               \
2877226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc)            \
2878399665d2SSong Gao {                                                                   \
287960df31a2SSong Gao     int i, j, ofs;                                                  \
2880399665d2SSong Gao     VReg temp;                                                      \
2881226bf881SSong Gao     VReg *Vd = (VReg *)vd;                                          \
2882226bf881SSong Gao     VReg *Vj = (VReg *)vj;                                          \
288360df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                   \
2884399665d2SSong Gao                                                                     \
288560df31a2SSong Gao     ofs = LSX_LEN / 64;                                             \
2886399665d2SSong Gao     vec_clear_cause(env);                                           \
288760df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                              \
288860df31a2SSong Gao         for (j = 0; j < ofs; j++) {                                 \
288960df31a2SSong Gao             temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \
289060df31a2SSong Gao         }                                                           \
2891399665d2SSong Gao     }                                                               \
2892399665d2SSong Gao     *Vd = temp;                                                     \
2893399665d2SSong Gao }
2894399665d2SSong Gao 
2895399665d2SSong Gao FTINTL_L_S(vftintl_l_s, do_float32_to_int64)
2896399665d2SSong Gao FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s)
2897399665d2SSong Gao FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s)
2898399665d2SSong Gao FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s)
2899399665d2SSong Gao FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s)
2900399665d2SSong Gao 
2901399665d2SSong Gao #define FTINTH_L_S(NAME, FN)                                              \
2902226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj,                                     \
2903226bf881SSong Gao                   CPULoongArchState *env, uint32_t desc)                  \
2904399665d2SSong Gao {                                                                         \
290560df31a2SSong Gao     int i, j, ofs;                                                        \
290660df31a2SSong Gao     VReg temp = {};                                                       \
2907226bf881SSong Gao     VReg *Vd = (VReg *)vd;                                                \
2908226bf881SSong Gao     VReg *Vj = (VReg *)vj;                                                \
290960df31a2SSong Gao     int oprsz = simd_oprsz(desc);                                         \
2910399665d2SSong Gao                                                                           \
291160df31a2SSong Gao     ofs = LSX_LEN / 64;                                                   \
2912399665d2SSong Gao     vec_clear_cause(env);                                                 \
291360df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
291460df31a2SSong Gao         for (j = 0; j < ofs; j++) {                                       \
291560df31a2SSong Gao             temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \
291660df31a2SSong Gao         }                                                                 \
2917399665d2SSong Gao     }                                                                     \
2918399665d2SSong Gao     *Vd = temp;                                                           \
2919399665d2SSong Gao }
2920399665d2SSong Gao 
2921399665d2SSong Gao FTINTH_L_S(vftinth_l_s, do_float32_to_int64)
2922399665d2SSong Gao FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s)
2923399665d2SSong Gao FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s)
2924399665d2SSong Gao FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s)
2925399665d2SSong Gao FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s)
2926399665d2SSong Gao 
2927399665d2SSong Gao #define FFINT(NAME, FMT1, FMT2, T1, T2)                    \
2928399665d2SSong Gao static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \
2929399665d2SSong Gao {                                                          \
2930399665d2SSong Gao     T2 fd;                                                 \
2931399665d2SSong Gao                                                            \
2932399665d2SSong Gao     fd = FMT1 ##_to_## FMT2(fj, &env->fp_status);          \
2933399665d2SSong Gao     vec_update_fcsr0(env, GETPC());                        \
2934399665d2SSong Gao     return fd;                                             \
2935399665d2SSong Gao }
2936399665d2SSong Gao 
2937399665d2SSong Gao FFINT(s_w, int32, float32, int32_t, uint32_t)
2938399665d2SSong Gao FFINT(d_l, int64, float64, int64_t, uint64_t)
2939399665d2SSong Gao FFINT(s_wu, uint32, float32, uint32_t, uint32_t)
2940399665d2SSong Gao FFINT(d_lu, uint64, float64, uint64_t, uint64_t)
2941399665d2SSong Gao 
2942399665d2SSong Gao DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w)
2943399665d2SSong Gao DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l)
2944399665d2SSong Gao DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu)
2945399665d2SSong Gao DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu)
2946399665d2SSong Gao 
2947226bf881SSong Gao void HELPER(vffintl_d_w)(void *vd, void *vj,
2948226bf881SSong Gao                          CPULoongArchState *env, uint32_t desc)
2949399665d2SSong Gao {
295060df31a2SSong Gao     int i, j, ofs;
295160df31a2SSong Gao     VReg temp = {};
2952226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2953226bf881SSong Gao     VReg *Vj = (VReg *)vj;
295460df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2955399665d2SSong Gao 
295660df31a2SSong Gao     ofs = LSX_LEN / 64;
2957399665d2SSong Gao     vec_clear_cause(env);
295860df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
295960df31a2SSong Gao         for (j = 0; j < ofs; j++) {
296060df31a2SSong Gao             temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i),
296160df31a2SSong Gao                                                    &env->fp_status);
296260df31a2SSong Gao         }
2963399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2964399665d2SSong Gao     }
2965399665d2SSong Gao     *Vd = temp;
2966399665d2SSong Gao }
2967399665d2SSong Gao 
HELPER(vffinth_d_w)2968226bf881SSong Gao void HELPER(vffinth_d_w)(void *vd, void *vj,
2969226bf881SSong Gao                          CPULoongArchState *env, uint32_t desc)
2970399665d2SSong Gao {
297160df31a2SSong Gao     int i, j, ofs;
297260df31a2SSong Gao     VReg temp = {};
2973226bf881SSong Gao     VReg *Vd = (VReg *)vd;
2974226bf881SSong Gao     VReg *Vj = (VReg *)vj;
297560df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2976399665d2SSong Gao 
297760df31a2SSong Gao     ofs = LSX_LEN / 64;
2978399665d2SSong Gao     vec_clear_cause(env);
297960df31a2SSong Gao     for (i = 0; i < oprsz /16; i++) {
298060df31a2SSong Gao         for (j = 0; j < ofs; j++) {
298160df31a2SSong Gao             temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)),
298260df31a2SSong Gao                                                    &env->fp_status);
298360df31a2SSong Gao         }
2984399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
2985399665d2SSong Gao     }
2986399665d2SSong Gao     *Vd = temp;
2987399665d2SSong Gao }
2988399665d2SSong Gao 
HELPER(vffint_s_l)29893b286753SSong Gao void HELPER(vffint_s_l)(void *vd, void *vj, void *vk,
29903b286753SSong Gao                         CPULoongArchState *env, uint32_t desc)
2991399665d2SSong Gao {
299260df31a2SSong Gao     int i, j, ofs;
299360df31a2SSong Gao     VReg temp = {};
29943b286753SSong Gao     VReg *Vd = (VReg *)vd;
29953b286753SSong Gao     VReg *Vj = (VReg *)vj;
29963b286753SSong Gao     VReg *Vk = (VReg *)vk;
299760df31a2SSong Gao     int oprsz = simd_oprsz(desc);
2998399665d2SSong Gao 
299960df31a2SSong Gao     ofs = LSX_LEN / 64;
3000399665d2SSong Gao     vec_clear_cause(env);
300160df31a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
300260df31a2SSong Gao         for (j = 0; j < ofs; j++) {
300360df31a2SSong Gao             temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i),
300460df31a2SSong Gao                                                              &env->fp_status);
300560df31a2SSong Gao             temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i),
300660df31a2SSong Gao                                                        &env->fp_status);
300760df31a2SSong Gao         }
3008399665d2SSong Gao         vec_update_fcsr0(env, GETPC());
3009399665d2SSong Gao     }
3010399665d2SSong Gao     *Vd = temp;
3011399665d2SSong Gao }
3012f435e1e5SSong Gao 
3013f435e1e5SSong Gao #define VCMPI(NAME, BIT, E, DO_OP)                                 \
30144da72d43SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3015f435e1e5SSong Gao {                                                                  \
3016f435e1e5SSong Gao     int i;                                                         \
3017f435e1e5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3018f435e1e5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3019f435e1e5SSong Gao     typedef __typeof(Vd->E(0)) TD;                                 \
30204da72d43SSong Gao     int oprsz = simd_oprsz(desc);                                  \
3021f435e1e5SSong Gao                                                                    \
30224da72d43SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                      \
3023f435e1e5SSong Gao         Vd->E(i) = DO_OP(Vj->E(i), (TD)imm);                       \
3024f435e1e5SSong Gao     }                                                              \
3025f435e1e5SSong Gao }
3026f435e1e5SSong Gao 
3027f435e1e5SSong Gao VCMPI(vseqi_b, 8, B, VSEQ)
3028f435e1e5SSong Gao VCMPI(vseqi_h, 16, H, VSEQ)
3029f435e1e5SSong Gao VCMPI(vseqi_w, 32, W, VSEQ)
3030f435e1e5SSong Gao VCMPI(vseqi_d, 64, D, VSEQ)
3031f435e1e5SSong Gao VCMPI(vslei_b, 8, B, VSLE)
3032f435e1e5SSong Gao VCMPI(vslei_h, 16, H, VSLE)
3033f435e1e5SSong Gao VCMPI(vslei_w, 32, W, VSLE)
3034f435e1e5SSong Gao VCMPI(vslei_d, 64, D, VSLE)
3035f435e1e5SSong Gao VCMPI(vslei_bu, 8, UB, VSLE)
3036f435e1e5SSong Gao VCMPI(vslei_hu, 16, UH, VSLE)
3037f435e1e5SSong Gao VCMPI(vslei_wu, 32, UW, VSLE)
3038f435e1e5SSong Gao VCMPI(vslei_du, 64, UD, VSLE)
3039f435e1e5SSong Gao VCMPI(vslti_b, 8, B, VSLT)
3040f435e1e5SSong Gao VCMPI(vslti_h, 16, H, VSLT)
3041f435e1e5SSong Gao VCMPI(vslti_w, 32, W, VSLT)
3042f435e1e5SSong Gao VCMPI(vslti_d, 64, D, VSLT)
3043f435e1e5SSong Gao VCMPI(vslti_bu, 8, UB, VSLT)
3044f435e1e5SSong Gao VCMPI(vslti_hu, 16, UH, VSLT)
3045f435e1e5SSong Gao VCMPI(vslti_wu, 32, UW, VSLT)
3046f435e1e5SSong Gao VCMPI(vslti_du, 64, UD, VSLT)
3047386c4e86SSong Gao 
/*
 * Turn a softfloat comparison outcome into an element mask.
 *
 * cmp:   relation reported by the compare routine.
 * flags: set of FCMP_LT / FCMP_EQ / FCMP_GT / FCMP_UN bits selecting which
 *        relations count as "true".
 *
 * Returns all ones (64 bits) when the FCMP_* bit corresponding to cmp is set
 * in flags, otherwise 0.  Any other cmp value is a programming error and
 * trips g_assert_not_reached().  env is not used.
 */
static uint64_t vfcmp_common(CPULoongArchState *env,
                             FloatRelation cmp, uint32_t flags)
{
    uint32_t wanted = 0;

    switch (cmp) {
    case float_relation_less:
        wanted = FCMP_LT;
        break;
    case float_relation_equal:
        wanted = FCMP_EQ;
        break;
    case float_relation_greater:
        wanted = FCMP_GT;
        break;
    case float_relation_unordered:
        wanted = FCMP_UN;
        break;
    default:
        g_assert_not_reached();
    }

    return (flags & wanted) ? ~(uint64_t)0 : 0;
}
3076386c4e86SSong Gao 
3077386c4e86SSong Gao #define VFCMP(NAME, BIT, E, FN)                                          \
30783eeda5feSSong Gao void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz,                \
3079386c4e86SSong Gao                   uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \
3080386c4e86SSong Gao {                                                                        \
3081386c4e86SSong Gao     int i;                                                               \
3082386c4e86SSong Gao     VReg t;                                                              \
3083386c4e86SSong Gao     VReg *Vd = &(env->fpr[vd].vreg);                                     \
3084386c4e86SSong Gao     VReg *Vj = &(env->fpr[vj].vreg);                                     \
3085386c4e86SSong Gao     VReg *Vk = &(env->fpr[vk].vreg);                                     \
3086386c4e86SSong Gao                                                                          \
3087386c4e86SSong Gao     vec_clear_cause(env);                                                \
30883eeda5feSSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                            \
3089386c4e86SSong Gao         FloatRelation cmp;                                               \
3090386c4e86SSong Gao         cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status);                   \
3091386c4e86SSong Gao         t.E(i) = vfcmp_common(env, cmp, flags);                          \
3092386c4e86SSong Gao         vec_update_fcsr0(env, GETPC());                                  \
3093386c4e86SSong Gao     }                                                                    \
3094386c4e86SSong Gao     *Vd = t;                                                             \
3095386c4e86SSong Gao }
3096386c4e86SSong Gao 
3097386c4e86SSong Gao VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet)
3098386c4e86SSong Gao VFCMP(vfcmp_s_s, 32, UW, float32_compare)
3099386c4e86SSong Gao VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet)
3100386c4e86SSong Gao VFCMP(vfcmp_s_d, 64, UD, float64_compare)
3101d0dfa19aSSong Gao 
HELPER(vbitseli_b)3102f3dfcc8bSSong Gao void HELPER(vbitseli_b)(void *vd, void *vj,  uint64_t imm, uint32_t desc)
3103d0dfa19aSSong Gao {
3104d0dfa19aSSong Gao     int i;
3105d0dfa19aSSong Gao     VReg *Vd = (VReg *)vd;
3106d0dfa19aSSong Gao     VReg *Vj = (VReg *)vj;
3107d0dfa19aSSong Gao 
3108f3dfcc8bSSong Gao     for (i = 0; i < simd_oprsz(desc); i++) {
3109d0dfa19aSSong Gao         Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm);
3110d0dfa19aSSong Gao     }
3111d0dfa19aSSong Gao }
3112d0dfa19aSSong Gao 
/*
 * Copy from target/arm/tcg/sve_helper.c
 *
 * Report whether any element of m0 or m1 equals n, for elements of
 * (8 << esz) bits packed into the 64-bit words.
 *
 * dup_const(esz, x) is presumably x replicated into every element - confirm
 * against its definition.  XOR-ing with the replicated n turns a matching
 * element into zero; (x - lsb) & ~x then has an element's top bit set when
 * the word contains a zero element, and masking with the top bits gives the
 * final answer.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    const uint64_t lsb = dup_const(esz, 1);
    const uint64_t msb = lsb << ((8 << esz) - 1);
    const uint64_t pattern = dup_const(esz, n);
    uint64_t diff0 = pattern ^ m0;
    uint64_t diff1 = pattern ^ m1;

    diff0 = (diff0 - lsb) & ~diff0;
    diff1 = (diff1 - lsb) & ~diff1;

    return ((diff0 | diff1) & msb) != 0;
}
3128d0dfa19aSSong Gao 
3129d0dfa19aSSong Gao #define SETANYEQZ(NAME, MO)                                       \
3130f3dfcc8bSSong Gao void HELPER(NAME)(CPULoongArchState *env,                         \
3131f3dfcc8bSSong Gao                   uint32_t oprsz, uint32_t cd, uint32_t vj)       \
3132d0dfa19aSSong Gao {                                                                 \
3133d0dfa19aSSong Gao     VReg *Vj = &(env->fpr[vj].vreg);                              \
3134d0dfa19aSSong Gao                                                                   \
3135d0dfa19aSSong Gao     env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO);     \
3136f3dfcc8bSSong Gao     if (oprsz == 32) {                                            \
3137f3dfcc8bSSong Gao         env->cf[cd & 0x7] = env->cf[cd & 0x7] ||                  \
3138f3dfcc8bSSong Gao                             do_match2(0, Vj->D(2), Vj->D(3), MO); \
3139f3dfcc8bSSong Gao     }                                                             \
3140d0dfa19aSSong Gao }
3141f3dfcc8bSSong Gao 
SETANYEQZ(vsetanyeqz_b,MO_8)3142d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_b, MO_8)
3143d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_h, MO_16)
3144d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_w, MO_32)
3145d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_d, MO_64)
3146d0dfa19aSSong Gao 
3147d0dfa19aSSong Gao #define SETALLNEZ(NAME, MO)                                        \
3148f3dfcc8bSSong Gao void HELPER(NAME)(CPULoongArchState *env,                          \
3149f3dfcc8bSSong Gao                   uint32_t oprsz, uint32_t cd, uint32_t vj)        \
3150d0dfa19aSSong Gao {                                                                  \
3151d0dfa19aSSong Gao     VReg *Vj = &(env->fpr[vj].vreg);                               \
3152d0dfa19aSSong Gao                                                                    \
3153d0dfa19aSSong Gao     env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO);      \
3154f3dfcc8bSSong Gao     if (oprsz == 32) {                                             \
3155f3dfcc8bSSong Gao         env->cf[cd & 0x7] = env->cf[cd & 0x7] &&                   \
3156f3dfcc8bSSong Gao                             !do_match2(0, Vj->D(2), Vj->D(3), MO); \
3157f3dfcc8bSSong Gao     }                                                              \
3158d0dfa19aSSong Gao }
3159f3dfcc8bSSong Gao 
3160d0dfa19aSSong Gao SETALLNEZ(vsetallnez_b, MO_8)
3161d0dfa19aSSong Gao SETALLNEZ(vsetallnez_h, MO_16)
3162d0dfa19aSSong Gao SETALLNEZ(vsetallnez_w, MO_32)
3163d0dfa19aSSong Gao SETALLNEZ(vsetallnez_d, MO_64)
3164d5e5563cSSong Gao 
3165df97f338SSong Gao #define XVINSVE0(NAME, E, MASK)                                    \
3166df97f338SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3167df97f338SSong Gao {                                                                  \
3168df97f338SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3169df97f338SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3170df97f338SSong Gao     Vd->E(imm & MASK) = Vj->E(0);                                  \
3171df97f338SSong Gao }
3172df97f338SSong Gao 
3173df97f338SSong Gao XVINSVE0(xvinsve0_w, W, 0x7)
3174df97f338SSong Gao XVINSVE0(xvinsve0_d, D, 0x3)
3175df97f338SSong Gao 
3176df97f338SSong Gao #define XVPICKVE(NAME, E, BIT, MASK)                               \
3177df97f338SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3178df97f338SSong Gao {                                                                  \
3179df97f338SSong Gao     int i;                                                         \
3180df97f338SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3181df97f338SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3182df97f338SSong Gao     int oprsz = simd_oprsz(desc);                                  \
3183df97f338SSong Gao                                                                    \
3184df97f338SSong Gao     Vd->E(0) = Vj->E(imm & MASK);                                  \
3185df97f338SSong Gao     for (i = 1; i < oprsz / (BIT / 8); i++) {                      \
3186df97f338SSong Gao         Vd->E(i) = 0;                                              \
3187df97f338SSong Gao     }                                                              \
3188df97f338SSong Gao }
3189df97f338SSong Gao 
3190df97f338SSong Gao XVPICKVE(xvpickve_w, W, 32, 0x7)
3191df97f338SSong Gao XVPICKVE(xvpickve_d, D, 64, 0x3)
3192df97f338SSong Gao 
3193d5e5563cSSong Gao #define VPACKEV(NAME, BIT, E)                                  \
319404711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3195d5e5563cSSong Gao {                                                              \
3196d5e5563cSSong Gao     int i;                                                     \
3197ad292148SSong Gao     VReg temp = {};                                            \
319804711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
319904711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
320004711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
3201ad292148SSong Gao     int oprsz = simd_oprsz(desc);                              \
3202d5e5563cSSong Gao                                                                \
3203ad292148SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                  \
3204d5e5563cSSong Gao         temp.E(2 * i + 1) = Vj->E(2 * i);                      \
3205d5e5563cSSong Gao         temp.E(2 *i) = Vk->E(2 * i);                           \
3206d5e5563cSSong Gao     }                                                          \
3207d5e5563cSSong Gao     *Vd = temp;                                                \
3208d5e5563cSSong Gao }
3209d5e5563cSSong Gao 
3210d5e5563cSSong Gao VPACKEV(vpackev_b, 16, B)
3211d5e5563cSSong Gao VPACKEV(vpackev_h, 32, H)
3212d5e5563cSSong Gao VPACKEV(vpackev_w, 64, W)
3213d5e5563cSSong Gao VPACKEV(vpackev_d, 128, D)
3214d5e5563cSSong Gao 
3215d5e5563cSSong Gao #define VPACKOD(NAME, BIT, E)                                  \
321604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3217d5e5563cSSong Gao {                                                              \
3218d5e5563cSSong Gao     int i;                                                     \
3219ad292148SSong Gao     VReg temp = {};                                            \
322004711da1SSong Gao     VReg *Vd = (VReg *)vd;                                     \
322104711da1SSong Gao     VReg *Vj = (VReg *)vj;                                     \
322204711da1SSong Gao     VReg *Vk = (VReg *)vk;                                     \
3223ad292148SSong Gao     int oprsz = simd_oprsz(desc);                              \
3224d5e5563cSSong Gao                                                                \
3225ad292148SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                 \
3226d5e5563cSSong Gao         temp.E(2 * i + 1) = Vj->E(2 * i + 1);                  \
3227d5e5563cSSong Gao         temp.E(2 * i) = Vk->E(2 * i + 1);                      \
3228d5e5563cSSong Gao     }                                                          \
3229d5e5563cSSong Gao     *Vd = temp;                                                \
3230d5e5563cSSong Gao }
3231d5e5563cSSong Gao 
3232d5e5563cSSong Gao VPACKOD(vpackod_b, 16, B)
3233d5e5563cSSong Gao VPACKOD(vpackod_h, 32, H)
3234d5e5563cSSong Gao VPACKOD(vpackod_w, 64, W)
3235d5e5563cSSong Gao VPACKOD(vpackod_d, 128, D)
3236d5e5563cSSong Gao 
3237d5e5563cSSong Gao #define VPICKEV(NAME, BIT, E)                                         \
323804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)        \
3239d5e5563cSSong Gao {                                                                     \
3240ad292148SSong Gao     int i, j, ofs;                                                    \
3241ad292148SSong Gao     VReg temp = {};                                                   \
324204711da1SSong Gao     VReg *Vd = (VReg *)vd;                                            \
324304711da1SSong Gao     VReg *Vj = (VReg *)vj;                                            \
324404711da1SSong Gao     VReg *Vk = (VReg *)vk;                                            \
3245ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                     \
3246d5e5563cSSong Gao                                                                       \
3247ad292148SSong Gao     ofs = LSX_LEN / BIT;                                              \
3248ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                \
3249ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                   \
3250ad292148SSong Gao             temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \
3251ad292148SSong Gao             temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i));       \
3252ad292148SSong Gao         }                                                             \
3253d5e5563cSSong Gao     }                                                                 \
3254d5e5563cSSong Gao     *Vd = temp;                                                       \
3255d5e5563cSSong Gao }
3256d5e5563cSSong Gao 
3257d5e5563cSSong Gao VPICKEV(vpickev_b, 16, B)
3258d5e5563cSSong Gao VPICKEV(vpickev_h, 32, H)
3259d5e5563cSSong Gao VPICKEV(vpickev_w, 64, W)
3260d5e5563cSSong Gao VPICKEV(vpickev_d, 128, D)
3261d5e5563cSSong Gao 
3262d5e5563cSSong Gao #define VPICKOD(NAME, BIT, E)                                             \
326304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
3264d5e5563cSSong Gao {                                                                         \
3265ad292148SSong Gao     int i, j, ofs;                                                        \
3266ad292148SSong Gao     VReg temp = {};                                                       \
326704711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
326804711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
326904711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
3270ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                         \
3271d5e5563cSSong Gao                                                                           \
3272ad292148SSong Gao     ofs = LSX_LEN / BIT;                                                  \
3273ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
3274ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                       \
3275ad292148SSong Gao             temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \
3276ad292148SSong Gao             temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1);       \
3277ad292148SSong Gao         }                                                                 \
3278d5e5563cSSong Gao     }                                                                     \
3279d5e5563cSSong Gao     *Vd = temp;                                                           \
3280d5e5563cSSong Gao }
3281d5e5563cSSong Gao 
3282d5e5563cSSong Gao VPICKOD(vpickod_b, 16, B)
3283d5e5563cSSong Gao VPICKOD(vpickod_h, 32, H)
3284d5e5563cSSong Gao VPICKOD(vpickod_w, 64, W)
3285d5e5563cSSong Gao VPICKOD(vpickod_d, 128, D)
3286e93dd431SSong Gao 
3287e93dd431SSong Gao #define VILVL(NAME, BIT, E)                                         \
328804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)      \
3289e93dd431SSong Gao {                                                                   \
3290ad292148SSong Gao     int i, j, ofs;                                                  \
3291ad292148SSong Gao     VReg temp = {};                                                 \
329204711da1SSong Gao     VReg *Vd = (VReg *)vd;                                          \
329304711da1SSong Gao     VReg *Vj = (VReg *)vj;                                          \
329404711da1SSong Gao     VReg *Vk = (VReg *)vk;                                          \
3295ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                   \
3296e93dd431SSong Gao                                                                     \
3297ad292148SSong Gao     ofs = LSX_LEN / BIT;                                            \
3298ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                              \
3299ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                 \
3300ad292148SSong Gao             temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \
3301ad292148SSong Gao             temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i);     \
3302ad292148SSong Gao         }                                                           \
3303e93dd431SSong Gao     }                                                               \
3304e93dd431SSong Gao     *Vd = temp;                                                     \
3305e93dd431SSong Gao }
3306e93dd431SSong Gao 
3307e93dd431SSong Gao VILVL(vilvl_b, 16, B)
3308e93dd431SSong Gao VILVL(vilvl_h, 32, H)
3309e93dd431SSong Gao VILVL(vilvl_w, 64, W)
3310e93dd431SSong Gao VILVL(vilvl_d, 128, D)
3311e93dd431SSong Gao 
3312e93dd431SSong Gao #define VILVH(NAME, BIT, E)                                               \
331304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
3314e93dd431SSong Gao {                                                                         \
3315ad292148SSong Gao     int i, j, ofs;                                                        \
3316ad292148SSong Gao     VReg temp = {};                                                       \
331704711da1SSong Gao     VReg *Vd = (VReg *)vd;                                                \
331804711da1SSong Gao     VReg *Vj = (VReg *)vj;                                                \
331904711da1SSong Gao     VReg *Vk = (VReg *)vk;                                                \
3320ad292148SSong Gao     int oprsz = simd_oprsz(desc);                                         \
3321e93dd431SSong Gao                                                                           \
3322ad292148SSong Gao     ofs = LSX_LEN / BIT;                                                  \
3323ad292148SSong Gao     for (i = 0; i < oprsz / 16; i++) {                                    \
3324ad292148SSong Gao         for (j = 0; j < ofs; j++) {                                       \
3325ad292148SSong Gao             temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \
3326ad292148SSong Gao             temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1));     \
3327ad292148SSong Gao         }                                                                 \
3328e93dd431SSong Gao     }                                                                     \
3329e93dd431SSong Gao     *Vd = temp;                                                           \
3330e93dd431SSong Gao }
3331e93dd431SSong Gao 
3332e93dd431SSong Gao VILVH(vilvh_b, 16, B)
3333e93dd431SSong Gao VILVH(vilvh_h, 32, H)
3334e93dd431SSong Gao VILVH(vilvh_w, 64, W)
3335e93dd431SSong Gao VILVH(vilvh_d, 128, D)
3336e93dd431SSong Gao 
3337eb48ab22SSong Gao void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
3338e93dd431SSong Gao {
3339513e88a2SSong Gao     int i, j, m;
3340513e88a2SSong Gao     VReg temp = {};
3341eb48ab22SSong Gao     VReg *Vd = (VReg *)vd;
3342eb48ab22SSong Gao     VReg *Vj = (VReg *)vj;
3343eb48ab22SSong Gao     VReg *Vk = (VReg *)vk;
3344eb48ab22SSong Gao     VReg *Va = (VReg *)va;
3345513e88a2SSong Gao     int oprsz = simd_oprsz(desc);
3346e93dd431SSong Gao 
3347e93dd431SSong Gao     m = LSX_LEN / 8;
3348513e88a2SSong Gao     for (i = 0; i < (oprsz / 16) * m; i++) {
3349513e88a2SSong Gao         j = i < m ? 0 : 1;
3350e93dd431SSong Gao         uint64_t k = (uint8_t)Va->B(i) % (2 * m);
3351513e88a2SSong Gao         temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m);
3352e93dd431SSong Gao     }
3353e93dd431SSong Gao     *Vd = temp;
3354e93dd431SSong Gao }
3355e93dd431SSong Gao 
3356e93dd431SSong Gao #define VSHUF(NAME, BIT, E)                                            \
335704711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
3358e93dd431SSong Gao {                                                                      \
3359513e88a2SSong Gao     int i, j, m;                                                       \
3360513e88a2SSong Gao     VReg temp = {};                                                    \
336104711da1SSong Gao     VReg *Vd = (VReg *)vd;                                             \
336204711da1SSong Gao     VReg *Vj = (VReg *)vj;                                             \
336304711da1SSong Gao     VReg *Vk = (VReg *)vk;                                             \
3364513e88a2SSong Gao     int oprsz = simd_oprsz(desc);                                      \
3365e93dd431SSong Gao                                                                        \
3366e93dd431SSong Gao     m = LSX_LEN / BIT;                                                 \
3367513e88a2SSong Gao     for (i = 0; i < (oprsz / 16) * m; i++) {                           \
3368513e88a2SSong Gao         j = i < m ? 0 : 1;                                             \
3369e93dd431SSong Gao         uint64_t k  = ((uint8_t)Vd->E(i)) % (2 * m);                   \
3370513e88a2SSong Gao         temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \
3371e93dd431SSong Gao     }                                                                  \
3372e93dd431SSong Gao     *Vd = temp;                                                        \
3373e93dd431SSong Gao }
3374e93dd431SSong Gao 
3375e93dd431SSong Gao VSHUF(vshuf_h, 16, H)
3376e93dd431SSong Gao VSHUF(vshuf_w, 32, W)
3377e93dd431SSong Gao VSHUF(vshuf_d, 64, D)
3378e93dd431SSong Gao 
3379e93dd431SSong Gao #define VSHUF4I(NAME, BIT, E)                                               \
3380329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)          \
3381e93dd431SSong Gao {                                                                           \
3382513e88a2SSong Gao     int i, j, max;                                                          \
3383513e88a2SSong Gao     VReg temp = {};                                                         \
3384329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                                  \
3385329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                                  \
3386513e88a2SSong Gao     int oprsz = simd_oprsz(desc);                                           \
3387e93dd431SSong Gao                                                                             \
3388513e88a2SSong Gao     max = LSX_LEN / BIT;                                                    \
3389513e88a2SSong Gao     for (i = 0; i < oprsz / (BIT / 8); i++) {                               \
3390513e88a2SSong Gao         j = i < max ? 1 : 2;                                                \
3391513e88a2SSong Gao         temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \
3392e93dd431SSong Gao     }                                                                       \
3393e93dd431SSong Gao     *Vd = temp;                                                             \
3394e93dd431SSong Gao }
3395e93dd431SSong Gao 
3396e93dd431SSong Gao VSHUF4I(vshuf4i_b, 8, B)
3397e93dd431SSong Gao VSHUF4I(vshuf4i_h, 16, H)
3398e93dd431SSong Gao VSHUF4I(vshuf4i_w, 32, W)
3399e93dd431SSong Gao 
HELPER(vshuf4i_d)3400329517d5SSong Gao void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3401e93dd431SSong Gao {
3402513e88a2SSong Gao     int i;
3403513e88a2SSong Gao     VReg temp = {};
3404329517d5SSong Gao     VReg *Vd = (VReg *)vd;
3405329517d5SSong Gao     VReg *Vj = (VReg *)vj;
3406513e88a2SSong Gao     int oprsz = simd_oprsz(desc);
3407e93dd431SSong Gao 
3408513e88a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
3409513e88a2SSong Gao         temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i);
3410513e88a2SSong Gao         temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
3411513e88a2SSong Gao     }
3412513e88a2SSong Gao     *Vd = temp;
3413513e88a2SSong Gao }
3414513e88a2SSong Gao 
HELPER(vperm_w)3415513e88a2SSong Gao void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
3416513e88a2SSong Gao {
3417513e88a2SSong Gao     int i, m;
3418513e88a2SSong Gao     VReg temp = {};
3419513e88a2SSong Gao     VReg *Vd = (VReg *)vd;
3420513e88a2SSong Gao     VReg *Vj = (VReg *)vj;
3421513e88a2SSong Gao     VReg *Vk = (VReg *)vk;
3422513e88a2SSong Gao 
3423513e88a2SSong Gao     m = LASX_LEN / 32;
3424513e88a2SSong Gao     for (i = 0; i < m ; i++) {
3425513e88a2SSong Gao         uint64_t k = (uint8_t)Vk->W(i) % 8;
3426513e88a2SSong Gao         temp.W(i) = Vj->W(k);
3427513e88a2SSong Gao     }
3428e93dd431SSong Gao     *Vd = temp;
3429e93dd431SSong Gao }
3430e93dd431SSong Gao 
HELPER(vpermi_w)3431329517d5SSong Gao void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3432e93dd431SSong Gao {
3433513e88a2SSong Gao     int i;
3434513e88a2SSong Gao     VReg temp = {};
3435513e88a2SSong Gao     VReg *Vd = (VReg *)vd;
3436513e88a2SSong Gao     VReg *Vj = (VReg *)vj;
3437513e88a2SSong Gao     int oprsz = simd_oprsz(desc);
3438513e88a2SSong Gao 
3439513e88a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {
3440513e88a2SSong Gao         temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
3441513e88a2SSong Gao         temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
3442513e88a2SSong Gao         temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
3443513e88a2SSong Gao         temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
3444513e88a2SSong Gao     }
3445513e88a2SSong Gao     *Vd = temp;
3446513e88a2SSong Gao }
3447513e88a2SSong Gao 
HELPER(vpermi_d)3448513e88a2SSong Gao void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3449513e88a2SSong Gao {
3450513e88a2SSong Gao     VReg temp = {};
3451513e88a2SSong Gao     VReg *Vd = (VReg *)vd;
3452513e88a2SSong Gao     VReg *Vj = (VReg *)vj;
3453513e88a2SSong Gao 
3454513e88a2SSong Gao     temp.D(0) = Vj->D(imm & 0x3);
3455513e88a2SSong Gao     temp.D(1) = Vj->D((imm >> 2) & 0x3);
3456513e88a2SSong Gao     temp.D(2) = Vj->D((imm >> 4) & 0x3);
3457513e88a2SSong Gao     temp.D(3) = Vj->D((imm >> 6) & 0x3);
3458513e88a2SSong Gao     *Vd = temp;
3459513e88a2SSong Gao }
3460513e88a2SSong Gao 
HELPER(vpermi_q)3461513e88a2SSong Gao void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3462513e88a2SSong Gao {
3463513e88a2SSong Gao     int i;
3464e93dd431SSong Gao     VReg temp;
3465329517d5SSong Gao     VReg *Vd = (VReg *)vd;
3466329517d5SSong Gao     VReg *Vj = (VReg *)vj;
3467e93dd431SSong Gao 
3468513e88a2SSong Gao     for (i = 0; i < 2; i++, imm >>= 4) {
3469513e88a2SSong Gao         temp.Q(i) = (imm & 2 ? Vd: Vj)->Q(imm & 1);
3470513e88a2SSong Gao     }
3471e93dd431SSong Gao     *Vd = temp;
3472e93dd431SSong Gao }
3473e93dd431SSong Gao 
3474e93dd431SSong Gao #define VEXTRINS(NAME, BIT, E, MASK)                               \
3475329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3476e93dd431SSong Gao {                                                                  \
3477513e88a2SSong Gao     int i, ins, extr, max;                                         \
3478329517d5SSong Gao     VReg *Vd = (VReg *)vd;                                         \
3479329517d5SSong Gao     VReg *Vj = (VReg *)vj;                                         \
3480513e88a2SSong Gao     int oprsz = simd_oprsz(desc);                                  \
3481e93dd431SSong Gao                                                                    \
3482513e88a2SSong Gao     max = LSX_LEN / BIT;                                           \
3483e93dd431SSong Gao     ins = (imm >> 4) & MASK;                                       \
3484e93dd431SSong Gao     extr = imm & MASK;                                             \
3485513e88a2SSong Gao     for (i = 0; i < oprsz / 16; i++) {                             \
3486513e88a2SSong Gao         Vd->E(ins + i * max) = Vj->E(extr + i * max);              \
3487513e88a2SSong Gao     }                                                              \
3488e93dd431SSong Gao }
3489e93dd431SSong Gao 
3490e93dd431SSong Gao VEXTRINS(vextrins_b, 8, B, 0xf)
3491e93dd431SSong Gao VEXTRINS(vextrins_h, 16, H, 0x7)
3492e93dd431SSong Gao VEXTRINS(vextrins_w, 32, W, 0x3)
3493e93dd431SSong Gao VEXTRINS(vextrins_d, 64, D, 0x1)
3494