1a0c9400aSSong Gao /* SPDX-License-Identifier: GPL-2.0-or-later */ 2a0c9400aSSong Gao /* 31dc33f26SSong Gao * QEMU LoongArch vector helper functions. 4a0c9400aSSong Gao * 5a0c9400aSSong Gao * Copyright (c) 2022-2023 Loongson Technology Corporation Limited 6a0c9400aSSong Gao */ 7c037fbc9SSong Gao 8c037fbc9SSong Gao #include "qemu/osdep.h" 9c037fbc9SSong Gao #include "cpu.h" 10c037fbc9SSong Gao #include "exec/exec-all.h" 11c037fbc9SSong Gao #include "exec/helper-proto.h" 12aca67472SSong Gao #include "fpu/softfloat.h" 13aca67472SSong Gao #include "internals.h" 14d0dfa19aSSong Gao #include "tcg/tcg.h" 15008a3b16SSong Gao #include "vec.h" 1664cf6b99SSong Gao #include "tcg/tcg-gvec-desc.h" 17c037fbc9SSong Gao 18c037fbc9SSong Gao #define DO_ADD(a, b) (a + b) 19c037fbc9SSong Gao #define DO_SUB(a, b) (a - b) 20c037fbc9SSong Gao 21c037fbc9SSong Gao #define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP) \ 2204711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 23c037fbc9SSong Gao { \ 24c037fbc9SSong Gao int i; \ 2504711da1SSong Gao VReg *Vd = (VReg *)vd; \ 2604711da1SSong Gao VReg *Vj = (VReg *)vj; \ 2704711da1SSong Gao VReg *Vk = (VReg *)vk; \ 28c037fbc9SSong Gao typedef __typeof(Vd->E1(0)) TD; \ 2964cf6b99SSong Gao int oprsz = simd_oprsz(desc); \ 30c037fbc9SSong Gao \ 3164cf6b99SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 32c037fbc9SSong Gao Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i)); \ 33c037fbc9SSong Gao } \ 34c037fbc9SSong Gao } 35c037fbc9SSong Gao 36c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD) 37c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD) 38c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD) 39c037fbc9SSong Gao 4004711da1SSong Gao void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 41c037fbc9SSong Gao { 4264cf6b99SSong Gao int i; 4304711da1SSong Gao VReg *Vd = (VReg *)vd; 4404711da1SSong Gao VReg *Vj = (VReg *)vj; 4504711da1SSong Gao VReg *Vk = (VReg *)vk; 
4664cf6b99SSong Gao int oprsz = simd_oprsz(desc); 47c037fbc9SSong Gao 4864cf6b99SSong Gao for (i = 0; i < oprsz / 16 ; i++) { 4964cf6b99SSong Gao Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)), 5064cf6b99SSong Gao int128_makes64(Vk->D(2 * i))); 5164cf6b99SSong Gao } 52c037fbc9SSong Gao } 53c037fbc9SSong Gao 54c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB) 55c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB) 56c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB) 57c037fbc9SSong Gao 5804711da1SSong Gao void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 59c037fbc9SSong Gao { 6064cf6b99SSong Gao int i; 6104711da1SSong Gao VReg *Vd = (VReg *)vd; 6204711da1SSong Gao VReg *Vj = (VReg *)vj; 6304711da1SSong Gao VReg *Vk = (VReg *)vk; 6464cf6b99SSong Gao int oprsz = simd_oprsz(desc); 65c037fbc9SSong Gao 6664cf6b99SSong Gao for (i = 0; i < oprsz / 16; i++) { 6764cf6b99SSong Gao Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), 6864cf6b99SSong Gao int128_makes64(Vk->D(2 * i))); 6964cf6b99SSong Gao } 70c037fbc9SSong Gao } 71c037fbc9SSong Gao 72c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD) 73c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD) 74c037fbc9SSong Gao DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD) 75c037fbc9SSong Gao 7604711da1SSong Gao void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) 77c037fbc9SSong Gao { 7864cf6b99SSong Gao int i; 7904711da1SSong Gao VReg *Vd = (VReg *)vd; 8004711da1SSong Gao VReg *Vj = (VReg *)vj; 8104711da1SSong Gao VReg *Vk = (VReg *)vk; 8264cf6b99SSong Gao int oprsz = simd_oprsz(desc); 83c037fbc9SSong Gao 8464cf6b99SSong Gao for (i = 0; i < oprsz / 16; i ++) { 8564cf6b99SSong Gao Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), 8664cf6b99SSong Gao int128_make64(Vk->UD(2 * i))); 8764cf6b99SSong Gao } 88c037fbc9SSong Gao } 89c037fbc9SSong Gao 90c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB) 91c037fbc9SSong Gao 
DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB) 92c037fbc9SSong Gao DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB) 93c037fbc9SSong Gao 9404711da1SSong Gao void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) 95c037fbc9SSong Gao { 9664cf6b99SSong Gao int i; 9704711da1SSong Gao VReg *Vd = (VReg *)vd; 9804711da1SSong Gao VReg *Vj = (VReg *)vj; 9904711da1SSong Gao VReg *Vk = (VReg *)vk; 10064cf6b99SSong Gao int oprsz = simd_oprsz(desc); 101c037fbc9SSong Gao 10264cf6b99SSong Gao for (i = 0; i < oprsz / 16; i++) { 10364cf6b99SSong Gao Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), 10464cf6b99SSong Gao int128_make64(Vk->UD(2 * i))); 10564cf6b99SSong Gao } 106c037fbc9SSong Gao } 1072d5f950cSSong Gao 1082d5f950cSSong Gao #define DO_EVEN(NAME, BIT, E1, E2, DO_OP) \ 10985995f07SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1102d5f950cSSong Gao { \ 1112d5f950cSSong Gao int i; \ 1122d5f950cSSong Gao VReg *Vd = (VReg *)vd; \ 1132d5f950cSSong Gao VReg *Vj = (VReg *)vj; \ 1142d5f950cSSong Gao VReg *Vk = (VReg *)vk; \ 1152d5f950cSSong Gao typedef __typeof(Vd->E1(0)) TD; \ 11685995f07SSong Gao int oprsz = simd_oprsz(desc); \ 11785995f07SSong Gao \ 11885995f07SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1192d5f950cSSong Gao Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i) ,(TD)Vk->E2(2 * i)); \ 1202d5f950cSSong Gao } \ 1212d5f950cSSong Gao } 1222d5f950cSSong Gao 1232d5f950cSSong Gao #define DO_ODD(NAME, BIT, E1, E2, DO_OP) \ 12485995f07SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1252d5f950cSSong Gao { \ 1262d5f950cSSong Gao int i; \ 1272d5f950cSSong Gao VReg *Vd = (VReg *)vd; \ 1282d5f950cSSong Gao VReg *Vj = (VReg *)vj; \ 1292d5f950cSSong Gao VReg *Vk = (VReg *)vk; \ 1302d5f950cSSong Gao typedef __typeof(Vd->E1(0)) TD; \ 13185995f07SSong Gao int oprsz = simd_oprsz(desc); \ 13285995f07SSong Gao \ 13385995f07SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1342d5f950cSSong Gao Vd->E1(i) = 
DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i + 1)); \ 1352d5f950cSSong Gao } \ 1362d5f950cSSong Gao } 1372d5f950cSSong Gao 13885995f07SSong Gao void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 1392d5f950cSSong Gao { 14085995f07SSong Gao int i; 1412d5f950cSSong Gao VReg *Vd = (VReg *)vd; 1422d5f950cSSong Gao VReg *Vj = (VReg *)vj; 1432d5f950cSSong Gao VReg *Vk = (VReg *)vk; 14485995f07SSong Gao int oprsz = simd_oprsz(desc); 1452d5f950cSSong Gao 14685995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 14785995f07SSong Gao Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)), 14885995f07SSong Gao int128_makes64(Vk->D(2 * i))); 14985995f07SSong Gao } 1502d5f950cSSong Gao } 1512d5f950cSSong Gao 1522d5f950cSSong Gao DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD) 1532d5f950cSSong Gao DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD) 1542d5f950cSSong Gao DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD) 1552d5f950cSSong Gao 15685995f07SSong Gao void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 1572d5f950cSSong Gao { 15885995f07SSong Gao int i; 1592d5f950cSSong Gao VReg *Vd = (VReg *)vd; 1602d5f950cSSong Gao VReg *Vj = (VReg *)vj; 1612d5f950cSSong Gao VReg *Vk = (VReg *)vk; 16285995f07SSong Gao int oprsz = simd_oprsz(desc); 1632d5f950cSSong Gao 16485995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 16585995f07SSong Gao Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)), 16685995f07SSong Gao int128_makes64(Vk->D(2 * i +1))); 16785995f07SSong Gao } 1682d5f950cSSong Gao } 1692d5f950cSSong Gao 1702d5f950cSSong Gao DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD) 1712d5f950cSSong Gao DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD) 1722d5f950cSSong Gao DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD) 1732d5f950cSSong Gao 17485995f07SSong Gao void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 1752d5f950cSSong Gao { 17685995f07SSong Gao int i; 1772d5f950cSSong Gao VReg *Vd = (VReg *)vd; 1782d5f950cSSong Gao VReg *Vj = (VReg *)vj; 1792d5f950cSSong Gao VReg *Vk = (VReg *)vk; 
18085995f07SSong Gao int oprsz = simd_oprsz(desc); 1812d5f950cSSong Gao 18285995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 18385995f07SSong Gao Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)), 18485995f07SSong Gao int128_makes64(Vk->D(2 * i))); 18585995f07SSong Gao } 1862d5f950cSSong Gao } 1872d5f950cSSong Gao 1882d5f950cSSong Gao DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB) 1892d5f950cSSong Gao DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB) 1902d5f950cSSong Gao DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB) 1912d5f950cSSong Gao 19285995f07SSong Gao void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 1932d5f950cSSong Gao { 19485995f07SSong Gao int i; 1952d5f950cSSong Gao VReg *Vd = (VReg *)vd; 1962d5f950cSSong Gao VReg *Vj = (VReg *)vj; 1972d5f950cSSong Gao VReg *Vk = (VReg *)vk; 19885995f07SSong Gao int oprsz = simd_oprsz(desc); 1992d5f950cSSong Gao 20085995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 20185995f07SSong Gao Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), 20285995f07SSong Gao int128_makes64(Vk->D(2 * i + 1))); 20385995f07SSong Gao } 2042d5f950cSSong Gao } 2052d5f950cSSong Gao 2062d5f950cSSong Gao DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB) 2072d5f950cSSong Gao DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB) 2082d5f950cSSong Gao DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB) 2092d5f950cSSong Gao 21085995f07SSong Gao void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 2112d5f950cSSong Gao { 21285995f07SSong Gao int i; 2132d5f950cSSong Gao VReg *Vd = (VReg *)vd; 2142d5f950cSSong Gao VReg *Vj = (VReg *)vj; 2152d5f950cSSong Gao VReg *Vk = (VReg *)vk; 21685995f07SSong Gao int oprsz = simd_oprsz(desc); 2172d5f950cSSong Gao 21885995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 21985995f07SSong Gao Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), 22085995f07SSong Gao int128_make64(Vk->UD(2 * i))); 22185995f07SSong Gao } 2222d5f950cSSong Gao } 2232d5f950cSSong Gao 2242d5f950cSSong Gao DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD) 2252d5f950cSSong Gao 
DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD) 2262d5f950cSSong Gao DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD) 2272d5f950cSSong Gao 22885995f07SSong Gao void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 2292d5f950cSSong Gao { 23085995f07SSong Gao int i; 2312d5f950cSSong Gao VReg *Vd = (VReg *)vd; 2322d5f950cSSong Gao VReg *Vj = (VReg *)vj; 2332d5f950cSSong Gao VReg *Vk = (VReg *)vk; 23485995f07SSong Gao int oprsz = simd_oprsz(desc); 2352d5f950cSSong Gao 23685995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 23785995f07SSong Gao Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), 23885995f07SSong Gao int128_make64(Vk->UD(2 * i + 1))); 23985995f07SSong Gao } 2402d5f950cSSong Gao } 2412d5f950cSSong Gao 2422d5f950cSSong Gao DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD) 2432d5f950cSSong Gao DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD) 2442d5f950cSSong Gao DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD) 2452d5f950cSSong Gao 24685995f07SSong Gao void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 2472d5f950cSSong Gao { 24885995f07SSong Gao int i; 2492d5f950cSSong Gao VReg *Vd = (VReg *)vd; 2502d5f950cSSong Gao VReg *Vj = (VReg *)vj; 2512d5f950cSSong Gao VReg *Vk = (VReg *)vk; 25285995f07SSong Gao int oprsz = simd_oprsz(desc); 2532d5f950cSSong Gao 25485995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 25585995f07SSong Gao Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)), 25685995f07SSong Gao int128_make64(Vk->UD(2 * i))); 25785995f07SSong Gao } 2582d5f950cSSong Gao } 2592d5f950cSSong Gao 2602d5f950cSSong Gao DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB) 2612d5f950cSSong Gao DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB) 2622d5f950cSSong Gao DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB) 2632d5f950cSSong Gao 26485995f07SSong Gao void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 2652d5f950cSSong Gao { 26685995f07SSong Gao int i; 2672d5f950cSSong Gao VReg *Vd = (VReg *)vd; 2682d5f950cSSong Gao VReg *Vj = (VReg *)vj; 2692d5f950cSSong Gao VReg 
*Vk = (VReg *)vk; 27085995f07SSong Gao int oprsz = simd_oprsz(desc); 2712d5f950cSSong Gao 27285995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 27385995f07SSong Gao Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), 27485995f07SSong Gao int128_make64(Vk->UD(2 * i + 1))); 27585995f07SSong Gao } 2762d5f950cSSong Gao } 2772d5f950cSSong Gao 2782d5f950cSSong Gao DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB) 2792d5f950cSSong Gao DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB) 2802d5f950cSSong Gao DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB) 2812d5f950cSSong Gao 2822d5f950cSSong Gao #define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 28385995f07SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 2842d5f950cSSong Gao { \ 2852d5f950cSSong Gao int i; \ 2862d5f950cSSong Gao VReg *Vd = (VReg *)vd; \ 2872d5f950cSSong Gao VReg *Vj = (VReg *)vj; \ 2882d5f950cSSong Gao VReg *Vk = (VReg *)vk; \ 2892d5f950cSSong Gao typedef __typeof(Vd->ES1(0)) TDS; \ 2902d5f950cSSong Gao typedef __typeof(Vd->EU1(0)) TDU; \ 29185995f07SSong Gao int oprsz = simd_oprsz(desc); \ 29285995f07SSong Gao \ 29385995f07SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2942d5f950cSSong Gao Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i) ,(TDS)Vk->ES2(2 * i)); \ 2952d5f950cSSong Gao } \ 2962d5f950cSSong Gao } 2972d5f950cSSong Gao 2982d5f950cSSong Gao #define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 29985995f07SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3002d5f950cSSong Gao { \ 3012d5f950cSSong Gao int i; \ 3022d5f950cSSong Gao VReg *Vd = (VReg *)vd; \ 3032d5f950cSSong Gao VReg *Vj = (VReg *)vj; \ 3042d5f950cSSong Gao VReg *Vk = (VReg *)vk; \ 3052d5f950cSSong Gao typedef __typeof(Vd->ES1(0)) TDS; \ 3062d5f950cSSong Gao typedef __typeof(Vd->EU1(0)) TDU; \ 30785995f07SSong Gao int oprsz = simd_oprsz(desc); \ 30885995f07SSong Gao \ 30985995f07SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3102d5f950cSSong Gao Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i + 
1), (TDS)Vk->ES2(2 * i + 1)); \ 3112d5f950cSSong Gao } \ 3122d5f950cSSong Gao } 3132d5f950cSSong Gao 31485995f07SSong Gao void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) 3152d5f950cSSong Gao { 31685995f07SSong Gao int i; 3172d5f950cSSong Gao VReg *Vd = (VReg *)vd; 3182d5f950cSSong Gao VReg *Vj = (VReg *)vj; 3192d5f950cSSong Gao VReg *Vk = (VReg *)vk; 32085995f07SSong Gao int oprsz = simd_oprsz(desc); 3212d5f950cSSong Gao 32285995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 32385995f07SSong Gao Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), 32485995f07SSong Gao int128_makes64(Vk->D(2 * i))); 32585995f07SSong Gao } 3262d5f950cSSong Gao } 3272d5f950cSSong Gao 3282d5f950cSSong Gao DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD) 3292d5f950cSSong Gao DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD) 3302d5f950cSSong Gao DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD) 3312d5f950cSSong Gao 33285995f07SSong Gao void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) 3332d5f950cSSong Gao { 33485995f07SSong Gao int i; 3352d5f950cSSong Gao VReg *Vd = (VReg *)vd; 3362d5f950cSSong Gao VReg *Vj = (VReg *)vj; 3372d5f950cSSong Gao VReg *Vk = (VReg *)vk; 33885995f07SSong Gao int oprsz = simd_oprsz(desc); 3392d5f950cSSong Gao 34085995f07SSong Gao for (i = 0; i < oprsz / 16; i++) { 34185995f07SSong Gao Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), 34285995f07SSong Gao int128_makes64(Vk->D(2 * i + 1))); 34385995f07SSong Gao } 3442d5f950cSSong Gao } 3452d5f950cSSong Gao 3462d5f950cSSong Gao DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD) 3472d5f950cSSong Gao DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD) 3482d5f950cSSong Gao DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD) 34939e9b0a7SSong Gao 35039e9b0a7SSong Gao #define DO_VAVG(a, b) ((a >> 1) + (b >> 1) + (a & b & 1)) 35139e9b0a7SSong Gao #define DO_VAVGR(a, b) ((a >> 1) + (b >> 1) + ((a | b) & 1)) 35239e9b0a7SSong Gao 35339e9b0a7SSong Gao 
#define DO_3OP(NAME, BIT, E, DO_OP) \ 354ee7250d0SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 35539e9b0a7SSong Gao { \ 35639e9b0a7SSong Gao int i; \ 35739e9b0a7SSong Gao VReg *Vd = (VReg *)vd; \ 35839e9b0a7SSong Gao VReg *Vj = (VReg *)vj; \ 35939e9b0a7SSong Gao VReg *Vk = (VReg *)vk; \ 360ee7250d0SSong Gao int oprsz = simd_oprsz(desc); \ 361ee7250d0SSong Gao \ 362ee7250d0SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 36339e9b0a7SSong Gao Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ 36439e9b0a7SSong Gao } \ 36539e9b0a7SSong Gao } 36639e9b0a7SSong Gao 36739e9b0a7SSong Gao DO_3OP(vavg_b, 8, B, DO_VAVG) 36839e9b0a7SSong Gao DO_3OP(vavg_h, 16, H, DO_VAVG) 36939e9b0a7SSong Gao DO_3OP(vavg_w, 32, W, DO_VAVG) 37039e9b0a7SSong Gao DO_3OP(vavg_d, 64, D, DO_VAVG) 37139e9b0a7SSong Gao DO_3OP(vavgr_b, 8, B, DO_VAVGR) 37239e9b0a7SSong Gao DO_3OP(vavgr_h, 16, H, DO_VAVGR) 37339e9b0a7SSong Gao DO_3OP(vavgr_w, 32, W, DO_VAVGR) 37439e9b0a7SSong Gao DO_3OP(vavgr_d, 64, D, DO_VAVGR) 37539e9b0a7SSong Gao DO_3OP(vavg_bu, 8, UB, DO_VAVG) 37639e9b0a7SSong Gao DO_3OP(vavg_hu, 16, UH, DO_VAVG) 37739e9b0a7SSong Gao DO_3OP(vavg_wu, 32, UW, DO_VAVG) 37839e9b0a7SSong Gao DO_3OP(vavg_du, 64, UD, DO_VAVG) 37939e9b0a7SSong Gao DO_3OP(vavgr_bu, 8, UB, DO_VAVGR) 38039e9b0a7SSong Gao DO_3OP(vavgr_hu, 16, UH, DO_VAVGR) 38139e9b0a7SSong Gao DO_3OP(vavgr_wu, 32, UW, DO_VAVGR) 38239e9b0a7SSong Gao DO_3OP(vavgr_du, 64, UD, DO_VAVGR) 38349725659SSong Gao 38449725659SSong Gao #define DO_VABSD(a, b) ((a > b) ? 
(a -b) : (b-a)) 38549725659SSong Gao 38649725659SSong Gao DO_3OP(vabsd_b, 8, B, DO_VABSD) 38749725659SSong Gao DO_3OP(vabsd_h, 16, H, DO_VABSD) 38849725659SSong Gao DO_3OP(vabsd_w, 32, W, DO_VABSD) 38949725659SSong Gao DO_3OP(vabsd_d, 64, D, DO_VABSD) 39049725659SSong Gao DO_3OP(vabsd_bu, 8, UB, DO_VABSD) 39149725659SSong Gao DO_3OP(vabsd_hu, 16, UH, DO_VABSD) 39249725659SSong Gao DO_3OP(vabsd_wu, 32, UW, DO_VABSD) 39349725659SSong Gao DO_3OP(vabsd_du, 64, UD, DO_VABSD) 394af448cb3SSong Gao 395af448cb3SSong Gao #define DO_VABS(a) ((a < 0) ? (-a) : (a)) 396af448cb3SSong Gao 39727f5485dSSong Gao #define DO_VADDA(NAME, BIT, E) \ 39827f5485dSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 399af448cb3SSong Gao { \ 400af448cb3SSong Gao int i; \ 401af448cb3SSong Gao VReg *Vd = (VReg *)vd; \ 402af448cb3SSong Gao VReg *Vj = (VReg *)vj; \ 403af448cb3SSong Gao VReg *Vk = (VReg *)vk; \ 40427f5485dSSong Gao int oprsz = simd_oprsz(desc); \ 40527f5485dSSong Gao \ 40627f5485dSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 40727f5485dSSong Gao Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \ 408af448cb3SSong Gao } \ 409af448cb3SSong Gao } 410af448cb3SSong Gao 41127f5485dSSong Gao DO_VADDA(vadda_b, 8, B) 41227f5485dSSong Gao DO_VADDA(vadda_h, 16, H) 41327f5485dSSong Gao DO_VADDA(vadda_w, 32, W) 41427f5485dSSong Gao DO_VADDA(vadda_d, 64, D) 4159ab29520SSong Gao 4169ab29520SSong Gao #define DO_MIN(a, b) (a < b ? a : b) 4179ab29520SSong Gao #define DO_MAX(a, b) (a > b ? 
a : b) 4189ab29520SSong Gao 4199ab29520SSong Gao #define VMINMAXI(NAME, BIT, E, DO_OP) \ 420c09360faSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 4219ab29520SSong Gao { \ 4229ab29520SSong Gao int i; \ 4239ab29520SSong Gao VReg *Vd = (VReg *)vd; \ 4249ab29520SSong Gao VReg *Vj = (VReg *)vj; \ 4259ab29520SSong Gao typedef __typeof(Vd->E(0)) TD; \ 426c09360faSSong Gao int oprsz = simd_oprsz(desc); \ 4279ab29520SSong Gao \ 428c09360faSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 4299ab29520SSong Gao Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ 4309ab29520SSong Gao } \ 4319ab29520SSong Gao } 4329ab29520SSong Gao 4339ab29520SSong Gao VMINMAXI(vmini_b, 8, B, DO_MIN) 4349ab29520SSong Gao VMINMAXI(vmini_h, 16, H, DO_MIN) 4359ab29520SSong Gao VMINMAXI(vmini_w, 32, W, DO_MIN) 4369ab29520SSong Gao VMINMAXI(vmini_d, 64, D, DO_MIN) 4379ab29520SSong Gao VMINMAXI(vmaxi_b, 8, B, DO_MAX) 4389ab29520SSong Gao VMINMAXI(vmaxi_h, 16, H, DO_MAX) 4399ab29520SSong Gao VMINMAXI(vmaxi_w, 32, W, DO_MAX) 4409ab29520SSong Gao VMINMAXI(vmaxi_d, 64, D, DO_MAX) 4419ab29520SSong Gao VMINMAXI(vmini_bu, 8, UB, DO_MIN) 4429ab29520SSong Gao VMINMAXI(vmini_hu, 16, UH, DO_MIN) 4439ab29520SSong Gao VMINMAXI(vmini_wu, 32, UW, DO_MIN) 4449ab29520SSong Gao VMINMAXI(vmini_du, 64, UD, DO_MIN) 4459ab29520SSong Gao VMINMAXI(vmaxi_bu, 8, UB, DO_MAX) 4469ab29520SSong Gao VMINMAXI(vmaxi_hu, 16, UH, DO_MAX) 4479ab29520SSong Gao VMINMAXI(vmaxi_wu, 32, UW, DO_MAX) 4489ab29520SSong Gao VMINMAXI(vmaxi_du, 64, UD, DO_MAX) 449cd1c49adSSong Gao 450cd1c49adSSong Gao #define DO_VMUH(NAME, BIT, E1, E2, DO_OP) \ 451342dc1cfSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 452cd1c49adSSong Gao { \ 453cd1c49adSSong Gao int i; \ 454cd1c49adSSong Gao VReg *Vd = (VReg *)vd; \ 455cd1c49adSSong Gao VReg *Vj = (VReg *)vj; \ 456cd1c49adSSong Gao VReg *Vk = (VReg *)vk; \ 457cd1c49adSSong Gao typedef __typeof(Vd->E1(0)) T; \ 458342dc1cfSSong Gao int oprsz = simd_oprsz(desc); \ 
459cd1c49adSSong Gao \ 460342dc1cfSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 461cd1c49adSSong Gao Vd->E2(i) = ((T)Vj->E2(i)) * ((T)Vk->E2(i)) >> BIT; \ 462cd1c49adSSong Gao } \ 463cd1c49adSSong Gao } 464cd1c49adSSong Gao 465342dc1cfSSong Gao void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc) 466cd1c49adSSong Gao { 467342dc1cfSSong Gao int i; 468342dc1cfSSong Gao uint64_t l, h; 469cd1c49adSSong Gao VReg *Vd = (VReg *)vd; 470cd1c49adSSong Gao VReg *Vj = (VReg *)vj; 471cd1c49adSSong Gao VReg *Vk = (VReg *)vk; 472342dc1cfSSong Gao int oprsz = simd_oprsz(desc); 473cd1c49adSSong Gao 474342dc1cfSSong Gao for (i = 0; i < oprsz / 8; i++) { 475342dc1cfSSong Gao muls64(&l, &h, Vj->D(i), Vk->D(i)); 476342dc1cfSSong Gao Vd->D(i) = h; 477342dc1cfSSong Gao } 478cd1c49adSSong Gao } 479cd1c49adSSong Gao 480cd1c49adSSong Gao DO_VMUH(vmuh_b, 8, H, B, DO_MUH) 481cd1c49adSSong Gao DO_VMUH(vmuh_h, 16, W, H, DO_MUH) 482cd1c49adSSong Gao DO_VMUH(vmuh_w, 32, D, W, DO_MUH) 483cd1c49adSSong Gao 484342dc1cfSSong Gao void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc) 485cd1c49adSSong Gao { 486342dc1cfSSong Gao int i; 487342dc1cfSSong Gao uint64_t l, h; 488cd1c49adSSong Gao VReg *Vd = (VReg *)vd; 489cd1c49adSSong Gao VReg *Vj = (VReg *)vj; 490cd1c49adSSong Gao VReg *Vk = (VReg *)vk; 491342dc1cfSSong Gao int oprsz = simd_oprsz(desc); 492cd1c49adSSong Gao 493342dc1cfSSong Gao for (i = 0; i < oprsz / 8; i++) { 494342dc1cfSSong Gao mulu64(&l, &h, Vj->D(i), Vk->D(i)); 495342dc1cfSSong Gao Vd->D(i) = h; 496342dc1cfSSong Gao } 497cd1c49adSSong Gao } 498cd1c49adSSong Gao 499cd1c49adSSong Gao DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH) 500cd1c49adSSong Gao DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH) 501cd1c49adSSong Gao DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH) 502cd1c49adSSong Gao 503cd1c49adSSong Gao #define DO_MUL(a, b) (a * b) 504cd1c49adSSong Gao 505cd1c49adSSong Gao DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL) 506cd1c49adSSong Gao DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL) 
507cd1c49adSSong Gao DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL) 508cd1c49adSSong Gao 509cd1c49adSSong Gao DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL) 510cd1c49adSSong Gao DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL) 511cd1c49adSSong Gao DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL) 512cd1c49adSSong Gao 513cd1c49adSSong Gao DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL) 514cd1c49adSSong Gao DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL) 515cd1c49adSSong Gao DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL) 516cd1c49adSSong Gao 517cd1c49adSSong Gao DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL) 518cd1c49adSSong Gao DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL) 519cd1c49adSSong Gao DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL) 520cd1c49adSSong Gao 521cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) 522cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) 523cd1c49adSSong Gao DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) 524cd1c49adSSong Gao 525cd1c49adSSong Gao DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) 526cd1c49adSSong Gao DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) 527cd1c49adSSong Gao DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) 528d3aec65bSSong Gao 529d3aec65bSSong Gao #define DO_MADD(a, b, c) (a + b * c) 530d3aec65bSSong Gao #define DO_MSUB(a, b, c) (a - b * c) 531d3aec65bSSong Gao 532d3aec65bSSong Gao #define VMADDSUB(NAME, BIT, E, DO_OP) \ 5333f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 534d3aec65bSSong Gao { \ 535d3aec65bSSong Gao int i; \ 536d3aec65bSSong Gao VReg *Vd = (VReg *)vd; \ 537d3aec65bSSong Gao VReg *Vj = (VReg *)vj; \ 538d3aec65bSSong Gao VReg *Vk = (VReg *)vk; \ 5393f450c17SSong Gao int oprsz = simd_oprsz(desc); \ 5403f450c17SSong Gao \ 5413f450c17SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 542d3aec65bSSong Gao Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i)); \ 543d3aec65bSSong Gao } \ 544d3aec65bSSong Gao } 545d3aec65bSSong Gao 546d3aec65bSSong Gao VMADDSUB(vmadd_b, 8, B, DO_MADD) 
547d3aec65bSSong Gao VMADDSUB(vmadd_h, 16, H, DO_MADD) 548d3aec65bSSong Gao VMADDSUB(vmadd_w, 32, W, DO_MADD) 549d3aec65bSSong Gao VMADDSUB(vmadd_d, 64, D, DO_MADD) 550d3aec65bSSong Gao VMADDSUB(vmsub_b, 8, B, DO_MSUB) 551d3aec65bSSong Gao VMADDSUB(vmsub_h, 16, H, DO_MSUB) 552d3aec65bSSong Gao VMADDSUB(vmsub_w, 32, W, DO_MSUB) 553d3aec65bSSong Gao VMADDSUB(vmsub_d, 64, D, DO_MSUB) 554d3aec65bSSong Gao 555d3aec65bSSong Gao #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \ 5563f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 557d3aec65bSSong Gao { \ 558d3aec65bSSong Gao int i; \ 559d3aec65bSSong Gao VReg *Vd = (VReg *)vd; \ 560d3aec65bSSong Gao VReg *Vj = (VReg *)vj; \ 561d3aec65bSSong Gao VReg *Vk = (VReg *)vk; \ 562d3aec65bSSong Gao typedef __typeof(Vd->E1(0)) TD; \ 5633f450c17SSong Gao int oprsz = simd_oprsz(desc); \ 564d3aec65bSSong Gao \ 5653f450c17SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 566d3aec65bSSong Gao Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \ 567d3aec65bSSong Gao } \ 568d3aec65bSSong Gao } 569d3aec65bSSong Gao 570d3aec65bSSong Gao VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL) 571d3aec65bSSong Gao VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL) 572d3aec65bSSong Gao VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL) 573d3aec65bSSong Gao VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL) 574d3aec65bSSong Gao VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL) 575d3aec65bSSong Gao VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL) 576d3aec65bSSong Gao 577d3aec65bSSong Gao #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \ 5783f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 579d3aec65bSSong Gao { \ 580d3aec65bSSong Gao int i; \ 581d3aec65bSSong Gao VReg *Vd = (VReg *)vd; \ 582d3aec65bSSong Gao VReg *Vj = (VReg *)vj; \ 583d3aec65bSSong Gao VReg *Vk = (VReg *)vk; \ 584d3aec65bSSong Gao typedef __typeof(Vd->E1(0)) TD; \ 5853f450c17SSong Gao int oprsz = simd_oprsz(desc); \ 586d3aec65bSSong Gao \ 5873f450c17SSong Gao for 
(i = 0; i < oprsz / (BIT / 8); i++) { \ 588d3aec65bSSong Gao Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \ 589d3aec65bSSong Gao (TD)Vk->E2(2 * i + 1)); \ 590d3aec65bSSong Gao } \ 591d3aec65bSSong Gao } 592d3aec65bSSong Gao 593d3aec65bSSong Gao VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL) 594d3aec65bSSong Gao VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL) 595d3aec65bSSong Gao VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL) 596d3aec65bSSong Gao VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL) 597d3aec65bSSong Gao VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL) 598d3aec65bSSong Gao VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL) 599d3aec65bSSong Gao 600d3aec65bSSong Gao #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 6013f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 602d3aec65bSSong Gao { \ 603d3aec65bSSong Gao int i; \ 604d3aec65bSSong Gao VReg *Vd = (VReg *)vd; \ 605d3aec65bSSong Gao VReg *Vj = (VReg *)vj; \ 606d3aec65bSSong Gao VReg *Vk = (VReg *)vk; \ 607d3aec65bSSong Gao typedef __typeof(Vd->ES1(0)) TS1; \ 608d3aec65bSSong Gao typedef __typeof(Vd->EU1(0)) TU1; \ 6093f450c17SSong Gao int oprsz = simd_oprsz(desc); \ 610d3aec65bSSong Gao \ 6113f450c17SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 612d3aec65bSSong Gao Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \ 613d3aec65bSSong Gao (TS1)Vk->ES2(2 * i)); \ 614d3aec65bSSong Gao } \ 615d3aec65bSSong Gao } 616d3aec65bSSong Gao 617d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) 618d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) 619d3aec65bSSong Gao VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) 620d3aec65bSSong Gao 621d3aec65bSSong Gao #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 6223f450c17SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 623d3aec65bSSong Gao { \ 624d3aec65bSSong Gao int i; \ 625d3aec65bSSong Gao VReg *Vd = (VReg *)vd; \ 626d3aec65bSSong Gao VReg *Vj = (VReg *)vj; \ 
627d3aec65bSSong Gao VReg *Vk = (VReg *)vk; \ 628d3aec65bSSong Gao typedef __typeof(Vd->ES1(0)) TS1; \ 629d3aec65bSSong Gao typedef __typeof(Vd->EU1(0)) TU1; \ 6303f450c17SSong Gao int oprsz = simd_oprsz(desc); \ 631d3aec65bSSong Gao \ 6323f450c17SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 633d3aec65bSSong Gao Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \ 634d3aec65bSSong Gao (TS1)Vk->ES2(2 * i + 1)); \ 635d3aec65bSSong Gao } \ 636d3aec65bSSong Gao } 637d3aec65bSSong Gao 638d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) 639d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) 640d3aec65bSSong Gao VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) 6414cc4c0f7SSong Gao 6424cc4c0f7SSong Gao #define DO_DIVU(N, M) (unlikely(M == 0) ? 0 : N / M) 6434cc4c0f7SSong Gao #define DO_REMU(N, M) (unlikely(M == 0) ? 0 : N % M) 6444cc4c0f7SSong Gao #define DO_DIV(N, M) (unlikely(M == 0) ? 0 :\ 6454cc4c0f7SSong Gao unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M) 6464cc4c0f7SSong Gao #define DO_REM(N, M) (unlikely(M == 0) ? 0 :\ 6474cc4c0f7SSong Gao unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 
0 : N % M) 6484cc4c0f7SSong Gao 6494cc4c0f7SSong Gao #define VDIV(NAME, BIT, E, DO_OP) \ 65004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 6514cc4c0f7SSong Gao { \ 6524cc4c0f7SSong Gao int i; \ 65304711da1SSong Gao VReg *Vd = (VReg *)vd; \ 65404711da1SSong Gao VReg *Vj = (VReg *)vj; \ 65504711da1SSong Gao VReg *Vk = (VReg *)vk; \ 656abb693deSSong Gao int oprsz = simd_oprsz(desc); \ 657abb693deSSong Gao \ 658abb693deSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 6594cc4c0f7SSong Gao Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ 6604cc4c0f7SSong Gao } \ 6614cc4c0f7SSong Gao } 6624cc4c0f7SSong Gao 6634cc4c0f7SSong Gao VDIV(vdiv_b, 8, B, DO_DIV) 6644cc4c0f7SSong Gao VDIV(vdiv_h, 16, H, DO_DIV) 6654cc4c0f7SSong Gao VDIV(vdiv_w, 32, W, DO_DIV) 6664cc4c0f7SSong Gao VDIV(vdiv_d, 64, D, DO_DIV) 6674cc4c0f7SSong Gao VDIV(vdiv_bu, 8, UB, DO_DIVU) 6684cc4c0f7SSong Gao VDIV(vdiv_hu, 16, UH, DO_DIVU) 6694cc4c0f7SSong Gao VDIV(vdiv_wu, 32, UW, DO_DIVU) 6704cc4c0f7SSong Gao VDIV(vdiv_du, 64, UD, DO_DIVU) 6714cc4c0f7SSong Gao VDIV(vmod_b, 8, B, DO_REM) 6724cc4c0f7SSong Gao VDIV(vmod_h, 16, H, DO_REM) 6734cc4c0f7SSong Gao VDIV(vmod_w, 32, W, DO_REM) 6744cc4c0f7SSong Gao VDIV(vmod_d, 64, D, DO_REM) 6754cc4c0f7SSong Gao VDIV(vmod_bu, 8, UB, DO_REMU) 6764cc4c0f7SSong Gao VDIV(vmod_hu, 16, UH, DO_REMU) 6774cc4c0f7SSong Gao VDIV(vmod_wu, 32, UW, DO_REMU) 6784cc4c0f7SSong Gao VDIV(vmod_du, 64, UD, DO_REMU) 679cbe44190SSong Gao 680cbe44190SSong Gao #define VSAT_S(NAME, BIT, E) \ 681e5c7f031SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ 682cbe44190SSong Gao { \ 683cbe44190SSong Gao int i; \ 684cbe44190SSong Gao VReg *Vd = (VReg *)vd; \ 685cbe44190SSong Gao VReg *Vj = (VReg *)vj; \ 686cbe44190SSong Gao typedef __typeof(Vd->E(0)) TD; \ 687e5c7f031SSong Gao int oprsz = simd_oprsz(desc); \ 688cbe44190SSong Gao \ 689e5c7f031SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 690cbe44190SSong Gao Vd->E(i) = Vj->E(i) > (TD)max 
? (TD)max : \ 691cbe44190SSong Gao Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i); \ 692cbe44190SSong Gao } \ 693cbe44190SSong Gao } 694cbe44190SSong Gao 695cbe44190SSong Gao VSAT_S(vsat_b, 8, B) 696cbe44190SSong Gao VSAT_S(vsat_h, 16, H) 697cbe44190SSong Gao VSAT_S(vsat_w, 32, W) 698cbe44190SSong Gao VSAT_S(vsat_d, 64, D) 699cbe44190SSong Gao 700cbe44190SSong Gao #define VSAT_U(NAME, BIT, E) \ 701e5c7f031SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ 702cbe44190SSong Gao { \ 703cbe44190SSong Gao int i; \ 704cbe44190SSong Gao VReg *Vd = (VReg *)vd; \ 705cbe44190SSong Gao VReg *Vj = (VReg *)vj; \ 706cbe44190SSong Gao typedef __typeof(Vd->E(0)) TD; \ 707e5c7f031SSong Gao int oprsz = simd_oprsz(desc); \ 708cbe44190SSong Gao \ 709e5c7f031SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 710cbe44190SSong Gao Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i); \ 711cbe44190SSong Gao } \ 712cbe44190SSong Gao } 713cbe44190SSong Gao 714cbe44190SSong Gao VSAT_U(vsat_bu, 8, UB) 715cbe44190SSong Gao VSAT_U(vsat_hu, 16, UH) 716cbe44190SSong Gao VSAT_U(vsat_wu, 32, UW) 717cbe44190SSong Gao VSAT_U(vsat_du, 64, UD) 7183734ad93SSong Gao 7193734ad93SSong Gao #define VEXTH(NAME, BIT, E1, E2) \ 720ff27e335SSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 7213734ad93SSong Gao { \ 722f0db0bebSSong Gao int i, j, ofs; \ 723ff27e335SSong Gao VReg *Vd = (VReg *)vd; \ 724ff27e335SSong Gao VReg *Vj = (VReg *)vj; \ 725f0db0bebSSong Gao int oprsz = simd_oprsz(desc); \ 7263734ad93SSong Gao \ 727f0db0bebSSong Gao ofs = LSX_LEN / BIT; \ 728f0db0bebSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 729f0db0bebSSong Gao for (j = 0; j < ofs; j++) { \ 730f0db0bebSSong Gao Vd->E1(j + i * ofs) = Vj->E2(j + ofs + ofs * 2 * i); \ 731f0db0bebSSong Gao } \ 7323734ad93SSong Gao } \ 7333734ad93SSong Gao } 7343734ad93SSong Gao 735ff27e335SSong Gao void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc) 7363734ad93SSong Gao { 737f0db0bebSSong Gao int i; 
738ff27e335SSong Gao VReg *Vd = (VReg *)vd; 739ff27e335SSong Gao VReg *Vj = (VReg *)vj; 740f0db0bebSSong Gao int oprsz = simd_oprsz(desc); 7413734ad93SSong Gao 742f0db0bebSSong Gao for (i = 0; i < oprsz / 16; i++) { 743f0db0bebSSong Gao Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1)); 744f0db0bebSSong Gao } 7453734ad93SSong Gao } 7463734ad93SSong Gao 747ff27e335SSong Gao void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc) 7483734ad93SSong Gao { 749f0db0bebSSong Gao int i; 750ff27e335SSong Gao VReg *Vd = (VReg *)vd; 751ff27e335SSong Gao VReg *Vj = (VReg *)vj; 752f0db0bebSSong Gao int oprsz = simd_oprsz(desc); 7533734ad93SSong Gao 754f0db0bebSSong Gao for (i = 0; i < oprsz / 16; i++) { 755f0db0bebSSong Gao Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1)); 756f0db0bebSSong Gao } 7573734ad93SSong Gao } 7583734ad93SSong Gao 7593734ad93SSong Gao VEXTH(vexth_h_b, 16, H, B) 7603734ad93SSong Gao VEXTH(vexth_w_h, 32, W, H) 7613734ad93SSong Gao VEXTH(vexth_d_w, 64, D, W) 7623734ad93SSong Gao VEXTH(vexth_hu_bu, 16, UH, UB) 7633734ad93SSong Gao VEXTH(vexth_wu_hu, 32, UW, UH) 7643734ad93SSong Gao VEXTH(vexth_du_wu, 64, UD, UW) 765f0e395dfSSong Gao 766790acb2aSSong Gao #define VEXT2XV(NAME, BIT, E1, E2) \ 767790acb2aSSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 768790acb2aSSong Gao { \ 769790acb2aSSong Gao int i; \ 770790acb2aSSong Gao VReg temp = {}; \ 771790acb2aSSong Gao VReg *Vd = (VReg *)vd; \ 772790acb2aSSong Gao VReg *Vj = (VReg *)vj; \ 773790acb2aSSong Gao int oprsz = simd_oprsz(desc); \ 774790acb2aSSong Gao \ 775790acb2aSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 776790acb2aSSong Gao temp.E1(i) = Vj->E2(i); \ 777790acb2aSSong Gao } \ 778790acb2aSSong Gao *Vd = temp; \ 779790acb2aSSong Gao } 780790acb2aSSong Gao 781790acb2aSSong Gao VEXT2XV(vext2xv_h_b, 16, H, B) 782790acb2aSSong Gao VEXT2XV(vext2xv_w_b, 32, W, B) 783790acb2aSSong Gao VEXT2XV(vext2xv_d_b, 64, D, B) 784790acb2aSSong Gao VEXT2XV(vext2xv_w_h, 32, W, H) 785790acb2aSSong Gao 
VEXT2XV(vext2xv_d_h, 64, D, H) 786790acb2aSSong Gao VEXT2XV(vext2xv_d_w, 64, D, W) 787790acb2aSSong Gao VEXT2XV(vext2xv_hu_bu, 16, UH, UB) 788790acb2aSSong Gao VEXT2XV(vext2xv_wu_bu, 32, UW, UB) 789790acb2aSSong Gao VEXT2XV(vext2xv_du_bu, 64, UD, UB) 790790acb2aSSong Gao VEXT2XV(vext2xv_wu_hu, 32, UW, UH) 791790acb2aSSong Gao VEXT2XV(vext2xv_du_hu, 64, UD, UH) 792790acb2aSSong Gao VEXT2XV(vext2xv_du_wu, 64, UD, UW) 793790acb2aSSong Gao 794f0e395dfSSong Gao #define DO_SIGNCOV(a, b) (a == 0 ? 0 : a < 0 ? -b : b) 795f0e395dfSSong Gao 796f0e395dfSSong Gao DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV) 797f0e395dfSSong Gao DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV) 798f0e395dfSSong Gao DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV) 799f0e395dfSSong Gao DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV) 800789f4a4cSSong Gao 801789f4a4cSSong Gao static uint64_t do_vmskltz_b(int64_t val) 802789f4a4cSSong Gao { 803789f4a4cSSong Gao uint64_t m = 0x8080808080808080ULL; 804789f4a4cSSong Gao uint64_t c = val & m; 805789f4a4cSSong Gao c |= c << 7; 806789f4a4cSSong Gao c |= c << 14; 807789f4a4cSSong Gao c |= c << 28; 808789f4a4cSSong Gao return c >> 56; 809789f4a4cSSong Gao } 810789f4a4cSSong Gao 811ff27e335SSong Gao void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc) 812789f4a4cSSong Gao { 81397074674SSong Gao int i; 814789f4a4cSSong Gao uint16_t temp = 0; 815ff27e335SSong Gao VReg *Vd = (VReg *)vd; 816ff27e335SSong Gao VReg *Vj = (VReg *)vj; 81797074674SSong Gao int oprsz = simd_oprsz(desc); 818789f4a4cSSong Gao 81997074674SSong Gao for (i = 0; i < oprsz / 16; i++) { 82097074674SSong Gao temp = 0; 82197074674SSong Gao temp = do_vmskltz_b(Vj->D(2 * i)); 82297074674SSong Gao temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); 82397074674SSong Gao Vd->D(2 * i) = temp; 82497074674SSong Gao Vd->D(2 * i + 1) = 0; 82597074674SSong Gao } 826789f4a4cSSong Gao } 827789f4a4cSSong Gao 828789f4a4cSSong Gao static uint64_t do_vmskltz_h(int64_t val) 829789f4a4cSSong Gao { 830789f4a4cSSong Gao uint64_t m = 0x8000800080008000ULL; 
831789f4a4cSSong Gao uint64_t c = val & m; 832789f4a4cSSong Gao c |= c << 15; 833789f4a4cSSong Gao c |= c << 30; 834789f4a4cSSong Gao return c >> 60; 835789f4a4cSSong Gao } 836789f4a4cSSong Gao 837ff27e335SSong Gao void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc) 838789f4a4cSSong Gao { 83997074674SSong Gao int i; 840789f4a4cSSong Gao uint16_t temp = 0; 841ff27e335SSong Gao VReg *Vd = (VReg *)vd; 842ff27e335SSong Gao VReg *Vj = (VReg *)vj; 84397074674SSong Gao int oprsz = simd_oprsz(desc); 844789f4a4cSSong Gao 84597074674SSong Gao for (i = 0; i < oprsz / 16; i++) { 84697074674SSong Gao temp = 0; 84797074674SSong Gao temp = do_vmskltz_h(Vj->D(2 * i)); 84897074674SSong Gao temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4); 84997074674SSong Gao Vd->D(2 * i) = temp; 85097074674SSong Gao Vd->D(2 * i + 1) = 0; 85197074674SSong Gao } 852789f4a4cSSong Gao } 853789f4a4cSSong Gao 854789f4a4cSSong Gao static uint64_t do_vmskltz_w(int64_t val) 855789f4a4cSSong Gao { 856789f4a4cSSong Gao uint64_t m = 0x8000000080000000ULL; 857789f4a4cSSong Gao uint64_t c = val & m; 858789f4a4cSSong Gao c |= c << 31; 859789f4a4cSSong Gao return c >> 62; 860789f4a4cSSong Gao } 861789f4a4cSSong Gao 862ff27e335SSong Gao void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc) 863789f4a4cSSong Gao { 86497074674SSong Gao int i; 865789f4a4cSSong Gao uint16_t temp = 0; 866ff27e335SSong Gao VReg *Vd = (VReg *)vd; 867ff27e335SSong Gao VReg *Vj = (VReg *)vj; 86897074674SSong Gao int oprsz = simd_oprsz(desc); 869789f4a4cSSong Gao 87097074674SSong Gao for (i = 0; i < oprsz / 16; i++) { 87197074674SSong Gao temp = 0; 87297074674SSong Gao temp = do_vmskltz_w(Vj->D(2 * i)); 87397074674SSong Gao temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2); 87497074674SSong Gao Vd->D(2 * i) = temp; 87597074674SSong Gao Vd->D(2 * i + 1) = 0; 87697074674SSong Gao } 877789f4a4cSSong Gao } 878789f4a4cSSong Gao 879789f4a4cSSong Gao static uint64_t do_vmskltz_d(int64_t val) 880789f4a4cSSong Gao { 881789f4a4cSSong Gao return 
(uint64_t)val >> 63; 882789f4a4cSSong Gao } 883ff27e335SSong Gao void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc) 884789f4a4cSSong Gao { 88597074674SSong Gao int i; 886789f4a4cSSong Gao uint16_t temp = 0; 887ff27e335SSong Gao VReg *Vd = (VReg *)vd; 888ff27e335SSong Gao VReg *Vj = (VReg *)vj; 88997074674SSong Gao int oprsz = simd_oprsz(desc); 890789f4a4cSSong Gao 89197074674SSong Gao for (i = 0; i < oprsz / 16; i++) { 89297074674SSong Gao temp = 0; 89397074674SSong Gao temp = do_vmskltz_d(Vj->D(2 * i)); 89497074674SSong Gao temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1); 89597074674SSong Gao Vd->D(2 * i) = temp; 89697074674SSong Gao Vd->D(2 * i + 1) = 0; 89797074674SSong Gao } 898789f4a4cSSong Gao } 899789f4a4cSSong Gao 900ff27e335SSong Gao void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc) 901789f4a4cSSong Gao { 90297074674SSong Gao int i; 903789f4a4cSSong Gao uint16_t temp = 0; 904ff27e335SSong Gao VReg *Vd = (VReg *)vd; 905ff27e335SSong Gao VReg *Vj = (VReg *)vj; 90697074674SSong Gao int oprsz = simd_oprsz(desc); 907789f4a4cSSong Gao 90897074674SSong Gao for (i = 0; i < oprsz / 16; i++) { 90997074674SSong Gao temp = 0; 91097074674SSong Gao temp = do_vmskltz_b(Vj->D(2 * i)); 91197074674SSong Gao temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); 91297074674SSong Gao Vd->D(2 * i) = (uint16_t)(~temp); 91397074674SSong Gao Vd->D(2 * i + 1) = 0; 91497074674SSong Gao } 915789f4a4cSSong Gao } 916789f4a4cSSong Gao 917789f4a4cSSong Gao static uint64_t do_vmskez_b(uint64_t a) 918789f4a4cSSong Gao { 919789f4a4cSSong Gao uint64_t m = 0x7f7f7f7f7f7f7f7fULL; 920789f4a4cSSong Gao uint64_t c = ~(((a & m) + m) | a | m); 921789f4a4cSSong Gao c |= c << 7; 922789f4a4cSSong Gao c |= c << 14; 923789f4a4cSSong Gao c |= c << 28; 924789f4a4cSSong Gao return c >> 56; 925789f4a4cSSong Gao } 926789f4a4cSSong Gao 927ff27e335SSong Gao void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc) 928789f4a4cSSong Gao { 92997074674SSong Gao int i; 930789f4a4cSSong Gao uint16_t temp = 0; 
931ff27e335SSong Gao VReg *Vd = (VReg *)vd; 932ff27e335SSong Gao VReg *Vj = (VReg *)vj; 93397074674SSong Gao int oprsz = simd_oprsz(desc); 934789f4a4cSSong Gao 93597074674SSong Gao for (i = 0; i < oprsz / 16; i++) { 93697074674SSong Gao temp = 0; 93797074674SSong Gao temp = do_vmskez_b(Vj->D(2 * i)); 93897074674SSong Gao temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8); 93997074674SSong Gao Vd->D(2 * i) = (uint16_t)(~temp); 94097074674SSong Gao Vd->D(2 * i + 1) = 0; 94197074674SSong Gao } 942789f4a4cSSong Gao } 943f205a539SSong Gao 9444472a45aSSong Gao void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) 945f205a539SSong Gao { 946f205a539SSong Gao int i; 947f205a539SSong Gao VReg *Vd = (VReg *)vd; 948f205a539SSong Gao VReg *Vj = (VReg *)vj; 949f205a539SSong Gao 9504472a45aSSong Gao for (i = 0; i < simd_oprsz(desc); i++) { 951f205a539SSong Gao Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm); 952f205a539SSong Gao } 953f205a539SSong Gao } 9549b21a7a5SSong Gao 9559b21a7a5SSong Gao #define VSLLWIL(NAME, BIT, E1, E2) \ 956329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 9579b21a7a5SSong Gao { \ 9586567eac7SSong Gao int i, j, ofs; \ 9596567eac7SSong Gao VReg temp = {}; \ 960329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 961329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 9626567eac7SSong Gao int oprsz = simd_oprsz(desc); \ 9639b21a7a5SSong Gao typedef __typeof(temp.E1(0)) TD; \ 9649b21a7a5SSong Gao \ 9656567eac7SSong Gao ofs = LSX_LEN / BIT; \ 9666567eac7SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 9676567eac7SSong Gao for (j = 0; j < ofs; j++) { \ 9686567eac7SSong Gao temp.E1(j + ofs * i) = (TD)Vj->E2(j + ofs * 2 * i) << (imm % BIT); \ 9696567eac7SSong Gao } \ 9709b21a7a5SSong Gao } \ 9719b21a7a5SSong Gao *Vd = temp; \ 9729b21a7a5SSong Gao } 9739b21a7a5SSong Gao 9746567eac7SSong Gao 975ff27e335SSong Gao void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc) 9769b21a7a5SSong Gao { 9776567eac7SSong Gao int i; 978ff27e335SSong Gao VReg 
*Vd = (VReg *)vd; 979ff27e335SSong Gao VReg *Vj = (VReg *)vj; 9806567eac7SSong Gao int oprsz = simd_oprsz(desc); 9819b21a7a5SSong Gao 9826567eac7SSong Gao for (i = 0; i < oprsz / 16; i++) { 9836567eac7SSong Gao Vd->Q(i) = int128_makes64(Vj->D(2 * i)); 9846567eac7SSong Gao } 9859b21a7a5SSong Gao } 9869b21a7a5SSong Gao 987ff27e335SSong Gao void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc) 9889b21a7a5SSong Gao { 9896567eac7SSong Gao int i; 990ff27e335SSong Gao VReg *Vd = (VReg *)vd; 991ff27e335SSong Gao VReg *Vj = (VReg *)vj; 9926567eac7SSong Gao int oprsz = simd_oprsz(desc); 9939b21a7a5SSong Gao 9946567eac7SSong Gao for (i = 0; i < oprsz / 16; i++) { 9956567eac7SSong Gao Vd->Q(i) = int128_make64(Vj->UD(2 * i)); 9966567eac7SSong Gao } 9979b21a7a5SSong Gao } 9989b21a7a5SSong Gao 9999b21a7a5SSong Gao VSLLWIL(vsllwil_h_b, 16, H, B) 10009b21a7a5SSong Gao VSLLWIL(vsllwil_w_h, 32, W, H) 10019b21a7a5SSong Gao VSLLWIL(vsllwil_d_w, 64, D, W) 10029b21a7a5SSong Gao VSLLWIL(vsllwil_hu_bu, 16, UH, UB) 10039b21a7a5SSong Gao VSLLWIL(vsllwil_wu_hu, 32, UW, UH) 10049b21a7a5SSong Gao VSLLWIL(vsllwil_du_wu, 64, UD, UW) 1005ecb93716SSong Gao 1006ecb93716SSong Gao #define do_vsrlr(E, T) \ 1007ecb93716SSong Gao static T do_vsrlr_ ##E(T s1, int sh) \ 1008ecb93716SSong Gao { \ 1009ecb93716SSong Gao if (sh == 0) { \ 1010ecb93716SSong Gao return s1; \ 1011ecb93716SSong Gao } else { \ 1012ecb93716SSong Gao return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \ 1013ecb93716SSong Gao } \ 1014ecb93716SSong Gao } 1015ecb93716SSong Gao 1016ecb93716SSong Gao do_vsrlr(B, uint8_t) 1017ecb93716SSong Gao do_vsrlr(H, uint16_t) 1018ecb93716SSong Gao do_vsrlr(W, uint32_t) 1019ecb93716SSong Gao do_vsrlr(D, uint64_t) 1020ecb93716SSong Gao 1021ecb93716SSong Gao #define VSRLR(NAME, BIT, T, E) \ 102204711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1023ecb93716SSong Gao { \ 1024ecb93716SSong Gao int i; \ 102504711da1SSong Gao VReg *Vd = (VReg *)vd; \ 102604711da1SSong Gao VReg 
*Vj = (VReg *)vj; \ 102704711da1SSong Gao VReg *Vk = (VReg *)vk; \ 10288c272fe8SSong Gao int oprsz = simd_oprsz(desc); \ 1029ecb93716SSong Gao \ 10308c272fe8SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1031ecb93716SSong Gao Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ 1032ecb93716SSong Gao } \ 1033ecb93716SSong Gao } 1034ecb93716SSong Gao 1035ecb93716SSong Gao VSRLR(vsrlr_b, 8, uint8_t, B) 1036ecb93716SSong Gao VSRLR(vsrlr_h, 16, uint16_t, H) 1037ecb93716SSong Gao VSRLR(vsrlr_w, 32, uint32_t, W) 1038ecb93716SSong Gao VSRLR(vsrlr_d, 64, uint64_t, D) 1039ecb93716SSong Gao 1040ecb93716SSong Gao #define VSRLRI(NAME, BIT, E) \ 1041329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1042ecb93716SSong Gao { \ 1043ecb93716SSong Gao int i; \ 1044329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1045329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 10468c272fe8SSong Gao int oprsz = simd_oprsz(desc); \ 1047ecb93716SSong Gao \ 10488c272fe8SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1049ecb93716SSong Gao Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \ 1050ecb93716SSong Gao } \ 1051ecb93716SSong Gao } 1052ecb93716SSong Gao 1053ecb93716SSong Gao VSRLRI(vsrlri_b, 8, B) 1054ecb93716SSong Gao VSRLRI(vsrlri_h, 16, H) 1055ecb93716SSong Gao VSRLRI(vsrlri_w, 32, W) 1056ecb93716SSong Gao VSRLRI(vsrlri_d, 64, D) 1057ecb93716SSong Gao 1058ecb93716SSong Gao #define do_vsrar(E, T) \ 1059ecb93716SSong Gao static T do_vsrar_ ##E(T s1, int sh) \ 1060ecb93716SSong Gao { \ 1061ecb93716SSong Gao if (sh == 0) { \ 1062ecb93716SSong Gao return s1; \ 1063ecb93716SSong Gao } else { \ 1064ecb93716SSong Gao return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \ 1065ecb93716SSong Gao } \ 1066ecb93716SSong Gao } 1067ecb93716SSong Gao 1068ecb93716SSong Gao do_vsrar(B, int8_t) 1069ecb93716SSong Gao do_vsrar(H, int16_t) 1070ecb93716SSong Gao do_vsrar(W, int32_t) 1071ecb93716SSong Gao do_vsrar(D, int64_t) 1072ecb93716SSong Gao 1073ecb93716SSong Gao #define VSRAR(NAME, 
BIT, T, E) \ 107404711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1075ecb93716SSong Gao { \ 1076ecb93716SSong Gao int i; \ 107704711da1SSong Gao VReg *Vd = (VReg *)vd; \ 107804711da1SSong Gao VReg *Vj = (VReg *)vj; \ 107904711da1SSong Gao VReg *Vk = (VReg *)vk; \ 10808c272fe8SSong Gao int oprsz = simd_oprsz(desc); \ 1081ecb93716SSong Gao \ 10828c272fe8SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1083ecb93716SSong Gao Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ 1084ecb93716SSong Gao } \ 1085ecb93716SSong Gao } 1086ecb93716SSong Gao 1087ecb93716SSong Gao VSRAR(vsrar_b, 8, uint8_t, B) 1088ecb93716SSong Gao VSRAR(vsrar_h, 16, uint16_t, H) 1089ecb93716SSong Gao VSRAR(vsrar_w, 32, uint32_t, W) 1090ecb93716SSong Gao VSRAR(vsrar_d, 64, uint64_t, D) 1091ecb93716SSong Gao 1092ecb93716SSong Gao #define VSRARI(NAME, BIT, E) \ 1093329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1094ecb93716SSong Gao { \ 1095ecb93716SSong Gao int i; \ 1096329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1097329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 10988c272fe8SSong Gao int oprsz = simd_oprsz(desc); \ 1099ecb93716SSong Gao \ 11008c272fe8SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1101ecb93716SSong Gao Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \ 1102ecb93716SSong Gao } \ 1103ecb93716SSong Gao } 1104ecb93716SSong Gao 1105ecb93716SSong Gao VSRARI(vsrari_b, 8, B) 1106ecb93716SSong Gao VSRARI(vsrari_h, 16, H) 1107ecb93716SSong Gao VSRARI(vsrari_w, 32, W) 1108ecb93716SSong Gao VSRARI(vsrari_d, 64, D) 1109d79fb8ddSSong Gao 1110d79fb8ddSSong Gao #define R_SHIFT(a, b) (a >> b) 1111d79fb8ddSSong Gao 111240c7674eSSong Gao #define VSRLN(NAME, BIT, E1, E2) \ 111304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1114d79fb8ddSSong Gao { \ 111540c7674eSSong Gao int i, j, ofs; \ 111604711da1SSong Gao VReg *Vd = (VReg *)vd; \ 111704711da1SSong Gao VReg *Vj = (VReg *)vj; \ 
111804711da1SSong Gao VReg *Vk = (VReg *)vk; \ 111940c7674eSSong Gao int oprsz = simd_oprsz(desc); \ 1120d79fb8ddSSong Gao \ 112140c7674eSSong Gao ofs = LSX_LEN / BIT; \ 112240c7674eSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 112340c7674eSSong Gao for (j = 0; j < ofs; j++) { \ 112440c7674eSSong Gao Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ 112540c7674eSSong Gao Vk->E2(j + ofs * i) % BIT); \ 1126d79fb8ddSSong Gao } \ 112740c7674eSSong Gao Vd->D(2 * i + 1) = 0; \ 112840c7674eSSong Gao } \ 1129d79fb8ddSSong Gao } 1130d79fb8ddSSong Gao 113140c7674eSSong Gao VSRLN(vsrln_b_h, 16, B, UH) 113240c7674eSSong Gao VSRLN(vsrln_h_w, 32, H, UW) 113340c7674eSSong Gao VSRLN(vsrln_w_d, 64, W, UD) 1134d79fb8ddSSong Gao 113540c7674eSSong Gao #define VSRAN(NAME, BIT, E1, E2, E3) \ 113604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1137d79fb8ddSSong Gao { \ 113840c7674eSSong Gao int i, j, ofs; \ 113904711da1SSong Gao VReg *Vd = (VReg *)vd; \ 114004711da1SSong Gao VReg *Vj = (VReg *)vj; \ 114104711da1SSong Gao VReg *Vk = (VReg *)vk; \ 114240c7674eSSong Gao int oprsz = simd_oprsz(desc); \ 1143d79fb8ddSSong Gao \ 114440c7674eSSong Gao ofs = LSX_LEN / BIT; \ 114540c7674eSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 114640c7674eSSong Gao for (j = 0; j < ofs; j++) { \ 114740c7674eSSong Gao Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ 114840c7674eSSong Gao Vk->E3(j + ofs * i) % BIT); \ 1149d79fb8ddSSong Gao } \ 115040c7674eSSong Gao Vd->D(2 * i + 1) = 0; \ 115140c7674eSSong Gao } \ 1152d79fb8ddSSong Gao } 1153d79fb8ddSSong Gao 115440c7674eSSong Gao VSRAN(vsran_b_h, 16, B, H, UH) 115540c7674eSSong Gao VSRAN(vsran_h_w, 32, H, W, UW) 115640c7674eSSong Gao VSRAN(vsran_w_d, 64, W, D, UD) 1157d79fb8ddSSong Gao 115840c7674eSSong Gao #define VSRLNI(NAME, BIT, E1, E2) \ 1159329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1160d79fb8ddSSong Gao { \ 116140c7674eSSong Gao int i, j, ofs; \ 
116240c7674eSSong Gao VReg temp = {}; \ 1163329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1164329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 116540c7674eSSong Gao int oprsz = simd_oprsz(desc); \ 1166d79fb8ddSSong Gao \ 116740c7674eSSong Gao ofs = LSX_LEN / BIT; \ 116840c7674eSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 116940c7674eSSong Gao for (j = 0; j < ofs; j++) { \ 117040c7674eSSong Gao temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \ 117140c7674eSSong Gao temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \ 117240c7674eSSong Gao imm); \ 117340c7674eSSong Gao } \ 1174d79fb8ddSSong Gao } \ 1175d79fb8ddSSong Gao *Vd = temp; \ 1176d79fb8ddSSong Gao } 1177d79fb8ddSSong Gao 1178329517d5SSong Gao void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1179d79fb8ddSSong Gao { 118040c7674eSSong Gao int i; 118140c7674eSSong Gao VReg temp = {}; 1182329517d5SSong Gao VReg *Vd = (VReg *)vd; 1183329517d5SSong Gao VReg *Vj = (VReg *)vj; 1184d79fb8ddSSong Gao 118540c7674eSSong Gao for (i = 0; i < 2; i++) { 118640c7674eSSong Gao temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128)); 118740c7674eSSong Gao temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128)); 118840c7674eSSong Gao } 1189d79fb8ddSSong Gao *Vd = temp; 1190d79fb8ddSSong Gao } 1191d79fb8ddSSong Gao 119240c7674eSSong Gao VSRLNI(vsrlni_b_h, 16, B, UH) 119340c7674eSSong Gao VSRLNI(vsrlni_h_w, 32, H, UW) 119440c7674eSSong Gao VSRLNI(vsrlni_w_d, 64, W, UD) 1195d79fb8ddSSong Gao 1196d79fb8ddSSong Gao #define VSRANI(NAME, BIT, E1, E2) \ 1197329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1198d79fb8ddSSong Gao { \ 119940c7674eSSong Gao int i, j, ofs; \ 120040c7674eSSong Gao VReg temp = {}; \ 1201329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1202329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 120340c7674eSSong Gao int oprsz = simd_oprsz(desc); \ 1204d79fb8ddSSong Gao \ 120540c7674eSSong Gao ofs = LSX_LEN / BIT; \ 
120640c7674eSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 120740c7674eSSong Gao for (j = 0; j < ofs; j++) { \ 120840c7674eSSong Gao temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \ 120940c7674eSSong Gao temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \ 121040c7674eSSong Gao imm); \ 121140c7674eSSong Gao } \ 1212d79fb8ddSSong Gao } \ 1213d79fb8ddSSong Gao *Vd = temp; \ 1214d79fb8ddSSong Gao } 1215d79fb8ddSSong Gao 1216329517d5SSong Gao void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1217d79fb8ddSSong Gao { 121840c7674eSSong Gao int i; 121940c7674eSSong Gao VReg temp = {}; 1220329517d5SSong Gao VReg *Vd = (VReg *)vd; 1221329517d5SSong Gao VReg *Vj = (VReg *)vj; 1222d79fb8ddSSong Gao 122340c7674eSSong Gao for (i = 0; i < 2; i++) { 122440c7674eSSong Gao temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128)); 122540c7674eSSong Gao temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128)); 122640c7674eSSong Gao } 1227d79fb8ddSSong Gao *Vd = temp; 1228d79fb8ddSSong Gao } 1229d79fb8ddSSong Gao 1230d79fb8ddSSong Gao VSRANI(vsrani_b_h, 16, B, H) 1231d79fb8ddSSong Gao VSRANI(vsrani_h_w, 32, H, W) 1232d79fb8ddSSong Gao VSRANI(vsrani_w_d, 64, W, D) 1233a5200a17SSong Gao 1234c50ce38aSSong Gao #define VSRLRN(NAME, BIT, E1, E2, E3) \ 123504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1236a5200a17SSong Gao { \ 1237c50ce38aSSong Gao int i, j, ofs; \ 123804711da1SSong Gao VReg *Vd = (VReg *)vd; \ 123904711da1SSong Gao VReg *Vj = (VReg *)vj; \ 124004711da1SSong Gao VReg *Vk = (VReg *)vk; \ 1241c50ce38aSSong Gao int oprsz = simd_oprsz(desc); \ 1242a5200a17SSong Gao \ 1243c50ce38aSSong Gao ofs = LSX_LEN / BIT; \ 1244c50ce38aSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 1245c50ce38aSSong Gao for (j = 0; j < ofs; j++) { \ 1246c50ce38aSSong Gao Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i), \ 1247c50ce38aSSong Gao Vk->E3(j + ofs * i) % BIT); \ 1248a5200a17SSong 
Gao } \ 1249c50ce38aSSong Gao Vd->D(2 * i + 1) = 0; \ 1250c50ce38aSSong Gao } \ 1251a5200a17SSong Gao } 1252a5200a17SSong Gao 1253c50ce38aSSong Gao VSRLRN(vsrlrn_b_h, 16, B, H, UH) 1254c50ce38aSSong Gao VSRLRN(vsrlrn_h_w, 32, H, W, UW) 1255c50ce38aSSong Gao VSRLRN(vsrlrn_w_d, 64, W, D, UD) 1256a5200a17SSong Gao 1257c50ce38aSSong Gao #define VSRARN(NAME, BIT, E1, E2, E3) \ 125804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1259a5200a17SSong Gao { \ 1260c50ce38aSSong Gao int i, j, ofs; \ 126104711da1SSong Gao VReg *Vd = (VReg *)vd; \ 126204711da1SSong Gao VReg *Vj = (VReg *)vj; \ 126304711da1SSong Gao VReg *Vk = (VReg *)vk; \ 1264c50ce38aSSong Gao int oprsz = simd_oprsz(desc); \ 1265a5200a17SSong Gao \ 1266c50ce38aSSong Gao ofs = LSX_LEN / BIT; \ 1267c50ce38aSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 1268c50ce38aSSong Gao for (j = 0; j < ofs; j++) { \ 1269c50ce38aSSong Gao Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), \ 1270c50ce38aSSong Gao Vk->E3(j + ofs * i) % BIT); \ 1271a5200a17SSong Gao } \ 1272c50ce38aSSong Gao Vd->D(2 * i + 1) = 0; \ 1273c50ce38aSSong Gao } \ 1274a5200a17SSong Gao } 1275a5200a17SSong Gao 1276c50ce38aSSong Gao VSRARN(vsrarn_b_h, 16, B, H, UH) 1277c50ce38aSSong Gao VSRARN(vsrarn_h_w, 32, H, W, UW) 1278c50ce38aSSong Gao VSRARN(vsrarn_w_d, 64, W, D, UD) 1279a5200a17SSong Gao 1280a5200a17SSong Gao #define VSRLRNI(NAME, BIT, E1, E2) \ 1281329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1282a5200a17SSong Gao { \ 1283c50ce38aSSong Gao int i, j, ofs; \ 1284c50ce38aSSong Gao VReg temp = {}; \ 1285329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1286329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 1287c50ce38aSSong Gao int oprsz = simd_oprsz(desc); \ 1288a5200a17SSong Gao \ 1289c50ce38aSSong Gao ofs = LSX_LEN / BIT; \ 1290c50ce38aSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 1291c50ce38aSSong Gao for (j = 0; j < ofs; j++) { \ 1292c50ce38aSSong Gao temp.E1(j + ofs * 2 * 
i) = do_vsrlr_ ## E2(Vj->E2(j + ofs * i), imm); \ 1293c50ce38aSSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_vsrlr_ ## E2(Vd->E2(j + ofs * i), \ 1294c50ce38aSSong Gao imm); \ 1295c50ce38aSSong Gao } \ 1296a5200a17SSong Gao } \ 1297a5200a17SSong Gao *Vd = temp; \ 1298a5200a17SSong Gao } 1299a5200a17SSong Gao 1300329517d5SSong Gao void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1301a5200a17SSong Gao { 1302c50ce38aSSong Gao int i; 1303c50ce38aSSong Gao VReg temp = {}; 1304329517d5SSong Gao VReg *Vd = (VReg *)vd; 1305329517d5SSong Gao VReg *Vj = (VReg *)vj; 1306c50ce38aSSong Gao Int128 r[4]; 1307c50ce38aSSong Gao int oprsz = simd_oprsz(desc); 1308a5200a17SSong Gao 1309c50ce38aSSong Gao for (i = 0; i < oprsz / 16; i++) { 1310a5200a17SSong Gao if (imm == 0) { 1311c50ce38aSSong Gao temp.D(2 * i) = int128_getlo(Vj->Q(i)); 1312c50ce38aSSong Gao temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); 1313a5200a17SSong Gao } else { 1314c50ce38aSSong Gao r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)), 1315c50ce38aSSong Gao int128_one()); 1316c50ce38aSSong Gao r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)), 1317c50ce38aSSong Gao int128_one()); 1318c50ce38aSSong Gao temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i), 1319c50ce38aSSong Gao imm), r[2 * i])); 1320c50ce38aSSong Gao temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i), 1321c50ce38aSSong Gao imm), r[ 2 * i + 1])); 1322c50ce38aSSong Gao } 1323a5200a17SSong Gao } 1324a5200a17SSong Gao *Vd = temp; 1325a5200a17SSong Gao } 1326a5200a17SSong Gao 1327a5200a17SSong Gao VSRLRNI(vsrlrni_b_h, 16, B, H) 1328a5200a17SSong Gao VSRLRNI(vsrlrni_h_w, 32, H, W) 1329a5200a17SSong Gao VSRLRNI(vsrlrni_w_d, 64, W, D) 1330a5200a17SSong Gao 1331a5200a17SSong Gao #define VSRARNI(NAME, BIT, E1, E2) \ 1332329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1333a5200a17SSong Gao { \ 1334c50ce38aSSong Gao int i, j, ofs; \ 1335c50ce38aSSong Gao 
VReg temp = {}; \ 1336329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1337329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 1338c50ce38aSSong Gao int oprsz = simd_oprsz(desc); \ 1339a5200a17SSong Gao \ 1340c50ce38aSSong Gao ofs = LSX_LEN / BIT; \ 1341c50ce38aSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 1342c50ce38aSSong Gao for (j = 0; j < ofs; j++) { \ 1343c50ce38aSSong Gao temp.E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), imm); \ 1344c50ce38aSSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_vsrar_ ## E2(Vd->E2(j + ofs * i), \ 1345c50ce38aSSong Gao imm); \ 1346c50ce38aSSong Gao } \ 1347a5200a17SSong Gao } \ 1348a5200a17SSong Gao *Vd = temp; \ 1349a5200a17SSong Gao } 1350a5200a17SSong Gao 1351329517d5SSong Gao void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1352a5200a17SSong Gao { 1353c50ce38aSSong Gao int i; 1354c50ce38aSSong Gao VReg temp = {}; 1355329517d5SSong Gao VReg *Vd = (VReg *)vd; 1356329517d5SSong Gao VReg *Vj = (VReg *)vj; 1357c50ce38aSSong Gao Int128 r[4]; 1358c50ce38aSSong Gao int oprsz = simd_oprsz(desc); 1359a5200a17SSong Gao 1360c50ce38aSSong Gao for (i = 0; i < oprsz / 16; i++) { 1361a5200a17SSong Gao if (imm == 0) { 1362c50ce38aSSong Gao temp.D(2 * i) = int128_getlo(Vj->Q(i)); 1363c50ce38aSSong Gao temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); 1364a5200a17SSong Gao } else { 1365c50ce38aSSong Gao r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)), 1366c50ce38aSSong Gao int128_one()); 1367c50ce38aSSong Gao r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)), 1368c50ce38aSSong Gao int128_one()); 1369c50ce38aSSong Gao temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i), 1370c50ce38aSSong Gao imm), r[2 * i])); 1371c50ce38aSSong Gao temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i), 1372c50ce38aSSong Gao imm), r[2 * i + 1])); 1373c50ce38aSSong Gao } 1374a5200a17SSong Gao } 1375a5200a17SSong Gao *Vd = temp; 1376a5200a17SSong Gao } 1377a5200a17SSong Gao 1378a5200a17SSong Gao 
VSRARNI(vsrarni_b_h, 16, B, H)
VSRARNI(vsrarni_h_w, 32, H, W)
VSRARNI(vsrarni_w_d, 64, W, D)

/*
 * SSRLNS(NAME, T1, T2, T3): define do_ssrlns_NAME().
 *
 * The T2 input is reinterpreted as the unsigned type T1, shifted right by
 * sa, and the result is capped at (1 << sh) - 1 (held in a T3).
 */
#define SSRLNS(NAME, T1, T2, T3)                                     \
static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh)                  \
{                                                                    \
    const T3 limit = (1ull << sh) -1;                                \
    const T1 shifted = (sa == 0) ? (T1)e2 : (T1)(((T1)e2) >> sa);    \
                                                                     \
    return shifted > limit ? limit : shifted;                        \
}

SSRLNS(B, uint16_t, int16_t, uint8_t)
SSRLNS(H, uint32_t, int32_t, uint16_t)
SSRLNS(W, uint64_t, int64_t, uint32_t)

/*
 * VSSRLN(NAME, BIT, E1, E2, E3): define a narrowing, saturating shift
 * helper.
 *
 * For each 16-byte lane, every BIT-wide E2 element of Vj is passed to
 * do_ssrlns_E1() with the matching E3 element of Vk modulo BIT as the shift
 * count and BIT / 2 - 1 as the saturation width; results fill the lower
 * half of the Vd lane and the upper 64 bits of the lane are cleared.
 */
#define VSSRLN(NAME, BIT, E1, E2, E3)                                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *Vd = (VReg *)vd;                                             \
    VReg *Vj = (VReg *)vj;                                             \
    VReg *Vk = (VReg *)vk;                                             \
    const int oprsz = simd_oprsz(desc);                                \
    const int lanes = oprsz / 16;                                      \
    const int per_lane = LSX_LEN / BIT;                                \
    int lane, k;                                                       \
                                                                       \
    for (lane = 0; lane < lanes; lane++) {                             \
        for (k = 0; k < per_lane; k++) {                               \
            Vd->E1(k + per_lane * 2 * lane) =                          \
                do_ssrlns_ ## E1(Vj->E2(k + per_lane * lane),          \
                                 Vk->E3(k + per_lane * lane) % BIT,    \
                                 BIT / 2 - 1);                         \
        }                                                              \
        Vd->D(2 * lane + 1) = 0;                                       \
    }                                                                  \
}

VSSRLN(vssrln_b_h, 16, B, H, UH) 14256256c8caSSong Gao VSSRLN(vssrln_h_w, 32, H, W, UW) 14266256c8caSSong Gao VSSRLN(vssrln_w_d, 64, W, D, UD) 142783b3815dSSong Gao 142883b3815dSSong Gao #define SSRANS(E, T1, T2) \ 142983b3815dSSong Gao static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \ 143083b3815dSSong Gao { \ 143183b3815dSSong Gao T1 shft_res; \ 143283b3815dSSong Gao if (sa == 0) { \ 143383b3815dSSong Gao shft_res = e2; \ 143483b3815dSSong Gao } else { \ 143583b3815dSSong Gao shft_res = e2 >> sa; \ 143683b3815dSSong Gao } \ 143783b3815dSSong Gao T2 mask; \ 143883b3815dSSong Gao mask = (1ll << sh) - 1; \ 143983b3815dSSong Gao if (shft_res > mask) { \ 144083b3815dSSong Gao return mask; \ 144183b3815dSSong Gao } else if (shft_res < -(mask + 1)) { \ 144283b3815dSSong Gao return ~mask; \ 144383b3815dSSong Gao } else { \ 144483b3815dSSong Gao return shft_res; \ 144583b3815dSSong Gao } \ 144683b3815dSSong Gao } 144783b3815dSSong Gao 144883b3815dSSong Gao SSRANS(B, int16_t, int8_t) 144983b3815dSSong Gao SSRANS(H, int32_t, int16_t) 145083b3815dSSong Gao SSRANS(W, int64_t, int32_t) 145183b3815dSSong Gao 14526256c8caSSong Gao #define VSSRAN(NAME, BIT, E1, E2, E3) \ 145304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 145483b3815dSSong Gao { \ 14556256c8caSSong Gao int i, j, ofs; \ 145604711da1SSong Gao VReg *Vd = (VReg *)vd; \ 145704711da1SSong Gao VReg *Vj = (VReg *)vj; \ 145804711da1SSong Gao VReg *Vk = (VReg *)vk; \ 14596256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 146083b3815dSSong Gao \ 14616256c8caSSong Gao ofs = LSX_LEN / BIT; \ 14626256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 14636256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 14646256c8caSSong Gao Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ 14656256c8caSSong Gao Vk->E3(j + ofs * i) % BIT, \ 14666256c8caSSong Gao BIT / 2 - 1); \ 146783b3815dSSong Gao } \ 14686256c8caSSong Gao Vd->D(2 * i + 1) = 0; \ 14696256c8caSSong Gao } \ 147083b3815dSSong Gao 
} 147183b3815dSSong Gao 14726256c8caSSong Gao VSSRAN(vssran_b_h, 16, B, H, UH) 14736256c8caSSong Gao VSSRAN(vssran_h_w, 32, H, W, UW) 14746256c8caSSong Gao VSSRAN(vssran_w_d, 64, W, D, UD) 147583b3815dSSong Gao 147683b3815dSSong Gao #define SSRLNU(E, T1, T2, T3) \ 147783b3815dSSong Gao static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \ 147883b3815dSSong Gao { \ 147983b3815dSSong Gao T1 shft_res; \ 148083b3815dSSong Gao if (sa == 0) { \ 148183b3815dSSong Gao shft_res = e2; \ 148283b3815dSSong Gao } else { \ 148383b3815dSSong Gao shft_res = (((T1)e2) >> sa); \ 148483b3815dSSong Gao } \ 148583b3815dSSong Gao T2 mask; \ 148683b3815dSSong Gao mask = (1ull << sh) - 1; \ 148783b3815dSSong Gao if (shft_res > mask) { \ 148883b3815dSSong Gao return mask; \ 148983b3815dSSong Gao } else { \ 149083b3815dSSong Gao return shft_res; \ 149183b3815dSSong Gao } \ 149283b3815dSSong Gao } 149383b3815dSSong Gao 149483b3815dSSong Gao SSRLNU(B, uint16_t, uint8_t, int16_t) 149583b3815dSSong Gao SSRLNU(H, uint32_t, uint16_t, int32_t) 149683b3815dSSong Gao SSRLNU(W, uint64_t, uint32_t, int64_t) 149783b3815dSSong Gao 14986256c8caSSong Gao #define VSSRLNU(NAME, BIT, E1, E2, E3) \ 149904711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 150083b3815dSSong Gao { \ 15016256c8caSSong Gao int i, j, ofs; \ 150204711da1SSong Gao VReg *Vd = (VReg *)vd; \ 150304711da1SSong Gao VReg *Vj = (VReg *)vj; \ 150404711da1SSong Gao VReg *Vk = (VReg *)vk; \ 15056256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 150683b3815dSSong Gao \ 15076256c8caSSong Gao ofs = LSX_LEN / BIT; \ 15086256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 15096256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 15106256c8caSSong Gao Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ 15116256c8caSSong Gao Vk->E3(j + ofs * i) % BIT, \ 15126256c8caSSong Gao BIT / 2); \ 151383b3815dSSong Gao } \ 15146256c8caSSong Gao Vd->D(2 * i + 1) = 0; \ 15156256c8caSSong Gao } \ 151683b3815dSSong Gao } 
151783b3815dSSong Gao 15186256c8caSSong Gao VSSRLNU(vssrln_bu_h, 16, B, H, UH) 15196256c8caSSong Gao VSSRLNU(vssrln_hu_w, 32, H, W, UW) 15206256c8caSSong Gao VSSRLNU(vssrln_wu_d, 64, W, D, UD) 152183b3815dSSong Gao 152283b3815dSSong Gao #define SSRANU(E, T1, T2, T3) \ 152383b3815dSSong Gao static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \ 152483b3815dSSong Gao { \ 152583b3815dSSong Gao T1 shft_res; \ 152683b3815dSSong Gao if (sa == 0) { \ 152783b3815dSSong Gao shft_res = e2; \ 152883b3815dSSong Gao } else { \ 152983b3815dSSong Gao shft_res = e2 >> sa; \ 153083b3815dSSong Gao } \ 153183b3815dSSong Gao if (e2 < 0) { \ 153283b3815dSSong Gao shft_res = 0; \ 153383b3815dSSong Gao } \ 153483b3815dSSong Gao T2 mask; \ 153583b3815dSSong Gao mask = (1ull << sh) - 1; \ 153683b3815dSSong Gao if (shft_res > mask) { \ 153783b3815dSSong Gao return mask; \ 153883b3815dSSong Gao } else { \ 153983b3815dSSong Gao return shft_res; \ 154083b3815dSSong Gao } \ 154183b3815dSSong Gao } 154283b3815dSSong Gao 154383b3815dSSong Gao SSRANU(B, uint16_t, uint8_t, int16_t) 154483b3815dSSong Gao SSRANU(H, uint32_t, uint16_t, int32_t) 154583b3815dSSong Gao SSRANU(W, uint64_t, uint32_t, int64_t) 154683b3815dSSong Gao 15476256c8caSSong Gao #define VSSRANU(NAME, BIT, E1, E2, E3) \ 154804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 154983b3815dSSong Gao { \ 15506256c8caSSong Gao int i, j, ofs; \ 155104711da1SSong Gao VReg *Vd = (VReg *)vd; \ 155204711da1SSong Gao VReg *Vj = (VReg *)vj; \ 155304711da1SSong Gao VReg *Vk = (VReg *)vk; \ 15546256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 155583b3815dSSong Gao \ 15566256c8caSSong Gao ofs = LSX_LEN / BIT; \ 15576256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 15586256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 15596256c8caSSong Gao Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ 15606256c8caSSong Gao Vk->E3(j + ofs * i) % BIT, \ 15616256c8caSSong Gao BIT / 2); \ 156283b3815dSSong Gao } \ 
15636256c8caSSong Gao Vd->D(2 * i + 1) = 0; \ 15646256c8caSSong Gao } \ 156583b3815dSSong Gao } 156683b3815dSSong Gao 15676256c8caSSong Gao VSSRANU(vssran_bu_h, 16, B, H, UH) 15686256c8caSSong Gao VSSRANU(vssran_hu_w, 32, H, W, UW) 15696256c8caSSong Gao VSSRANU(vssran_wu_d, 64, W, D, UD) 157083b3815dSSong Gao 157183b3815dSSong Gao #define VSSRLNI(NAME, BIT, E1, E2) \ 1572329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 157383b3815dSSong Gao { \ 15746256c8caSSong Gao int i, j, ofs; \ 15756256c8caSSong Gao VReg temp = {}; \ 1576329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1577329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 15786256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 157983b3815dSSong Gao \ 15806256c8caSSong Gao ofs = LSX_LEN / BIT; \ 15816256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 15826256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 15836256c8caSSong Gao temp.E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \ 15846256c8caSSong Gao imm, BIT / 2 - 1); \ 15856256c8caSSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssrlns_ ## E1(Vd->E2(j + ofs * i), \ 15866256c8caSSong Gao imm, BIT / 2 - 1); \ 15876256c8caSSong Gao } \ 158883b3815dSSong Gao } \ 158983b3815dSSong Gao *Vd = temp; \ 159083b3815dSSong Gao } 159183b3815dSSong Gao 15926256c8caSSong Gao static void do_vssrlni_q(VReg *Vd, VReg *Vj, 15936256c8caSSong Gao uint64_t imm, int idx, Int128 mask) 159483b3815dSSong Gao { 15956256c8caSSong Gao Int128 shft_res1, shft_res2; 159683b3815dSSong Gao 159783b3815dSSong Gao if (imm == 0) { 15986256c8caSSong Gao shft_res1 = Vj->Q(idx); 15996256c8caSSong Gao shft_res2 = Vd->Q(idx); 160083b3815dSSong Gao } else { 16016256c8caSSong Gao shft_res1 = int128_urshift(Vj->Q(idx), imm); 16026256c8caSSong Gao shft_res2 = int128_urshift(Vd->Q(idx), imm); 160383b3815dSSong Gao } 160483b3815dSSong Gao 160583b3815dSSong Gao if (int128_ult(mask, shft_res1)) { 16066256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(mask); 160783b3815dSSong Gao 
}else { 16086256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(shft_res1); 160983b3815dSSong Gao } 161083b3815dSSong Gao 161183b3815dSSong Gao if (int128_ult(mask, shft_res2)) { 16126256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask); 161383b3815dSSong Gao }else { 16146256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 16156256c8caSSong Gao } 16166256c8caSSong Gao } 16176256c8caSSong Gao 16186256c8caSSong Gao void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 16196256c8caSSong Gao { 16206256c8caSSong Gao int i; 16216256c8caSSong Gao Int128 mask; 16226256c8caSSong Gao VReg *Vd = (VReg *)vd; 16236256c8caSSong Gao VReg *Vj = (VReg *)vj; 16246256c8caSSong Gao int oprsz = simd_oprsz(desc); 16256256c8caSSong Gao 16266256c8caSSong Gao mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 16276256c8caSSong Gao 16286256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { 16296256c8caSSong Gao do_vssrlni_q(Vd, Vj, imm, i, mask); 163083b3815dSSong Gao } 163183b3815dSSong Gao } 163283b3815dSSong Gao 163383b3815dSSong Gao VSSRLNI(vssrlni_b_h, 16, B, H) 163483b3815dSSong Gao VSSRLNI(vssrlni_h_w, 32, H, W) 163583b3815dSSong Gao VSSRLNI(vssrlni_w_d, 64, W, D) 163683b3815dSSong Gao 163783b3815dSSong Gao #define VSSRANI(NAME, BIT, E1, E2) \ 1638329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 163983b3815dSSong Gao { \ 16406256c8caSSong Gao int i, j, ofs; \ 16416256c8caSSong Gao VReg temp = {}; \ 1642329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1643329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 16446256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 164583b3815dSSong Gao \ 16466256c8caSSong Gao ofs = LSX_LEN / BIT; \ 16476256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 16486256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 16496256c8caSSong Gao temp.E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ 16506256c8caSSong Gao imm, BIT / 2 - 1); \ 16516256c8caSSong Gao temp.E1(j + ofs * (2 * i + 1)) = 
do_ssrans_ ## E1(Vd->E2(j + ofs * i), \ 16526256c8caSSong Gao imm, BIT / 2 - 1); \ 16536256c8caSSong Gao } \ 165483b3815dSSong Gao } \ 165583b3815dSSong Gao *Vd = temp; \ 165683b3815dSSong Gao } 165783b3815dSSong Gao 16586256c8caSSong Gao static void do_vssrani_d_q(VReg *Vd, VReg *Vj, 16596256c8caSSong Gao uint64_t imm, int idx, Int128 mask, Int128 min) 166083b3815dSSong Gao { 16616256c8caSSong Gao Int128 shft_res1, shft_res2; 166283b3815dSSong Gao 166383b3815dSSong Gao if (imm == 0) { 16646256c8caSSong Gao shft_res1 = Vj->Q(idx); 16656256c8caSSong Gao shft_res2 = Vd->Q(idx); 166683b3815dSSong Gao } else { 16676256c8caSSong Gao shft_res1 = int128_rshift(Vj->Q(idx), imm); 16686256c8caSSong Gao shft_res2 = int128_rshift(Vd->Q(idx), imm); 166983b3815dSSong Gao } 167083b3815dSSong Gao 167183b3815dSSong Gao if (int128_gt(shft_res1, mask)) { 16726256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(mask); 167383b3815dSSong Gao } else if (int128_lt(shft_res1, int128_neg(min))) { 16746256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(min); 167583b3815dSSong Gao } else { 16766256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(shft_res1); 167783b3815dSSong Gao } 167883b3815dSSong Gao 167983b3815dSSong Gao if (int128_gt(shft_res2, mask)) { 16806256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask); 168183b3815dSSong Gao } else if (int128_lt(shft_res2, int128_neg(min))) { 16826256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(min); 168383b3815dSSong Gao } else { 16846256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 168583b3815dSSong Gao } 168683b3815dSSong Gao } 168783b3815dSSong Gao 16886256c8caSSong Gao void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 16896256c8caSSong Gao { 16906256c8caSSong Gao int i; 16916256c8caSSong Gao Int128 mask, min; 16926256c8caSSong Gao VReg *Vd = (VReg *)vd; 16936256c8caSSong Gao VReg *Vj = (VReg *)vj; 16946256c8caSSong Gao int oprsz = simd_oprsz(desc); 16956256c8caSSong Gao 16966256c8caSSong Gao mask = 
int128_sub(int128_lshift(int128_one(), 63), int128_one()); 16976256c8caSSong Gao min = int128_lshift(int128_one(), 63); 16986256c8caSSong Gao 16996256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { 17006256c8caSSong Gao do_vssrani_d_q(Vd, Vj, imm, i, mask, min); 17016256c8caSSong Gao } 17026256c8caSSong Gao } 17036256c8caSSong Gao 17046256c8caSSong Gao 170583b3815dSSong Gao VSSRANI(vssrani_b_h, 16, B, H) 170683b3815dSSong Gao VSSRANI(vssrani_h_w, 32, H, W) 170783b3815dSSong Gao VSSRANI(vssrani_w_d, 64, W, D) 170883b3815dSSong Gao 170983b3815dSSong Gao #define VSSRLNUI(NAME, BIT, E1, E2) \ 1710329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 171183b3815dSSong Gao { \ 17126256c8caSSong Gao int i, j, ofs; \ 17136256c8caSSong Gao VReg temp = {}; \ 1714329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1715329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 17166256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 171783b3815dSSong Gao \ 17186256c8caSSong Gao ofs = LSX_LEN / BIT; \ 17196256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 17206256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 17216256c8caSSong Gao temp.E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ 17226256c8caSSong Gao imm, BIT / 2); \ 17236256c8caSSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssrlnu_ ## E1(Vd->E2(j + ofs * i), \ 17246256c8caSSong Gao imm, BIT / 2); \ 17256256c8caSSong Gao } \ 172683b3815dSSong Gao } \ 172783b3815dSSong Gao *Vd = temp; \ 172883b3815dSSong Gao } 172983b3815dSSong Gao 1730329517d5SSong Gao void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 173183b3815dSSong Gao { 17326256c8caSSong Gao int i; 17336256c8caSSong Gao Int128 mask; 1734329517d5SSong Gao VReg *Vd = (VReg *)vd; 1735329517d5SSong Gao VReg *Vj = (VReg *)vj; 17366256c8caSSong Gao int oprsz = simd_oprsz(desc); 173783b3815dSSong Gao 173883b3815dSSong Gao mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 173983b3815dSSong Gao 17406256c8caSSong Gao 
for (i = 0; i < oprsz / 16; i++) { 17416256c8caSSong Gao do_vssrlni_q(Vd, Vj, imm, i, mask); 174283b3815dSSong Gao } 174383b3815dSSong Gao } 174483b3815dSSong Gao 174583b3815dSSong Gao VSSRLNUI(vssrlni_bu_h, 16, B, H) 174683b3815dSSong Gao VSSRLNUI(vssrlni_hu_w, 32, H, W) 174783b3815dSSong Gao VSSRLNUI(vssrlni_wu_d, 64, W, D) 174883b3815dSSong Gao 174983b3815dSSong Gao #define VSSRANUI(NAME, BIT, E1, E2) \ 1750329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 175183b3815dSSong Gao { \ 17526256c8caSSong Gao int i, j, ofs; \ 17536256c8caSSong Gao VReg temp = {}; \ 1754329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 1755329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 17566256c8caSSong Gao int oprsz = simd_oprsz(desc); \ 175783b3815dSSong Gao \ 17586256c8caSSong Gao ofs = LSX_LEN / BIT; \ 17596256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 17606256c8caSSong Gao for (j = 0; j < ofs; j++) { \ 17616256c8caSSong Gao temp.E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ 17626256c8caSSong Gao imm, BIT / 2); \ 17636256c8caSSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssranu_ ## E1(Vd->E2(j + ofs * i), \ 17646256c8caSSong Gao imm, BIT / 2); \ 17656256c8caSSong Gao } \ 176683b3815dSSong Gao } \ 176783b3815dSSong Gao *Vd = temp; \ 176883b3815dSSong Gao } 176983b3815dSSong Gao 17706256c8caSSong Gao static void do_vssrani_du_q(VReg *Vd, VReg *Vj, 17716256c8caSSong Gao uint64_t imm, int idx, Int128 mask) 177283b3815dSSong Gao { 17736256c8caSSong Gao Int128 shft_res1, shft_res2; 177483b3815dSSong Gao 177583b3815dSSong Gao if (imm == 0) { 17766256c8caSSong Gao shft_res1 = Vj->Q(idx); 17776256c8caSSong Gao shft_res2 = Vd->Q(idx); 177883b3815dSSong Gao } else { 17796256c8caSSong Gao shft_res1 = int128_rshift(Vj->Q(idx), imm); 17806256c8caSSong Gao shft_res2 = int128_rshift(Vd->Q(idx), imm); 178183b3815dSSong Gao } 178283b3815dSSong Gao 17836256c8caSSong Gao if (int128_lt(Vj->Q(idx), int128_zero())) { 178483b3815dSSong Gao shft_res1 = 
int128_zero(); 178583b3815dSSong Gao } 178683b3815dSSong Gao 17876256c8caSSong Gao if (int128_lt(Vd->Q(idx), int128_zero())) { 178883b3815dSSong Gao shft_res2 = int128_zero(); 178983b3815dSSong Gao } 179083b3815dSSong Gao if (int128_ult(mask, shft_res1)) { 17916256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(mask); 179283b3815dSSong Gao }else { 17936256c8caSSong Gao Vd->D(idx * 2) = int128_getlo(shft_res1); 179483b3815dSSong Gao } 179583b3815dSSong Gao 179683b3815dSSong Gao if (int128_ult(mask, shft_res2)) { 17976256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask); 179883b3815dSSong Gao }else { 17996256c8caSSong Gao Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 18006256c8caSSong Gao } 18016256c8caSSong Gao 18026256c8caSSong Gao } 18036256c8caSSong Gao 18046256c8caSSong Gao void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 18056256c8caSSong Gao { 18066256c8caSSong Gao int i; 18076256c8caSSong Gao Int128 mask; 18086256c8caSSong Gao VReg *Vd = (VReg *)vd; 18096256c8caSSong Gao VReg *Vj = (VReg *)vj; 18106256c8caSSong Gao int oprsz = simd_oprsz(desc); 18116256c8caSSong Gao 18126256c8caSSong Gao mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 18136256c8caSSong Gao 18146256c8caSSong Gao for (i = 0; i < oprsz / 16; i++) { 18156256c8caSSong Gao do_vssrani_du_q(Vd, Vj, imm, i, mask); 181683b3815dSSong Gao } 181783b3815dSSong Gao } 181883b3815dSSong Gao 181983b3815dSSong Gao VSSRANUI(vssrani_bu_h, 16, B, H) 182083b3815dSSong Gao VSSRANUI(vssrani_hu_w, 32, H, W) 182183b3815dSSong Gao VSSRANUI(vssrani_wu_d, 64, W, D) 1822162cd32cSSong Gao 1823162cd32cSSong Gao #define SSRLRNS(E1, E2, T1, T2, T3) \ 1824162cd32cSSong Gao static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \ 1825162cd32cSSong Gao { \ 1826162cd32cSSong Gao T1 shft_res; \ 1827162cd32cSSong Gao \ 1828162cd32cSSong Gao shft_res = do_vsrlr_ ## E2(e2, sa); \ 1829162cd32cSSong Gao T1 mask; \ 1830162cd32cSSong Gao mask = (1ull << sh) - 1; \ 1831162cd32cSSong Gao if (shft_res 
> mask) { \ 1832162cd32cSSong Gao return mask; \ 1833162cd32cSSong Gao } else { \ 1834162cd32cSSong Gao return shft_res; \ 1835162cd32cSSong Gao } \ 1836162cd32cSSong Gao } 1837162cd32cSSong Gao 1838162cd32cSSong Gao SSRLRNS(B, H, uint16_t, int16_t, uint8_t) 1839162cd32cSSong Gao SSRLRNS(H, W, uint32_t, int32_t, uint16_t) 1840162cd32cSSong Gao SSRLRNS(W, D, uint64_t, int64_t, uint32_t) 1841162cd32cSSong Gao 184277fca794SSong Gao #define VSSRLRN(NAME, BIT, E1, E2, E3) \ 184304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1844162cd32cSSong Gao { \ 184577fca794SSong Gao int i, j, ofs; \ 184604711da1SSong Gao VReg *Vd = (VReg *)vd; \ 184704711da1SSong Gao VReg *Vj = (VReg *)vj; \ 184804711da1SSong Gao VReg *Vk = (VReg *)vk; \ 184977fca794SSong Gao int oprsz = simd_oprsz(desc); \ 1850162cd32cSSong Gao \ 185177fca794SSong Gao ofs = LSX_LEN / BIT; \ 185277fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 185377fca794SSong Gao for (j = 0; j < ofs; j++) { \ 185477fca794SSong Gao Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ 185577fca794SSong Gao Vk->E3(j + ofs * i) % BIT, \ 185677fca794SSong Gao BIT / 2 - 1); \ 1857162cd32cSSong Gao } \ 185877fca794SSong Gao Vd->D(2 * i + 1) = 0; \ 185977fca794SSong Gao } \ 1860162cd32cSSong Gao } 1861162cd32cSSong Gao 186277fca794SSong Gao VSSRLRN(vssrlrn_b_h, 16, B, H, UH) 186377fca794SSong Gao VSSRLRN(vssrlrn_h_w, 32, H, W, UW) 186477fca794SSong Gao VSSRLRN(vssrlrn_w_d, 64, W, D, UD) 1865162cd32cSSong Gao 1866162cd32cSSong Gao #define SSRARNS(E1, E2, T1, T2) \ 1867162cd32cSSong Gao static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \ 1868162cd32cSSong Gao { \ 1869162cd32cSSong Gao T1 shft_res; \ 1870162cd32cSSong Gao \ 1871162cd32cSSong Gao shft_res = do_vsrar_ ## E2(e2, sa); \ 1872162cd32cSSong Gao T2 mask; \ 1873162cd32cSSong Gao mask = (1ll << sh) - 1; \ 1874162cd32cSSong Gao if (shft_res > mask) { \ 1875162cd32cSSong Gao return mask; \ 1876162cd32cSSong Gao } else if 
(shft_res < -(mask +1)) { \ 1877162cd32cSSong Gao return ~mask; \ 1878162cd32cSSong Gao } else { \ 1879162cd32cSSong Gao return shft_res; \ 1880162cd32cSSong Gao } \ 1881162cd32cSSong Gao } 1882162cd32cSSong Gao 1883162cd32cSSong Gao SSRARNS(B, H, int16_t, int8_t) 1884162cd32cSSong Gao SSRARNS(H, W, int32_t, int16_t) 1885162cd32cSSong Gao SSRARNS(W, D, int64_t, int32_t) 1886162cd32cSSong Gao 188777fca794SSong Gao #define VSSRARN(NAME, BIT, E1, E2, E3) \ 188804711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1889162cd32cSSong Gao { \ 189077fca794SSong Gao int i, j, ofs; \ 189104711da1SSong Gao VReg *Vd = (VReg *)vd; \ 189204711da1SSong Gao VReg *Vj = (VReg *)vj; \ 189304711da1SSong Gao VReg *Vk = (VReg *)vk; \ 189477fca794SSong Gao int oprsz = simd_oprsz(desc); \ 1895162cd32cSSong Gao \ 189677fca794SSong Gao ofs = LSX_LEN / BIT; \ 189777fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 189877fca794SSong Gao for (j = 0; j < ofs; j++) { \ 189977fca794SSong Gao Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ 190077fca794SSong Gao Vk->E3(j + ofs * i) % BIT, \ 190177fca794SSong Gao BIT/ 2 - 1); \ 1902162cd32cSSong Gao } \ 190377fca794SSong Gao Vd->D(2 * i + 1) = 0; \ 190477fca794SSong Gao } \ 1905162cd32cSSong Gao } 1906162cd32cSSong Gao 190777fca794SSong Gao VSSRARN(vssrarn_b_h, 16, B, H, UH) 190877fca794SSong Gao VSSRARN(vssrarn_h_w, 32, H, W, UW) 190977fca794SSong Gao VSSRARN(vssrarn_w_d, 64, W, D, UD) 1910162cd32cSSong Gao 1911162cd32cSSong Gao #define SSRLRNU(E1, E2, T1, T2, T3) \ 1912162cd32cSSong Gao static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \ 1913162cd32cSSong Gao { \ 1914162cd32cSSong Gao T1 shft_res; \ 1915162cd32cSSong Gao \ 1916162cd32cSSong Gao shft_res = do_vsrlr_ ## E2(e2, sa); \ 1917162cd32cSSong Gao \ 1918162cd32cSSong Gao T2 mask; \ 1919162cd32cSSong Gao mask = (1ull << sh) - 1; \ 1920162cd32cSSong Gao if (shft_res > mask) { \ 1921162cd32cSSong Gao return mask; \ 1922162cd32cSSong Gao } else 
{ \ 1923162cd32cSSong Gao return shft_res; \ 1924162cd32cSSong Gao } \ 1925162cd32cSSong Gao } 1926162cd32cSSong Gao 1927162cd32cSSong Gao SSRLRNU(B, H, uint16_t, uint8_t, int16_t) 1928162cd32cSSong Gao SSRLRNU(H, W, uint32_t, uint16_t, int32_t) 1929162cd32cSSong Gao SSRLRNU(W, D, uint64_t, uint32_t, int64_t) 1930162cd32cSSong Gao 193177fca794SSong Gao #define VSSRLRNU(NAME, BIT, E1, E2, E3) \ 193204711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1933162cd32cSSong Gao { \ 193477fca794SSong Gao int i, j, ofs; \ 193504711da1SSong Gao VReg *Vd = (VReg *)vd; \ 193604711da1SSong Gao VReg *Vj = (VReg *)vj; \ 193704711da1SSong Gao VReg *Vk = (VReg *)vk; \ 193877fca794SSong Gao int oprsz = simd_oprsz(desc); \ 1939162cd32cSSong Gao \ 194077fca794SSong Gao ofs = LSX_LEN / BIT; \ 194177fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 194277fca794SSong Gao for (j = 0; j < ofs; j++) { \ 194377fca794SSong Gao Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ 194477fca794SSong Gao Vk->E3(j + ofs * i) % BIT, \ 194577fca794SSong Gao BIT / 2); \ 1946162cd32cSSong Gao } \ 194777fca794SSong Gao Vd->D(2 * i + 1) = 0; \ 194877fca794SSong Gao } \ 1949162cd32cSSong Gao } 1950162cd32cSSong Gao 195177fca794SSong Gao VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH) 195277fca794SSong Gao VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW) 195377fca794SSong Gao VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD) 1954162cd32cSSong Gao 1955162cd32cSSong Gao #define SSRARNU(E1, E2, T1, T2, T3) \ 1956162cd32cSSong Gao static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \ 1957162cd32cSSong Gao { \ 1958162cd32cSSong Gao T1 shft_res; \ 1959162cd32cSSong Gao \ 1960162cd32cSSong Gao if (e2 < 0) { \ 1961162cd32cSSong Gao shft_res = 0; \ 1962162cd32cSSong Gao } else { \ 1963162cd32cSSong Gao shft_res = do_vsrar_ ## E2(e2, sa); \ 1964162cd32cSSong Gao } \ 1965162cd32cSSong Gao T2 mask; \ 1966162cd32cSSong Gao mask = (1ull << sh) - 1; \ 1967162cd32cSSong Gao if (shft_res > mask) { \ 
1968162cd32cSSong Gao return mask; \ 1969162cd32cSSong Gao } else { \ 1970162cd32cSSong Gao return shft_res; \ 1971162cd32cSSong Gao } \ 1972162cd32cSSong Gao } 1973162cd32cSSong Gao 1974162cd32cSSong Gao SSRARNU(B, H, uint16_t, uint8_t, int16_t) 1975162cd32cSSong Gao SSRARNU(H, W, uint32_t, uint16_t, int32_t) 1976162cd32cSSong Gao SSRARNU(W, D, uint64_t, uint32_t, int64_t) 1977162cd32cSSong Gao 197877fca794SSong Gao #define VSSRARNU(NAME, BIT, E1, E2, E3) \ 197904711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1980162cd32cSSong Gao { \ 198177fca794SSong Gao int i, j, ofs; \ 198204711da1SSong Gao VReg *Vd = (VReg *)vd; \ 198304711da1SSong Gao VReg *Vj = (VReg *)vj; \ 198404711da1SSong Gao VReg *Vk = (VReg *)vk; \ 198577fca794SSong Gao int oprsz = simd_oprsz(desc); \ 1986162cd32cSSong Gao \ 198777fca794SSong Gao ofs = LSX_LEN / BIT; \ 198877fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 198977fca794SSong Gao for (j = 0; j < ofs; j++) { \ 199077fca794SSong Gao Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ 199177fca794SSong Gao Vk->E3(j + ofs * i) % BIT, \ 199277fca794SSong Gao BIT / 2); \ 1993162cd32cSSong Gao } \ 199477fca794SSong Gao Vd->D(2 * i + 1) = 0; \ 199577fca794SSong Gao } \ 1996162cd32cSSong Gao } 1997162cd32cSSong Gao 199877fca794SSong Gao VSSRARNU(vssrarn_bu_h, 16, B, H, UH) 199977fca794SSong Gao VSSRARNU(vssrarn_hu_w, 32, H, W, UW) 200077fca794SSong Gao VSSRARNU(vssrarn_wu_d, 64, W, D, UD) 2001162cd32cSSong Gao 2002162cd32cSSong Gao #define VSSRLRNI(NAME, BIT, E1, E2) \ 2003329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2004162cd32cSSong Gao { \ 200577fca794SSong Gao int i, j, ofs; \ 200677fca794SSong Gao VReg temp = {}; \ 2007329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 2008329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 200977fca794SSong Gao int oprsz = simd_oprsz(desc); \ 2010162cd32cSSong Gao \ 201177fca794SSong Gao ofs = LSX_LEN / BIT; \ 201277fca794SSong 
Gao for (i = 0; i < oprsz / 16; i++) { \ 201377fca794SSong Gao for (j = 0; j < ofs; j++) { \ 201477fca794SSong Gao temp.E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ 201577fca794SSong Gao imm, BIT / 2 - 1); \ 201677fca794SSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrns_ ## E1(Vd->E2(j + ofs * i), \ 201777fca794SSong Gao imm, BIT / 2 - 1); \ 201877fca794SSong Gao } \ 2019162cd32cSSong Gao } \ 2020162cd32cSSong Gao *Vd = temp; \ 2021162cd32cSSong Gao } 2022162cd32cSSong Gao 202377fca794SSong Gao static void do_vssrlrni_q(VReg *Vd, VReg * Vj, 202477fca794SSong Gao uint64_t imm, int idx, Int128 mask) 202577fca794SSong Gao { 202677fca794SSong Gao Int128 shft_res1, shft_res2, r1, r2; 202777fca794SSong Gao if (imm == 0) { 202877fca794SSong Gao shft_res1 = Vj->Q(idx); 202977fca794SSong Gao shft_res2 = Vd->Q(idx); 203077fca794SSong Gao } else { 203177fca794SSong Gao r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one()); 203277fca794SSong Gao r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one()); 203377fca794SSong Gao shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1)); 203477fca794SSong Gao shft_res2 = (int128_add(int128_urshift(Vd->Q(idx), imm), r2)); 203577fca794SSong Gao } 203677fca794SSong Gao 203777fca794SSong Gao if (int128_ult(mask, shft_res1)) { 203877fca794SSong Gao Vd->D(idx * 2) = int128_getlo(mask); 203977fca794SSong Gao }else { 204077fca794SSong Gao Vd->D(idx * 2) = int128_getlo(shft_res1); 204177fca794SSong Gao } 204277fca794SSong Gao 204377fca794SSong Gao if (int128_ult(mask, shft_res2)) { 204477fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask); 204577fca794SSong Gao }else { 204677fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 204777fca794SSong Gao } 204877fca794SSong Gao } 204977fca794SSong Gao 205077fca794SSong Gao void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 205177fca794SSong Gao { 205277fca794SSong Gao int i; 205377fca794SSong Gao Int128 mask; 
205477fca794SSong Gao VReg *Vd = (VReg *)vd; 205577fca794SSong Gao VReg *Vj = (VReg *)vj; 205677fca794SSong Gao int oprsz = simd_oprsz(desc); 205777fca794SSong Gao 205877fca794SSong Gao mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 205977fca794SSong Gao 206077fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { 206177fca794SSong Gao do_vssrlrni_q(Vd, Vj, imm, i, mask); 206277fca794SSong Gao } 2063162cd32cSSong Gao } 2064162cd32cSSong Gao 2065162cd32cSSong Gao VSSRLRNI(vssrlrni_b_h, 16, B, H) 2066162cd32cSSong Gao VSSRLRNI(vssrlrni_h_w, 32, H, W) 2067162cd32cSSong Gao VSSRLRNI(vssrlrni_w_d, 64, W, D) 2068162cd32cSSong Gao 2069162cd32cSSong Gao #define VSSRARNI(NAME, BIT, E1, E2) \ 2070329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2071162cd32cSSong Gao { \ 207277fca794SSong Gao int i, j, ofs; \ 207377fca794SSong Gao VReg temp = {}; \ 2074329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 2075329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 207677fca794SSong Gao int oprsz = simd_oprsz(desc); \ 2077162cd32cSSong Gao \ 207877fca794SSong Gao ofs = LSX_LEN / BIT; \ 207977fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 208077fca794SSong Gao for (j = 0; j < ofs; j++) { \ 208177fca794SSong Gao temp.E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ 208277fca794SSong Gao imm, BIT / 2 - 1); \ 208377fca794SSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssrarns_ ## E1(Vd->E2(j + ofs * i), \ 208477fca794SSong Gao imm, BIT / 2 - 1); \ 208577fca794SSong Gao } \ 2086162cd32cSSong Gao } \ 2087162cd32cSSong Gao *Vd = temp; \ 2088162cd32cSSong Gao } 2089162cd32cSSong Gao 209077fca794SSong Gao static void do_vssrarni_d_q(VReg *Vd, VReg *Vj, 209177fca794SSong Gao uint64_t imm, int idx, Int128 mask1, Int128 mask2) 2092162cd32cSSong Gao { 209377fca794SSong Gao Int128 shft_res1, shft_res2, r1, r2; 2094162cd32cSSong Gao 2095162cd32cSSong Gao if (imm == 0) { 209677fca794SSong Gao shft_res1 = Vj->Q(idx); 209777fca794SSong Gao 
shft_res2 = Vd->Q(idx); 2098162cd32cSSong Gao } else { 209977fca794SSong Gao r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); 210077fca794SSong Gao r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); 210177fca794SSong Gao shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); 210277fca794SSong Gao shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); 2103162cd32cSSong Gao } 210477fca794SSong Gao if (int128_gt(shft_res1, mask1)) { 210577fca794SSong Gao Vd->D(idx * 2) = int128_getlo(mask1); 210677fca794SSong Gao } else if (int128_lt(shft_res1, int128_neg(mask2))) { 210777fca794SSong Gao Vd->D(idx * 2) = int128_getlo(mask2); 210877fca794SSong Gao } else { 210977fca794SSong Gao Vd->D(idx * 2) = int128_getlo(shft_res1); 211077fca794SSong Gao } 211177fca794SSong Gao 211277fca794SSong Gao if (int128_gt(shft_res2, mask1)) { 211377fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask1); 211477fca794SSong Gao } else if (int128_lt(shft_res2, int128_neg(mask2))) { 211577fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask2); 211677fca794SSong Gao } else { 211777fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 211877fca794SSong Gao } 211977fca794SSong Gao } 212077fca794SSong Gao 212177fca794SSong Gao void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 212277fca794SSong Gao { 212377fca794SSong Gao int i; 212477fca794SSong Gao Int128 mask1, mask2; 212577fca794SSong Gao VReg *Vd = (VReg *)vd; 212677fca794SSong Gao VReg *Vj = (VReg *)vj; 212777fca794SSong Gao int oprsz = simd_oprsz(desc); 2128162cd32cSSong Gao 2129162cd32cSSong Gao mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 2130162cd32cSSong Gao mask2 = int128_lshift(int128_one(), 63); 2131162cd32cSSong Gao 213277fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { 213377fca794SSong Gao do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2); 2134162cd32cSSong Gao } 2135162cd32cSSong Gao } 2136162cd32cSSong Gao 2137162cd32cSSong Gao 
VSSRARNI(vssrarni_b_h, 16, B, H) 2138162cd32cSSong Gao VSSRARNI(vssrarni_h_w, 32, H, W) 2139162cd32cSSong Gao VSSRARNI(vssrarni_w_d, 64, W, D) 2140162cd32cSSong Gao 2141162cd32cSSong Gao #define VSSRLRNUI(NAME, BIT, E1, E2) \ 2142329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2143162cd32cSSong Gao { \ 214477fca794SSong Gao int i, j, ofs; \ 214577fca794SSong Gao VReg temp = {}; \ 2146329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 2147329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 214877fca794SSong Gao int oprsz = simd_oprsz(desc); \ 2149162cd32cSSong Gao \ 215077fca794SSong Gao ofs = LSX_LEN / BIT; \ 215177fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 215277fca794SSong Gao for (j = 0; j < ofs; j++) { \ 215377fca794SSong Gao temp.E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ 215477fca794SSong Gao imm, BIT / 2); \ 215577fca794SSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrnu_ ## E1(Vd->E2(j + ofs * i), \ 215677fca794SSong Gao imm, BIT / 2); \ 215777fca794SSong Gao } \ 2158162cd32cSSong Gao } \ 2159162cd32cSSong Gao *Vd = temp; \ 2160162cd32cSSong Gao } 2161162cd32cSSong Gao 216277fca794SSong Gao void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 216377fca794SSong Gao { 216477fca794SSong Gao int i; 216577fca794SSong Gao Int128 mask; 216677fca794SSong Gao VReg *Vd = (VReg *)vd; 216777fca794SSong Gao VReg *Vj = (VReg *)vj; 216877fca794SSong Gao int oprsz = simd_oprsz(desc); 216977fca794SSong Gao 217077fca794SSong Gao mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 217177fca794SSong Gao 217277fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { 217377fca794SSong Gao do_vssrlrni_q(Vd, Vj, imm, i, mask); 217477fca794SSong Gao } 217577fca794SSong Gao } 217677fca794SSong Gao 2177162cd32cSSong Gao VSSRLRNUI(vssrlrni_bu_h, 16, B, H) 2178162cd32cSSong Gao VSSRLRNUI(vssrlrni_hu_w, 32, H, W) 2179162cd32cSSong Gao VSSRLRNUI(vssrlrni_wu_d, 64, W, D) 2180162cd32cSSong Gao 
2181162cd32cSSong Gao #define VSSRARNUI(NAME, BIT, E1, E2) \ 2182329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2183162cd32cSSong Gao { \ 218477fca794SSong Gao int i, j, ofs; \ 218577fca794SSong Gao VReg temp = {}; \ 2186329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 2187329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 218877fca794SSong Gao int oprsz = simd_oprsz(desc); \ 2189162cd32cSSong Gao \ 219077fca794SSong Gao ofs = LSX_LEN / BIT; \ 219177fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 219277fca794SSong Gao for (j = 0; j < ofs; j++) { \ 219377fca794SSong Gao temp.E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ 219477fca794SSong Gao imm, BIT / 2); \ 219577fca794SSong Gao temp.E1(j + ofs * (2 * i + 1)) = do_ssrarnu_ ## E1(Vd->E2(j + ofs * i), \ 219677fca794SSong Gao imm, BIT / 2); \ 219777fca794SSong Gao } \ 2198162cd32cSSong Gao } \ 2199162cd32cSSong Gao *Vd = temp; \ 2200162cd32cSSong Gao } 2201162cd32cSSong Gao 220277fca794SSong Gao static void do_vssrarni_du_q(VReg *Vd, VReg *Vj, 220377fca794SSong Gao uint64_t imm, int idx, Int128 mask1, Int128 mask2) 2204162cd32cSSong Gao { 220577fca794SSong Gao Int128 shft_res1, shft_res2, r1, r2; 2206162cd32cSSong Gao 2207162cd32cSSong Gao if (imm == 0) { 220877fca794SSong Gao shft_res1 = Vj->Q(idx); 220977fca794SSong Gao shft_res2 = Vd->Q(idx); 2210162cd32cSSong Gao } else { 221177fca794SSong Gao r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); 221277fca794SSong Gao r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); 221377fca794SSong Gao shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); 221477fca794SSong Gao shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); 2215162cd32cSSong Gao } 2216162cd32cSSong Gao 221777fca794SSong Gao if (int128_lt(Vj->Q(idx), int128_zero())) { 2218162cd32cSSong Gao shft_res1 = int128_zero(); 2219162cd32cSSong Gao } 222077fca794SSong Gao if (int128_lt(Vd->Q(idx), int128_zero())) { 
2221162cd32cSSong Gao shft_res2 = int128_zero(); 2222162cd32cSSong Gao } 2223162cd32cSSong Gao 222477fca794SSong Gao if (int128_gt(shft_res1, mask1)) { 222577fca794SSong Gao Vd->D(idx * 2) = int128_getlo(mask1); 222677fca794SSong Gao } else if (int128_lt(shft_res1, int128_neg(mask2))) { 222777fca794SSong Gao Vd->D(idx * 2) = int128_getlo(mask2); 222877fca794SSong Gao } else { 222977fca794SSong Gao Vd->D(idx * 2) = int128_getlo(shft_res1); 223077fca794SSong Gao } 223177fca794SSong Gao 223277fca794SSong Gao if (int128_gt(shft_res2, mask1)) { 223377fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask1); 223477fca794SSong Gao } else if (int128_lt(shft_res2, int128_neg(mask2))) { 223577fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(mask2); 223677fca794SSong Gao } else { 223777fca794SSong Gao Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 223877fca794SSong Gao } 223977fca794SSong Gao } 224077fca794SSong Gao 224177fca794SSong Gao void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 224277fca794SSong Gao { 224377fca794SSong Gao int i; 224477fca794SSong Gao Int128 mask1, mask2; 224577fca794SSong Gao VReg *Vd = (VReg *)vd; 224677fca794SSong Gao VReg *Vj = (VReg *)vj; 224777fca794SSong Gao int oprsz = simd_oprsz(desc); 224877fca794SSong Gao 2249162cd32cSSong Gao mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 2250162cd32cSSong Gao mask2 = int128_lshift(int128_one(), 64); 2251162cd32cSSong Gao 225277fca794SSong Gao for (i = 0; i < oprsz / 16; i++) { 225377fca794SSong Gao do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2); 2254162cd32cSSong Gao } 2255162cd32cSSong Gao } 2256162cd32cSSong Gao 2257162cd32cSSong Gao VSSRARNUI(vssrarni_bu_h, 16, B, H) 2258162cd32cSSong Gao VSSRARNUI(vssrarni_hu_w, 32, H, W) 2259162cd32cSSong Gao VSSRARNUI(vssrarni_wu_d, 64, W, D) 22602e105e12SSong Gao 22612e105e12SSong Gao #define DO_2OP(NAME, BIT, E, DO_OP) \ 2262ff27e335SSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 22632e105e12SSong Gao 
{ \ 22642e105e12SSong Gao int i; \ 2265ff27e335SSong Gao VReg *Vd = (VReg *)vd; \ 2266ff27e335SSong Gao VReg *Vj = (VReg *)vj; \ 226712ad133fSSong Gao int oprsz = simd_oprsz(desc); \ 22682e105e12SSong Gao \ 226912ad133fSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) \ 22702e105e12SSong Gao { \ 22712e105e12SSong Gao Vd->E(i) = DO_OP(Vj->E(i)); \ 22722e105e12SSong Gao } \ 22732e105e12SSong Gao } 22742e105e12SSong Gao 22752e105e12SSong Gao #define DO_CLO_B(N) (clz32(~N & 0xff) - 24) 22762e105e12SSong Gao #define DO_CLO_H(N) (clz32(~N & 0xffff) - 16) 22772e105e12SSong Gao #define DO_CLO_W(N) (clz32(~N)) 22782e105e12SSong Gao #define DO_CLO_D(N) (clz64(~N)) 22792e105e12SSong Gao #define DO_CLZ_B(N) (clz32(N) - 24) 22802e105e12SSong Gao #define DO_CLZ_H(N) (clz32(N) - 16) 22812e105e12SSong Gao #define DO_CLZ_W(N) (clz32(N)) 22822e105e12SSong Gao #define DO_CLZ_D(N) (clz64(N)) 22832e105e12SSong Gao 22842e105e12SSong Gao DO_2OP(vclo_b, 8, UB, DO_CLO_B) 22852e105e12SSong Gao DO_2OP(vclo_h, 16, UH, DO_CLO_H) 22862e105e12SSong Gao DO_2OP(vclo_w, 32, UW, DO_CLO_W) 22872e105e12SSong Gao DO_2OP(vclo_d, 64, UD, DO_CLO_D) 22882e105e12SSong Gao DO_2OP(vclz_b, 8, UB, DO_CLZ_B) 22892e105e12SSong Gao DO_2OP(vclz_h, 16, UH, DO_CLZ_H) 22902e105e12SSong Gao DO_2OP(vclz_w, 32, UW, DO_CLZ_W) 22912e105e12SSong Gao DO_2OP(vclz_d, 64, UD, DO_CLZ_D) 2292bb22ee57SSong Gao 2293bb22ee57SSong Gao #define VPCNT(NAME, BIT, E, FN) \ 2294ff27e335SSong Gao void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 2295bb22ee57SSong Gao { \ 2296bb22ee57SSong Gao int i; \ 2297ff27e335SSong Gao VReg *Vd = (VReg *)vd; \ 2298ff27e335SSong Gao VReg *Vj = (VReg *)vj; \ 2299956dec74SSong Gao int oprsz = simd_oprsz(desc); \ 2300bb22ee57SSong Gao \ 2301956dec74SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) \ 2302bb22ee57SSong Gao { \ 2303bb22ee57SSong Gao Vd->E(i) = FN(Vj->E(i)); \ 2304bb22ee57SSong Gao } \ 2305bb22ee57SSong Gao } 2306bb22ee57SSong Gao 2307bb22ee57SSong Gao VPCNT(vpcnt_b, 8, UB, ctpop8) 
2308bb22ee57SSong Gao VPCNT(vpcnt_h, 16, UH, ctpop16) 2309bb22ee57SSong Gao VPCNT(vpcnt_w, 32, UW, ctpop32) 2310bb22ee57SSong Gao VPCNT(vpcnt_d, 64, UD, ctpop64) 23110b1e6705SSong Gao 23120b1e6705SSong Gao #define DO_BITCLR(a, bit) (a & ~(1ull << bit)) 23130b1e6705SSong Gao #define DO_BITSET(a, bit) (a | 1ull << bit) 23140b1e6705SSong Gao #define DO_BITREV(a, bit) (a ^ (1ull << bit)) 23150b1e6705SSong Gao 23160b1e6705SSong Gao #define DO_BIT(NAME, BIT, E, DO_OP) \ 23171b3e242fSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 23180b1e6705SSong Gao { \ 23190b1e6705SSong Gao int i; \ 23200b1e6705SSong Gao VReg *Vd = (VReg *)vd; \ 23210b1e6705SSong Gao VReg *Vj = (VReg *)vj; \ 23220b1e6705SSong Gao VReg *Vk = (VReg *)vk; \ 23231b3e242fSSong Gao int oprsz = simd_oprsz(desc); \ 23240b1e6705SSong Gao \ 23251b3e242fSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 23260b1e6705SSong Gao Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT); \ 23270b1e6705SSong Gao } \ 23280b1e6705SSong Gao } 23290b1e6705SSong Gao 23300b1e6705SSong Gao DO_BIT(vbitclr_b, 8, UB, DO_BITCLR) 23310b1e6705SSong Gao DO_BIT(vbitclr_h, 16, UH, DO_BITCLR) 23320b1e6705SSong Gao DO_BIT(vbitclr_w, 32, UW, DO_BITCLR) 23330b1e6705SSong Gao DO_BIT(vbitclr_d, 64, UD, DO_BITCLR) 23340b1e6705SSong Gao DO_BIT(vbitset_b, 8, UB, DO_BITSET) 23350b1e6705SSong Gao DO_BIT(vbitset_h, 16, UH, DO_BITSET) 23360b1e6705SSong Gao DO_BIT(vbitset_w, 32, UW, DO_BITSET) 23370b1e6705SSong Gao DO_BIT(vbitset_d, 64, UD, DO_BITSET) 23380b1e6705SSong Gao DO_BIT(vbitrev_b, 8, UB, DO_BITREV) 23390b1e6705SSong Gao DO_BIT(vbitrev_h, 16, UH, DO_BITREV) 23400b1e6705SSong Gao DO_BIT(vbitrev_w, 32, UW, DO_BITREV) 23410b1e6705SSong Gao DO_BIT(vbitrev_d, 64, UD, DO_BITREV) 23420b1e6705SSong Gao 23430b1e6705SSong Gao #define DO_BITI(NAME, BIT, E, DO_OP) \ 23441b3e242fSSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 23450b1e6705SSong Gao { \ 23460b1e6705SSong Gao int i; \ 23470b1e6705SSong Gao VReg 
*Vd = (VReg *)vd; \ 23480b1e6705SSong Gao VReg *Vj = (VReg *)vj; \ 23491b3e242fSSong Gao int oprsz = simd_oprsz(desc); \ 23500b1e6705SSong Gao \ 23511b3e242fSSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 23520b1e6705SSong Gao Vd->E(i) = DO_OP(Vj->E(i), imm); \ 23530b1e6705SSong Gao } \ 23540b1e6705SSong Gao } 23550b1e6705SSong Gao 23560b1e6705SSong Gao DO_BITI(vbitclri_b, 8, UB, DO_BITCLR) 23570b1e6705SSong Gao DO_BITI(vbitclri_h, 16, UH, DO_BITCLR) 23580b1e6705SSong Gao DO_BITI(vbitclri_w, 32, UW, DO_BITCLR) 23590b1e6705SSong Gao DO_BITI(vbitclri_d, 64, UD, DO_BITCLR) 23600b1e6705SSong Gao DO_BITI(vbitseti_b, 8, UB, DO_BITSET) 23610b1e6705SSong Gao DO_BITI(vbitseti_h, 16, UH, DO_BITSET) 23620b1e6705SSong Gao DO_BITI(vbitseti_w, 32, UW, DO_BITSET) 23630b1e6705SSong Gao DO_BITI(vbitseti_d, 64, UD, DO_BITSET) 23640b1e6705SSong Gao DO_BITI(vbitrevi_b, 8, UB, DO_BITREV) 23650b1e6705SSong Gao DO_BITI(vbitrevi_h, 16, UH, DO_BITREV) 23660b1e6705SSong Gao DO_BITI(vbitrevi_w, 32, UW, DO_BITREV) 23670b1e6705SSong Gao DO_BITI(vbitrevi_d, 64, UD, DO_BITREV) 2368ac95a0b9SSong Gao 2369ac95a0b9SSong Gao #define VFRSTP(NAME, BIT, MASK, E) \ 237004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 2371ac95a0b9SSong Gao { \ 2372abee168eSSong Gao int i, j, m, ofs; \ 237304711da1SSong Gao VReg *Vd = (VReg *)vd; \ 237404711da1SSong Gao VReg *Vj = (VReg *)vj; \ 237504711da1SSong Gao VReg *Vk = (VReg *)vk; \ 2376abee168eSSong Gao int oprsz = simd_oprsz(desc); \ 2377ac95a0b9SSong Gao \ 2378abee168eSSong Gao ofs = LSX_LEN / BIT; \ 2379abee168eSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 2380abee168eSSong Gao m = Vk->E(i * ofs) & MASK; \ 2381abee168eSSong Gao for (j = 0; j < ofs; j++) { \ 2382abee168eSSong Gao if (Vj->E(j + ofs * i) < 0) { \ 2383ac95a0b9SSong Gao break; \ 2384ac95a0b9SSong Gao } \ 2385ac95a0b9SSong Gao } \ 2386abee168eSSong Gao Vd->E(m + i * ofs) = j; \ 2387abee168eSSong Gao } \ 2388ac95a0b9SSong Gao } 2389ac95a0b9SSong Gao 
2390ac95a0b9SSong Gao VFRSTP(vfrstp_b, 8, 0xf, B) 2391ac95a0b9SSong Gao VFRSTP(vfrstp_h, 16, 0x7, H) 2392ac95a0b9SSong Gao 2393ac95a0b9SSong Gao #define VFRSTPI(NAME, BIT, E) \ 2394329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2395ac95a0b9SSong Gao { \ 2396abee168eSSong Gao int i, j, m, ofs; \ 2397329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 2398329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 2399abee168eSSong Gao int oprsz = simd_oprsz(desc); \ 2400ac95a0b9SSong Gao \ 2401abee168eSSong Gao ofs = LSX_LEN / BIT; \ 2402abee168eSSong Gao m = imm % ofs; \ 2403abee168eSSong Gao for (i = 0; i < oprsz / 16; i++) { \ 2404abee168eSSong Gao for (j = 0; j < ofs; j++) { \ 2405abee168eSSong Gao if (Vj->E(j + ofs * i) < 0) { \ 2406ac95a0b9SSong Gao break; \ 2407ac95a0b9SSong Gao } \ 2408ac95a0b9SSong Gao } \ 2409abee168eSSong Gao Vd->E(m + i * ofs) = j; \ 2410abee168eSSong Gao } \ 2411ac95a0b9SSong Gao } 2412ac95a0b9SSong Gao 2413ac95a0b9SSong Gao VFRSTPI(vfrstpi_b, 8, B) 2414ac95a0b9SSong Gao VFRSTPI(vfrstpi_h, 16, H) 2415aca67472SSong Gao 2416aca67472SSong Gao static void vec_update_fcsr0_mask(CPULoongArchState *env, 2417aca67472SSong Gao uintptr_t pc, int mask) 2418aca67472SSong Gao { 2419aca67472SSong Gao int flags = get_float_exception_flags(&env->fp_status); 2420aca67472SSong Gao 2421aca67472SSong Gao set_float_exception_flags(0, &env->fp_status); 2422aca67472SSong Gao 2423aca67472SSong Gao flags &= ~mask; 2424aca67472SSong Gao 2425aca67472SSong Gao if (flags) { 2426aca67472SSong Gao flags = ieee_ex_to_loongarch(flags); 2427aca67472SSong Gao UPDATE_FP_CAUSE(env->fcsr0, flags); 2428aca67472SSong Gao } 2429aca67472SSong Gao 2430aca67472SSong Gao if (GET_FP_ENABLES(env->fcsr0) & flags) { 2431aca67472SSong Gao do_raise_exception(env, EXCCODE_FPE, pc); 2432aca67472SSong Gao } else { 2433aca67472SSong Gao UPDATE_FP_FLAGS(env->fcsr0, flags); 2434aca67472SSong Gao } 2435aca67472SSong Gao } 2436aca67472SSong Gao 2437aca67472SSong Gao static void 
vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc) 2438aca67472SSong Gao { 2439aca67472SSong Gao vec_update_fcsr0_mask(env, pc, 0); 2440aca67472SSong Gao } 2441aca67472SSong Gao 2442aca67472SSong Gao static inline void vec_clear_cause(CPULoongArchState *env) 2443aca67472SSong Gao { 2444aca67472SSong Gao SET_FP_CAUSE(env->fcsr0, 0); 2445aca67472SSong Gao } 2446aca67472SSong Gao 2447aca67472SSong Gao #define DO_3OP_F(NAME, BIT, E, FN) \ 24483b286753SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, \ 24493b286753SSong Gao CPULoongArchState *env, uint32_t desc) \ 2450aca67472SSong Gao { \ 2451aca67472SSong Gao int i; \ 24523b286753SSong Gao VReg *Vd = (VReg *)vd; \ 24533b286753SSong Gao VReg *Vj = (VReg *)vj; \ 24543b286753SSong Gao VReg *Vk = (VReg *)vk; \ 2455c9caf158SSong Gao int oprsz = simd_oprsz(desc); \ 2456aca67472SSong Gao \ 2457aca67472SSong Gao vec_clear_cause(env); \ 2458c9caf158SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2459aca67472SSong Gao Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ 2460aca67472SSong Gao vec_update_fcsr0(env, GETPC()); \ 2461aca67472SSong Gao } \ 2462aca67472SSong Gao } 2463aca67472SSong Gao 2464aca67472SSong Gao DO_3OP_F(vfadd_s, 32, UW, float32_add) 2465aca67472SSong Gao DO_3OP_F(vfadd_d, 64, UD, float64_add) 2466aca67472SSong Gao DO_3OP_F(vfsub_s, 32, UW, float32_sub) 2467aca67472SSong Gao DO_3OP_F(vfsub_d, 64, UD, float64_sub) 2468aca67472SSong Gao DO_3OP_F(vfmul_s, 32, UW, float32_mul) 2469aca67472SSong Gao DO_3OP_F(vfmul_d, 64, UD, float64_mul) 2470aca67472SSong Gao DO_3OP_F(vfdiv_s, 32, UW, float32_div) 2471aca67472SSong Gao DO_3OP_F(vfdiv_d, 64, UD, float64_div) 2472aca67472SSong Gao DO_3OP_F(vfmax_s, 32, UW, float32_maxnum) 2473aca67472SSong Gao DO_3OP_F(vfmax_d, 64, UD, float64_maxnum) 2474aca67472SSong Gao DO_3OP_F(vfmin_s, 32, UW, float32_minnum) 2475aca67472SSong Gao DO_3OP_F(vfmin_d, 64, UD, float64_minnum) 2476aca67472SSong Gao DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag) 2477aca67472SSong 
Gao DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag) 2478aca67472SSong Gao DO_3OP_F(vfmina_s, 32, UW, float32_minnummag) 2479aca67472SSong Gao DO_3OP_F(vfmina_d, 64, UD, float64_minnummag) 2480aca67472SSong Gao 2481aca67472SSong Gao #define DO_4OP_F(NAME, BIT, E, FN, flags) \ 2482e2600dadSSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \ 2483e2600dadSSong Gao CPULoongArchState *env, uint32_t desc) \ 2484aca67472SSong Gao { \ 2485aca67472SSong Gao int i; \ 2486e2600dadSSong Gao VReg *Vd = (VReg *)vd; \ 2487e2600dadSSong Gao VReg *Vj = (VReg *)vj; \ 2488e2600dadSSong Gao VReg *Vk = (VReg *)vk; \ 2489e2600dadSSong Gao VReg *Va = (VReg *)va; \ 2490c9caf158SSong Gao int oprsz = simd_oprsz(desc); \ 2491aca67472SSong Gao \ 2492aca67472SSong Gao vec_clear_cause(env); \ 2493c9caf158SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2494aca67472SSong Gao Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \ 2495aca67472SSong Gao vec_update_fcsr0(env, GETPC()); \ 2496aca67472SSong Gao } \ 2497aca67472SSong Gao } 2498aca67472SSong Gao 2499aca67472SSong Gao DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0) 2500aca67472SSong Gao DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0) 2501aca67472SSong Gao DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c) 2502aca67472SSong Gao DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c) 2503aca67472SSong Gao DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result) 2504aca67472SSong Gao DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result) 2505aca67472SSong Gao DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd, 2506aca67472SSong Gao float_muladd_negate_c | float_muladd_negate_result) 2507aca67472SSong Gao DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd, 2508aca67472SSong Gao float_muladd_negate_c | float_muladd_negate_result) 2509aca67472SSong Gao 2510aca67472SSong Gao #define DO_2OP_F(NAME, BIT, E, FN) \ 2511226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj, \ 
2512226bf881SSong Gao CPULoongArchState *env, uint32_t desc) \ 2513aca67472SSong Gao { \ 2514aca67472SSong Gao int i; \ 2515226bf881SSong Gao VReg *Vd = (VReg *)vd; \ 2516226bf881SSong Gao VReg *Vj = (VReg *)vj; \ 2517c9caf158SSong Gao int oprsz = simd_oprsz(desc); \ 2518aca67472SSong Gao \ 2519aca67472SSong Gao vec_clear_cause(env); \ 2520c9caf158SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2521aca67472SSong Gao Vd->E(i) = FN(env, Vj->E(i)); \ 2522aca67472SSong Gao } \ 2523aca67472SSong Gao } 2524aca67472SSong Gao 2525aca67472SSong Gao #define FLOGB(BIT, T) \ 2526aca67472SSong Gao static T do_flogb_## BIT(CPULoongArchState *env, T fj) \ 2527aca67472SSong Gao { \ 2528aca67472SSong Gao T fp, fd; \ 2529aca67472SSong Gao float_status *status = &env->fp_status; \ 2530aca67472SSong Gao FloatRoundMode old_mode = get_float_rounding_mode(status); \ 2531aca67472SSong Gao \ 2532aca67472SSong Gao set_float_rounding_mode(float_round_down, status); \ 2533aca67472SSong Gao fp = float ## BIT ##_log2(fj, status); \ 2534aca67472SSong Gao fd = float ## BIT ##_round_to_int(fp, status); \ 2535aca67472SSong Gao set_float_rounding_mode(old_mode, status); \ 2536aca67472SSong Gao vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \ 2537aca67472SSong Gao return fd; \ 2538aca67472SSong Gao } 2539aca67472SSong Gao 2540aca67472SSong Gao FLOGB(32, uint32_t) 2541aca67472SSong Gao FLOGB(64, uint64_t) 2542aca67472SSong Gao 2543aca67472SSong Gao #define FCLASS(NAME, BIT, E, FN) \ 2544226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj, \ 2545226bf881SSong Gao CPULoongArchState *env, uint32_t desc) \ 2546aca67472SSong Gao { \ 2547aca67472SSong Gao int i; \ 2548226bf881SSong Gao VReg *Vd = (VReg *)vd; \ 2549226bf881SSong Gao VReg *Vj = (VReg *)vj; \ 2550c9caf158SSong Gao int oprsz = simd_oprsz(desc); \ 2551aca67472SSong Gao \ 2552c9caf158SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2553aca67472SSong Gao Vd->E(i) = FN(env, Vj->E(i)); \ 2554aca67472SSong Gao } \ 
2555aca67472SSong Gao } 2556aca67472SSong Gao 2557aca67472SSong Gao FCLASS(vfclass_s, 32, UW, helper_fclass_s) 2558aca67472SSong Gao FCLASS(vfclass_d, 64, UD, helper_fclass_d) 2559aca67472SSong Gao 2560aca67472SSong Gao #define FSQRT(BIT, T) \ 2561aca67472SSong Gao static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \ 2562aca67472SSong Gao { \ 2563aca67472SSong Gao T fd; \ 2564aca67472SSong Gao fd = float ## BIT ##_sqrt(fj, &env->fp_status); \ 2565aca67472SSong Gao vec_update_fcsr0(env, GETPC()); \ 2566aca67472SSong Gao return fd; \ 2567aca67472SSong Gao } 2568aca67472SSong Gao 2569aca67472SSong Gao FSQRT(32, uint32_t) 2570aca67472SSong Gao FSQRT(64, uint64_t) 2571aca67472SSong Gao 2572aca67472SSong Gao #define FRECIP(BIT, T) \ 2573aca67472SSong Gao static T do_frecip_## BIT(CPULoongArchState *env, T fj) \ 2574aca67472SSong Gao { \ 2575aca67472SSong Gao T fd; \ 2576aca67472SSong Gao fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \ 2577aca67472SSong Gao vec_update_fcsr0(env, GETPC()); \ 2578aca67472SSong Gao return fd; \ 2579aca67472SSong Gao } 2580aca67472SSong Gao 2581aca67472SSong Gao FRECIP(32, uint32_t) 2582aca67472SSong Gao FRECIP(64, uint64_t) 2583aca67472SSong Gao 2584aca67472SSong Gao #define FRSQRT(BIT, T) \ 2585aca67472SSong Gao static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \ 2586aca67472SSong Gao { \ 2587aca67472SSong Gao T fd, fp; \ 2588aca67472SSong Gao fp = float ## BIT ##_sqrt(fj, &env->fp_status); \ 2589aca67472SSong Gao fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \ 2590aca67472SSong Gao vec_update_fcsr0(env, GETPC()); \ 2591aca67472SSong Gao return fd; \ 2592aca67472SSong Gao } 2593aca67472SSong Gao 2594aca67472SSong Gao FRSQRT(32, uint32_t) 2595aca67472SSong Gao FRSQRT(64, uint64_t) 2596aca67472SSong Gao 2597aca67472SSong Gao DO_2OP_F(vflogb_s, 32, UW, do_flogb_32) 2598aca67472SSong Gao DO_2OP_F(vflogb_d, 64, UD, do_flogb_64) 2599aca67472SSong Gao DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32) 
2600aca67472SSong Gao DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64) 2601aca67472SSong Gao DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32) 2602aca67472SSong Gao DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64) 2603aca67472SSong Gao DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32) 2604aca67472SSong Gao DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64) 2605399665d2SSong Gao 2606399665d2SSong Gao static uint32_t float16_cvt_float32(uint16_t h, float_status *status) 2607399665d2SSong Gao { 2608399665d2SSong Gao return float16_to_float32(h, true, status); 2609399665d2SSong Gao } 2610399665d2SSong Gao static uint64_t float32_cvt_float64(uint32_t s, float_status *status) 2611399665d2SSong Gao { 2612399665d2SSong Gao return float32_to_float64(s, status); 2613399665d2SSong Gao } 2614399665d2SSong Gao 2615399665d2SSong Gao static uint16_t float32_cvt_float16(uint32_t s, float_status *status) 2616399665d2SSong Gao { 2617399665d2SSong Gao return float32_to_float16(s, true, status); 2618399665d2SSong Gao } 2619399665d2SSong Gao static uint32_t float64_cvt_float32(uint64_t d, float_status *status) 2620399665d2SSong Gao { 2621399665d2SSong Gao return float64_to_float32(d, status); 2622399665d2SSong Gao } 2623399665d2SSong Gao 2624226bf881SSong Gao void HELPER(vfcvtl_s_h)(void *vd, void *vj, 2625226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 2626399665d2SSong Gao { 262760df31a2SSong Gao int i, j, ofs; 262860df31a2SSong Gao VReg temp = {}; 2629226bf881SSong Gao VReg *Vd = (VReg *)vd; 2630226bf881SSong Gao VReg *Vj = (VReg *)vj; 263160df31a2SSong Gao int oprsz = simd_oprsz(desc); 2632399665d2SSong Gao 263360df31a2SSong Gao ofs = LSX_LEN / 32; 2634399665d2SSong Gao vec_clear_cause(env); 263560df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 263660df31a2SSong Gao for (j = 0; j < ofs; j++) { 263760df31a2SSong Gao temp.UW(j + ofs * i) =float16_cvt_float32(Vj->UH(j + ofs * 2 * i), 263860df31a2SSong Gao &env->fp_status); 263960df31a2SSong Gao } 2640399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 
2641399665d2SSong Gao } 2642399665d2SSong Gao *Vd = temp; 2643399665d2SSong Gao } 2644399665d2SSong Gao 2645226bf881SSong Gao void HELPER(vfcvtl_d_s)(void *vd, void *vj, 2646226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 2647399665d2SSong Gao { 264860df31a2SSong Gao int i, j, ofs; 264960df31a2SSong Gao VReg temp = {}; 2650226bf881SSong Gao VReg *Vd = (VReg *)vd; 2651226bf881SSong Gao VReg *Vj = (VReg *)vj; 265260df31a2SSong Gao int oprsz = simd_oprsz(desc); 2653399665d2SSong Gao 265460df31a2SSong Gao ofs = LSX_LEN / 64; 2655399665d2SSong Gao vec_clear_cause(env); 265660df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 265760df31a2SSong Gao for (j = 0; j < ofs; j++) { 265860df31a2SSong Gao temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i), 265960df31a2SSong Gao &env->fp_status); 266060df31a2SSong Gao } 2661399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2662399665d2SSong Gao } 2663399665d2SSong Gao *Vd = temp; 2664399665d2SSong Gao } 2665399665d2SSong Gao 2666226bf881SSong Gao void HELPER(vfcvth_s_h)(void *vd, void *vj, 2667226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 2668399665d2SSong Gao { 266960df31a2SSong Gao int i, j, ofs; 267060df31a2SSong Gao VReg temp = {}; 2671226bf881SSong Gao VReg *Vd = (VReg *)vd; 2672226bf881SSong Gao VReg *Vj = (VReg *)vj; 267360df31a2SSong Gao int oprsz = simd_oprsz(desc); 2674399665d2SSong Gao 267560df31a2SSong Gao ofs = LSX_LEN / 32; 2676399665d2SSong Gao vec_clear_cause(env); 267760df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 267860df31a2SSong Gao for (j = 0; j < ofs; j++) { 267960df31a2SSong Gao temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)), 268060df31a2SSong Gao &env->fp_status); 268160df31a2SSong Gao } 2682399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2683399665d2SSong Gao } 2684399665d2SSong Gao *Vd = temp; 2685399665d2SSong Gao } 2686399665d2SSong Gao 2687226bf881SSong Gao void HELPER(vfcvth_d_s)(void *vd, void *vj, 2688226bf881SSong Gao 
CPULoongArchState *env, uint32_t desc) 2689399665d2SSong Gao { 269060df31a2SSong Gao int i, j, ofs; 269160df31a2SSong Gao VReg temp = {}; 2692226bf881SSong Gao VReg *Vd = (VReg *)vd; 2693226bf881SSong Gao VReg *Vj = (VReg *)vj; 269460df31a2SSong Gao int oprsz = simd_oprsz(desc); 2695399665d2SSong Gao 269660df31a2SSong Gao ofs = LSX_LEN / 64; 2697399665d2SSong Gao vec_clear_cause(env); 269860df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 269960df31a2SSong Gao for (j = 0; j < ofs; j++) { 270060df31a2SSong Gao temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)), 270160df31a2SSong Gao &env->fp_status); 270260df31a2SSong Gao } 2703399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2704399665d2SSong Gao } 2705399665d2SSong Gao *Vd = temp; 2706399665d2SSong Gao } 2707399665d2SSong Gao 27083b286753SSong Gao void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk, 27093b286753SSong Gao CPULoongArchState *env, uint32_t desc) 2710399665d2SSong Gao { 271160df31a2SSong Gao int i, j, ofs; 271260df31a2SSong Gao VReg temp = {}; 27133b286753SSong Gao VReg *Vd = (VReg *)vd; 27143b286753SSong Gao VReg *Vj = (VReg *)vj; 27153b286753SSong Gao VReg *Vk = (VReg *)vk; 271660df31a2SSong Gao int oprsz = simd_oprsz(desc); 2717399665d2SSong Gao 271860df31a2SSong Gao ofs = LSX_LEN / 32; 2719399665d2SSong Gao vec_clear_cause(env); 272060df31a2SSong Gao for(i = 0; i < oprsz / 16; i++) { 272160df31a2SSong Gao for (j = 0; j < ofs; j++) { 272260df31a2SSong Gao temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i), 272360df31a2SSong Gao &env->fp_status); 272460df31a2SSong Gao temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i), 272560df31a2SSong Gao &env->fp_status); 272660df31a2SSong Gao } 2727399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2728399665d2SSong Gao } 2729399665d2SSong Gao *Vd = temp; 2730399665d2SSong Gao } 2731399665d2SSong Gao 27323b286753SSong Gao void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk, 27333b286753SSong Gao 
CPULoongArchState *env, uint32_t desc) 2734399665d2SSong Gao { 273560df31a2SSong Gao int i, j, ofs; 273660df31a2SSong Gao VReg temp = {}; 27373b286753SSong Gao VReg *Vd = (VReg *)vd; 27383b286753SSong Gao VReg *Vj = (VReg *)vj; 27393b286753SSong Gao VReg *Vk = (VReg *)vk; 274060df31a2SSong Gao int oprsz = simd_oprsz(desc); 2741399665d2SSong Gao 274260df31a2SSong Gao ofs = LSX_LEN / 64; 2743399665d2SSong Gao vec_clear_cause(env); 274460df31a2SSong Gao for(i = 0; i < oprsz / 16; i++) { 274560df31a2SSong Gao for (j = 0; j < ofs; j++) { 274660df31a2SSong Gao temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i), 274760df31a2SSong Gao &env->fp_status); 274860df31a2SSong Gao temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i), 274960df31a2SSong Gao &env->fp_status); 275060df31a2SSong Gao } 2751399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2752399665d2SSong Gao } 2753399665d2SSong Gao *Vd = temp; 2754399665d2SSong Gao } 2755399665d2SSong Gao 2756226bf881SSong Gao void HELPER(vfrint_s)(void *vd, void *vj, 2757226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 2758399665d2SSong Gao { 2759399665d2SSong Gao int i; 2760226bf881SSong Gao VReg *Vd = (VReg *)vd; 2761226bf881SSong Gao VReg *Vj = (VReg *)vj; 276260df31a2SSong Gao int oprsz = simd_oprsz(desc); 2763399665d2SSong Gao 2764399665d2SSong Gao vec_clear_cause(env); 276560df31a2SSong Gao for (i = 0; i < oprsz / 4; i++) { 2766399665d2SSong Gao Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status); 2767399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2768399665d2SSong Gao } 2769399665d2SSong Gao } 2770399665d2SSong Gao 2771226bf881SSong Gao void HELPER(vfrint_d)(void *vd, void *vj, 2772226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 2773399665d2SSong Gao { 2774399665d2SSong Gao int i; 2775226bf881SSong Gao VReg *Vd = (VReg *)vd; 2776226bf881SSong Gao VReg *Vj = (VReg *)vj; 277760df31a2SSong Gao int oprsz = simd_oprsz(desc); 2778399665d2SSong Gao 2779399665d2SSong Gao 
vec_clear_cause(env); 278060df31a2SSong Gao for (i = 0; i < oprsz / 8; i++) { 2781399665d2SSong Gao Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status); 2782399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 2783399665d2SSong Gao } 2784399665d2SSong Gao } 2785399665d2SSong Gao 2786399665d2SSong Gao #define FCVT_2OP(NAME, BIT, E, MODE) \ 2787226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj, \ 2788226bf881SSong Gao CPULoongArchState *env, uint32_t desc) \ 2789399665d2SSong Gao { \ 2790399665d2SSong Gao int i; \ 2791226bf881SSong Gao VReg *Vd = (VReg *)vd; \ 2792226bf881SSong Gao VReg *Vj = (VReg *)vj; \ 279360df31a2SSong Gao int oprsz = simd_oprsz(desc); \ 2794399665d2SSong Gao \ 2795399665d2SSong Gao vec_clear_cause(env); \ 279660df31a2SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2797399665d2SSong Gao FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ 2798399665d2SSong Gao set_float_rounding_mode(MODE, &env->fp_status); \ 2799399665d2SSong Gao Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \ 2800399665d2SSong Gao set_float_rounding_mode(old_mode, &env->fp_status); \ 2801399665d2SSong Gao vec_update_fcsr0(env, GETPC()); \ 2802399665d2SSong Gao } \ 2803399665d2SSong Gao } 2804399665d2SSong Gao 2805399665d2SSong Gao FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even) 2806399665d2SSong Gao FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even) 2807399665d2SSong Gao FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero) 2808399665d2SSong Gao FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero) 2809399665d2SSong Gao FCVT_2OP(vfrintrp_s, 32, UW, float_round_up) 2810399665d2SSong Gao FCVT_2OP(vfrintrp_d, 64, UD, float_round_up) 2811399665d2SSong Gao FCVT_2OP(vfrintrm_s, 32, UW, float_round_down) 2812399665d2SSong Gao FCVT_2OP(vfrintrm_d, 64, UD, float_round_down) 2813399665d2SSong Gao 2814399665d2SSong Gao #define FTINT(NAME, FMT1, FMT2, T1, T2, MODE) \ 2815399665d2SSong Gao static T2 do_ftint ## 
NAME(CPULoongArchState *env, T1 fj) \ 2816399665d2SSong Gao { \ 2817399665d2SSong Gao T2 fd; \ 2818399665d2SSong Gao FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ 2819399665d2SSong Gao \ 2820399665d2SSong Gao set_float_rounding_mode(MODE, &env->fp_status); \ 2821399665d2SSong Gao fd = do_## FMT1 ##_to_## FMT2(env, fj); \ 2822399665d2SSong Gao set_float_rounding_mode(old_mode, &env->fp_status); \ 2823399665d2SSong Gao return fd; \ 2824399665d2SSong Gao } 2825399665d2SSong Gao 2826399665d2SSong Gao #define DO_FTINT(FMT1, FMT2, T1, T2) \ 2827399665d2SSong Gao static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \ 2828399665d2SSong Gao { \ 2829399665d2SSong Gao T2 fd; \ 2830399665d2SSong Gao \ 2831399665d2SSong Gao fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ 2832399665d2SSong Gao if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \ 2833399665d2SSong Gao if (FMT1 ##_is_any_nan(fj)) { \ 2834399665d2SSong Gao fd = 0; \ 2835399665d2SSong Gao } \ 2836399665d2SSong Gao } \ 2837399665d2SSong Gao vec_update_fcsr0(env, GETPC()); \ 2838399665d2SSong Gao return fd; \ 2839399665d2SSong Gao } 2840399665d2SSong Gao 2841399665d2SSong Gao DO_FTINT(float32, int32, uint32_t, uint32_t) 2842399665d2SSong Gao DO_FTINT(float64, int64, uint64_t, uint64_t) 2843399665d2SSong Gao DO_FTINT(float32, uint32, uint32_t, uint32_t) 2844399665d2SSong Gao DO_FTINT(float64, uint64, uint64_t, uint64_t) 2845399665d2SSong Gao DO_FTINT(float64, int32, uint64_t, uint32_t) 2846399665d2SSong Gao DO_FTINT(float32, int64, uint32_t, uint64_t) 2847399665d2SSong Gao 2848399665d2SSong Gao FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even) 2849399665d2SSong Gao FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even) 2850399665d2SSong Gao FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up) 2851399665d2SSong Gao FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up) 2852399665d2SSong Gao 
FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero) 2853399665d2SSong Gao FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero) 2854399665d2SSong Gao FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down) 2855399665d2SSong Gao FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down) 2856399665d2SSong Gao 2857399665d2SSong Gao DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s) 2858399665d2SSong Gao DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d) 2859399665d2SSong Gao DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s) 2860399665d2SSong Gao DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d) 2861399665d2SSong Gao DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s) 2862399665d2SSong Gao DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d) 2863399665d2SSong Gao DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s) 2864399665d2SSong Gao DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d) 2865399665d2SSong Gao DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32) 2866399665d2SSong Gao DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64) 2867399665d2SSong Gao 2868399665d2SSong Gao FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero) 2869399665d2SSong Gao FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero) 2870399665d2SSong Gao 2871399665d2SSong Gao DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s) 2872399665d2SSong Gao DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d) 2873399665d2SSong Gao DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32) 2874399665d2SSong Gao DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64) 2875399665d2SSong Gao 2876399665d2SSong Gao FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down) 2877399665d2SSong Gao FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up) 2878399665d2SSong Gao FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero) 2879399665d2SSong Gao FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even) 2880399665d2SSong 
Gao 2881399665d2SSong Gao #define FTINT_W_D(NAME, FN) \ 28823b286753SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, \ 28833b286753SSong Gao CPULoongArchState *env, uint32_t desc) \ 2884399665d2SSong Gao { \ 288560df31a2SSong Gao int i, j, ofs; \ 288660df31a2SSong Gao VReg temp = {}; \ 28873b286753SSong Gao VReg *Vd = (VReg *)vd; \ 28883b286753SSong Gao VReg *Vj = (VReg *)vj; \ 28893b286753SSong Gao VReg *Vk = (VReg *)vk; \ 289060df31a2SSong Gao int oprsz = simd_oprsz(desc); \ 2891399665d2SSong Gao \ 289260df31a2SSong Gao ofs = LSX_LEN / 64; \ 2893399665d2SSong Gao vec_clear_cause(env); \ 289460df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 289560df31a2SSong Gao for (j = 0; j < ofs; j++) { \ 289660df31a2SSong Gao temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \ 289760df31a2SSong Gao temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i)); \ 289860df31a2SSong Gao } \ 2899399665d2SSong Gao } \ 2900399665d2SSong Gao *Vd = temp; \ 2901399665d2SSong Gao } 2902399665d2SSong Gao 2903399665d2SSong Gao FTINT_W_D(vftint_w_d, do_float64_to_int32) 2904399665d2SSong Gao FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d) 2905399665d2SSong Gao FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d) 2906399665d2SSong Gao FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d) 2907399665d2SSong Gao FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d) 2908399665d2SSong Gao 2909399665d2SSong Gao FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down) 2910399665d2SSong Gao FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up) 2911399665d2SSong Gao FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) 2912399665d2SSong Gao FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) 2913399665d2SSong Gao FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down) 2914399665d2SSong Gao FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up) 2915399665d2SSong Gao FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, 
float_round_to_zero) 2916399665d2SSong Gao FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) 2917399665d2SSong Gao 2918399665d2SSong Gao #define FTINTL_L_S(NAME, FN) \ 2919226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj, \ 2920226bf881SSong Gao CPULoongArchState *env, uint32_t desc) \ 2921399665d2SSong Gao { \ 292260df31a2SSong Gao int i, j, ofs; \ 2923399665d2SSong Gao VReg temp; \ 2924226bf881SSong Gao VReg *Vd = (VReg *)vd; \ 2925226bf881SSong Gao VReg *Vj = (VReg *)vj; \ 292660df31a2SSong Gao int oprsz = simd_oprsz(desc); \ 2927399665d2SSong Gao \ 292860df31a2SSong Gao ofs = LSX_LEN / 64; \ 2929399665d2SSong Gao vec_clear_cause(env); \ 293060df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 293160df31a2SSong Gao for (j = 0; j < ofs; j++) { \ 293260df31a2SSong Gao temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \ 293360df31a2SSong Gao } \ 2934399665d2SSong Gao } \ 2935399665d2SSong Gao *Vd = temp; \ 2936399665d2SSong Gao } 2937399665d2SSong Gao 2938399665d2SSong Gao FTINTL_L_S(vftintl_l_s, do_float32_to_int64) 2939399665d2SSong Gao FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s) 2940399665d2SSong Gao FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s) 2941399665d2SSong Gao FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s) 2942399665d2SSong Gao FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s) 2943399665d2SSong Gao 2944399665d2SSong Gao #define FTINTH_L_S(NAME, FN) \ 2945226bf881SSong Gao void HELPER(NAME)(void *vd, void *vj, \ 2946226bf881SSong Gao CPULoongArchState *env, uint32_t desc) \ 2947399665d2SSong Gao { \ 294860df31a2SSong Gao int i, j, ofs; \ 294960df31a2SSong Gao VReg temp = {}; \ 2950226bf881SSong Gao VReg *Vd = (VReg *)vd; \ 2951226bf881SSong Gao VReg *Vj = (VReg *)vj; \ 295260df31a2SSong Gao int oprsz = simd_oprsz(desc); \ 2953399665d2SSong Gao \ 295460df31a2SSong Gao ofs = LSX_LEN / 64; \ 2955399665d2SSong Gao vec_clear_cause(env); \ 295660df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 295760df31a2SSong Gao for (j = 0; j 
< ofs; j++) { \ 295860df31a2SSong Gao temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \ 295960df31a2SSong Gao } \ 2960399665d2SSong Gao } \ 2961399665d2SSong Gao *Vd = temp; \ 2962399665d2SSong Gao } 2963399665d2SSong Gao 2964399665d2SSong Gao FTINTH_L_S(vftinth_l_s, do_float32_to_int64) 2965399665d2SSong Gao FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s) 2966399665d2SSong Gao FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s) 2967399665d2SSong Gao FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s) 2968399665d2SSong Gao FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s) 2969399665d2SSong Gao 2970399665d2SSong Gao #define FFINT(NAME, FMT1, FMT2, T1, T2) \ 2971399665d2SSong Gao static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \ 2972399665d2SSong Gao { \ 2973399665d2SSong Gao T2 fd; \ 2974399665d2SSong Gao \ 2975399665d2SSong Gao fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ 2976399665d2SSong Gao vec_update_fcsr0(env, GETPC()); \ 2977399665d2SSong Gao return fd; \ 2978399665d2SSong Gao } 2979399665d2SSong Gao 2980399665d2SSong Gao FFINT(s_w, int32, float32, int32_t, uint32_t) 2981399665d2SSong Gao FFINT(d_l, int64, float64, int64_t, uint64_t) 2982399665d2SSong Gao FFINT(s_wu, uint32, float32, uint32_t, uint32_t) 2983399665d2SSong Gao FFINT(d_lu, uint64, float64, uint64_t, uint64_t) 2984399665d2SSong Gao 2985399665d2SSong Gao DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w) 2986399665d2SSong Gao DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l) 2987399665d2SSong Gao DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu) 2988399665d2SSong Gao DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu) 2989399665d2SSong Gao 2990226bf881SSong Gao void HELPER(vffintl_d_w)(void *vd, void *vj, 2991226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 2992399665d2SSong Gao { 299360df31a2SSong Gao int i, j, ofs; 299460df31a2SSong Gao VReg temp = {}; 2995226bf881SSong Gao VReg *Vd = (VReg *)vd; 2996226bf881SSong Gao VReg *Vj = (VReg *)vj; 299760df31a2SSong Gao int oprsz = simd_oprsz(desc); 2998399665d2SSong Gao 
299960df31a2SSong Gao ofs = LSX_LEN / 64; 3000399665d2SSong Gao vec_clear_cause(env); 300160df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 300260df31a2SSong Gao for (j = 0; j < ofs; j++) { 300360df31a2SSong Gao temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i), 300460df31a2SSong Gao &env->fp_status); 300560df31a2SSong Gao } 3006399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 3007399665d2SSong Gao } 3008399665d2SSong Gao *Vd = temp; 3009399665d2SSong Gao } 3010399665d2SSong Gao 3011226bf881SSong Gao void HELPER(vffinth_d_w)(void *vd, void *vj, 3012226bf881SSong Gao CPULoongArchState *env, uint32_t desc) 3013399665d2SSong Gao { 301460df31a2SSong Gao int i, j, ofs; 301560df31a2SSong Gao VReg temp = {}; 3016226bf881SSong Gao VReg *Vd = (VReg *)vd; 3017226bf881SSong Gao VReg *Vj = (VReg *)vj; 301860df31a2SSong Gao int oprsz = simd_oprsz(desc); 3019399665d2SSong Gao 302060df31a2SSong Gao ofs = LSX_LEN / 64; 3021399665d2SSong Gao vec_clear_cause(env); 302260df31a2SSong Gao for (i = 0; i < oprsz /16; i++) { 302360df31a2SSong Gao for (j = 0; j < ofs; j++) { 302460df31a2SSong Gao temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)), 302560df31a2SSong Gao &env->fp_status); 302660df31a2SSong Gao } 3027399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 3028399665d2SSong Gao } 3029399665d2SSong Gao *Vd = temp; 3030399665d2SSong Gao } 3031399665d2SSong Gao 30323b286753SSong Gao void HELPER(vffint_s_l)(void *vd, void *vj, void *vk, 30333b286753SSong Gao CPULoongArchState *env, uint32_t desc) 3034399665d2SSong Gao { 303560df31a2SSong Gao int i, j, ofs; 303660df31a2SSong Gao VReg temp = {}; 30373b286753SSong Gao VReg *Vd = (VReg *)vd; 30383b286753SSong Gao VReg *Vj = (VReg *)vj; 30393b286753SSong Gao VReg *Vk = (VReg *)vk; 304060df31a2SSong Gao int oprsz = simd_oprsz(desc); 3041399665d2SSong Gao 304260df31a2SSong Gao ofs = LSX_LEN / 64; 3043399665d2SSong Gao vec_clear_cause(env); 304460df31a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 
304560df31a2SSong Gao for (j = 0; j < ofs; j++) { 304660df31a2SSong Gao temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i), 304760df31a2SSong Gao &env->fp_status); 304860df31a2SSong Gao temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i), 304960df31a2SSong Gao &env->fp_status); 305060df31a2SSong Gao } 3051399665d2SSong Gao vec_update_fcsr0(env, GETPC()); 3052399665d2SSong Gao } 3053399665d2SSong Gao *Vd = temp; 3054399665d2SSong Gao } 3055f435e1e5SSong Gao 3056f435e1e5SSong Gao #define VSEQ(a, b) (a == b ? -1 : 0) 3057f435e1e5SSong Gao #define VSLE(a, b) (a <= b ? -1 : 0) 3058f435e1e5SSong Gao #define VSLT(a, b) (a < b ? -1 : 0) 3059f435e1e5SSong Gao 3060f435e1e5SSong Gao #define VCMPI(NAME, BIT, E, DO_OP) \ 30614da72d43SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3062f435e1e5SSong Gao { \ 3063f435e1e5SSong Gao int i; \ 3064f435e1e5SSong Gao VReg *Vd = (VReg *)vd; \ 3065f435e1e5SSong Gao VReg *Vj = (VReg *)vj; \ 3066f435e1e5SSong Gao typedef __typeof(Vd->E(0)) TD; \ 30674da72d43SSong Gao int oprsz = simd_oprsz(desc); \ 3068f435e1e5SSong Gao \ 30694da72d43SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3070f435e1e5SSong Gao Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ 3071f435e1e5SSong Gao } \ 3072f435e1e5SSong Gao } 3073f435e1e5SSong Gao 3074f435e1e5SSong Gao VCMPI(vseqi_b, 8, B, VSEQ) 3075f435e1e5SSong Gao VCMPI(vseqi_h, 16, H, VSEQ) 3076f435e1e5SSong Gao VCMPI(vseqi_w, 32, W, VSEQ) 3077f435e1e5SSong Gao VCMPI(vseqi_d, 64, D, VSEQ) 3078f435e1e5SSong Gao VCMPI(vslei_b, 8, B, VSLE) 3079f435e1e5SSong Gao VCMPI(vslei_h, 16, H, VSLE) 3080f435e1e5SSong Gao VCMPI(vslei_w, 32, W, VSLE) 3081f435e1e5SSong Gao VCMPI(vslei_d, 64, D, VSLE) 3082f435e1e5SSong Gao VCMPI(vslei_bu, 8, UB, VSLE) 3083f435e1e5SSong Gao VCMPI(vslei_hu, 16, UH, VSLE) 3084f435e1e5SSong Gao VCMPI(vslei_wu, 32, UW, VSLE) 3085f435e1e5SSong Gao VCMPI(vslei_du, 64, UD, VSLE) 3086f435e1e5SSong Gao VCMPI(vslti_b, 8, B, VSLT) 3087f435e1e5SSong Gao 
VCMPI(vslti_h, 16, H, VSLT) 3088f435e1e5SSong Gao VCMPI(vslti_w, 32, W, VSLT) 3089f435e1e5SSong Gao VCMPI(vslti_d, 64, D, VSLT) 3090f435e1e5SSong Gao VCMPI(vslti_bu, 8, UB, VSLT) 3091f435e1e5SSong Gao VCMPI(vslti_hu, 16, UH, VSLT) 3092f435e1e5SSong Gao VCMPI(vslti_wu, 32, UW, VSLT) 3093f435e1e5SSong Gao VCMPI(vslti_du, 64, UD, VSLT) 3094386c4e86SSong Gao 3095386c4e86SSong Gao static uint64_t vfcmp_common(CPULoongArchState *env, 3096386c4e86SSong Gao FloatRelation cmp, uint32_t flags) 3097386c4e86SSong Gao { 3098386c4e86SSong Gao uint64_t ret = 0; 3099386c4e86SSong Gao 3100386c4e86SSong Gao switch (cmp) { 3101386c4e86SSong Gao case float_relation_less: 3102386c4e86SSong Gao ret = (flags & FCMP_LT); 3103386c4e86SSong Gao break; 3104386c4e86SSong Gao case float_relation_equal: 3105386c4e86SSong Gao ret = (flags & FCMP_EQ); 3106386c4e86SSong Gao break; 3107386c4e86SSong Gao case float_relation_greater: 3108386c4e86SSong Gao ret = (flags & FCMP_GT); 3109386c4e86SSong Gao break; 3110386c4e86SSong Gao case float_relation_unordered: 3111386c4e86SSong Gao ret = (flags & FCMP_UN); 3112386c4e86SSong Gao break; 3113386c4e86SSong Gao default: 3114386c4e86SSong Gao g_assert_not_reached(); 3115386c4e86SSong Gao } 3116386c4e86SSong Gao 3117386c4e86SSong Gao if (ret) { 3118386c4e86SSong Gao ret = -1; 3119386c4e86SSong Gao } 3120386c4e86SSong Gao 3121386c4e86SSong Gao return ret; 3122386c4e86SSong Gao } 3123386c4e86SSong Gao 3124386c4e86SSong Gao #define VFCMP(NAME, BIT, E, FN) \ 31253eeda5feSSong Gao void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ 3126386c4e86SSong Gao uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \ 3127386c4e86SSong Gao { \ 3128386c4e86SSong Gao int i; \ 3129386c4e86SSong Gao VReg t; \ 3130386c4e86SSong Gao VReg *Vd = &(env->fpr[vd].vreg); \ 3131386c4e86SSong Gao VReg *Vj = &(env->fpr[vj].vreg); \ 3132386c4e86SSong Gao VReg *Vk = &(env->fpr[vk].vreg); \ 3133386c4e86SSong Gao \ 3134386c4e86SSong Gao vec_clear_cause(env); \ 31353eeda5feSSong Gao 
for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3136386c4e86SSong Gao FloatRelation cmp; \ 3137386c4e86SSong Gao cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ 3138386c4e86SSong Gao t.E(i) = vfcmp_common(env, cmp, flags); \ 3139386c4e86SSong Gao vec_update_fcsr0(env, GETPC()); \ 3140386c4e86SSong Gao } \ 3141386c4e86SSong Gao *Vd = t; \ 3142386c4e86SSong Gao } 3143386c4e86SSong Gao 3144386c4e86SSong Gao VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet) 3145386c4e86SSong Gao VFCMP(vfcmp_s_s, 32, UW, float32_compare) 3146386c4e86SSong Gao VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet) 3147386c4e86SSong Gao VFCMP(vfcmp_s_d, 64, UD, float64_compare) 3148d0dfa19aSSong Gao 3149f3dfcc8bSSong Gao void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3150d0dfa19aSSong Gao { 3151d0dfa19aSSong Gao int i; 3152d0dfa19aSSong Gao VReg *Vd = (VReg *)vd; 3153d0dfa19aSSong Gao VReg *Vj = (VReg *)vj; 3154d0dfa19aSSong Gao 3155f3dfcc8bSSong Gao for (i = 0; i < simd_oprsz(desc); i++) { 3156d0dfa19aSSong Gao Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm); 3157d0dfa19aSSong Gao } 3158d0dfa19aSSong Gao } 3159d0dfa19aSSong Gao 3160d0dfa19aSSong Gao /* Copy from target/arm/tcg/sve_helper.c */ 3161d0dfa19aSSong Gao static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 3162d0dfa19aSSong Gao { 3163f3dfcc8bSSong Gao int bits = 8 << esz; 3164d0dfa19aSSong Gao uint64_t ones = dup_const(esz, 1); 3165d0dfa19aSSong Gao uint64_t signs = ones << (bits - 1); 3166d0dfa19aSSong Gao uint64_t cmp0, cmp1; 3167d0dfa19aSSong Gao 3168d0dfa19aSSong Gao cmp1 = dup_const(esz, n); 3169d0dfa19aSSong Gao cmp0 = cmp1 ^ m0; 3170d0dfa19aSSong Gao cmp1 = cmp1 ^ m1; 3171d0dfa19aSSong Gao cmp0 = (cmp0 - ones) & ~cmp0; 3172d0dfa19aSSong Gao cmp1 = (cmp1 - ones) & ~cmp1; 3173d0dfa19aSSong Gao return (cmp0 | cmp1) & signs; 3174d0dfa19aSSong Gao } 3175d0dfa19aSSong Gao 3176d0dfa19aSSong Gao #define SETANYEQZ(NAME, MO) \ 3177f3dfcc8bSSong Gao void HELPER(NAME)(CPULoongArchState *env, 
\ 3178f3dfcc8bSSong Gao uint32_t oprsz, uint32_t cd, uint32_t vj) \ 3179d0dfa19aSSong Gao { \ 3180d0dfa19aSSong Gao VReg *Vj = &(env->fpr[vj].vreg); \ 3181d0dfa19aSSong Gao \ 3182d0dfa19aSSong Gao env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \ 3183f3dfcc8bSSong Gao if (oprsz == 32) { \ 3184f3dfcc8bSSong Gao env->cf[cd & 0x7] = env->cf[cd & 0x7] || \ 3185f3dfcc8bSSong Gao do_match2(0, Vj->D(2), Vj->D(3), MO); \ 3186f3dfcc8bSSong Gao } \ 3187d0dfa19aSSong Gao } 3188f3dfcc8bSSong Gao 3189d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_b, MO_8) 3190d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_h, MO_16) 3191d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_w, MO_32) 3192d0dfa19aSSong Gao SETANYEQZ(vsetanyeqz_d, MO_64) 3193d0dfa19aSSong Gao 3194d0dfa19aSSong Gao #define SETALLNEZ(NAME, MO) \ 3195f3dfcc8bSSong Gao void HELPER(NAME)(CPULoongArchState *env, \ 3196f3dfcc8bSSong Gao uint32_t oprsz, uint32_t cd, uint32_t vj) \ 3197d0dfa19aSSong Gao { \ 3198d0dfa19aSSong Gao VReg *Vj = &(env->fpr[vj].vreg); \ 3199d0dfa19aSSong Gao \ 3200d0dfa19aSSong Gao env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO); \ 3201f3dfcc8bSSong Gao if (oprsz == 32) { \ 3202f3dfcc8bSSong Gao env->cf[cd & 0x7] = env->cf[cd & 0x7] && \ 3203f3dfcc8bSSong Gao !do_match2(0, Vj->D(2), Vj->D(3), MO); \ 3204f3dfcc8bSSong Gao } \ 3205d0dfa19aSSong Gao } 3206f3dfcc8bSSong Gao 3207d0dfa19aSSong Gao SETALLNEZ(vsetallnez_b, MO_8) 3208d0dfa19aSSong Gao SETALLNEZ(vsetallnez_h, MO_16) 3209d0dfa19aSSong Gao SETALLNEZ(vsetallnez_w, MO_32) 3210d0dfa19aSSong Gao SETALLNEZ(vsetallnez_d, MO_64) 3211d5e5563cSSong Gao 3212df97f338SSong Gao #define XVINSVE0(NAME, E, MASK) \ 3213df97f338SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3214df97f338SSong Gao { \ 3215df97f338SSong Gao VReg *Vd = (VReg *)vd; \ 3216df97f338SSong Gao VReg *Vj = (VReg *)vj; \ 3217df97f338SSong Gao Vd->E(imm & MASK) = Vj->E(0); \ 3218df97f338SSong Gao } 3219df97f338SSong Gao 3220df97f338SSong Gao XVINSVE0(xvinsve0_w, W, 0x7) 
3221df97f338SSong Gao XVINSVE0(xvinsve0_d, D, 0x3) 3222df97f338SSong Gao 3223df97f338SSong Gao #define XVPICKVE(NAME, E, BIT, MASK) \ 3224df97f338SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3225df97f338SSong Gao { \ 3226df97f338SSong Gao int i; \ 3227df97f338SSong Gao VReg *Vd = (VReg *)vd; \ 3228df97f338SSong Gao VReg *Vj = (VReg *)vj; \ 3229df97f338SSong Gao int oprsz = simd_oprsz(desc); \ 3230df97f338SSong Gao \ 3231df97f338SSong Gao Vd->E(0) = Vj->E(imm & MASK); \ 3232df97f338SSong Gao for (i = 1; i < oprsz / (BIT / 8); i++) { \ 3233df97f338SSong Gao Vd->E(i) = 0; \ 3234df97f338SSong Gao } \ 3235df97f338SSong Gao } 3236df97f338SSong Gao 3237df97f338SSong Gao XVPICKVE(xvpickve_w, W, 32, 0x7) 3238df97f338SSong Gao XVPICKVE(xvpickve_d, D, 64, 0x3) 3239df97f338SSong Gao 3240d5e5563cSSong Gao #define VPACKEV(NAME, BIT, E) \ 324104711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3242d5e5563cSSong Gao { \ 3243d5e5563cSSong Gao int i; \ 3244ad292148SSong Gao VReg temp = {}; \ 324504711da1SSong Gao VReg *Vd = (VReg *)vd; \ 324604711da1SSong Gao VReg *Vj = (VReg *)vj; \ 324704711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3248ad292148SSong Gao int oprsz = simd_oprsz(desc); \ 3249d5e5563cSSong Gao \ 3250ad292148SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3251d5e5563cSSong Gao temp.E(2 * i + 1) = Vj->E(2 * i); \ 3252d5e5563cSSong Gao temp.E(2 *i) = Vk->E(2 * i); \ 3253d5e5563cSSong Gao } \ 3254d5e5563cSSong Gao *Vd = temp; \ 3255d5e5563cSSong Gao } 3256d5e5563cSSong Gao 3257d5e5563cSSong Gao VPACKEV(vpackev_b, 16, B) 3258d5e5563cSSong Gao VPACKEV(vpackev_h, 32, H) 3259d5e5563cSSong Gao VPACKEV(vpackev_w, 64, W) 3260d5e5563cSSong Gao VPACKEV(vpackev_d, 128, D) 3261d5e5563cSSong Gao 3262d5e5563cSSong Gao #define VPACKOD(NAME, BIT, E) \ 326304711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3264d5e5563cSSong Gao { \ 3265d5e5563cSSong Gao int i; \ 3266ad292148SSong Gao VReg 
temp = {}; \ 326704711da1SSong Gao VReg *Vd = (VReg *)vd; \ 326804711da1SSong Gao VReg *Vj = (VReg *)vj; \ 326904711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3270ad292148SSong Gao int oprsz = simd_oprsz(desc); \ 3271d5e5563cSSong Gao \ 3272ad292148SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3273d5e5563cSSong Gao temp.E(2 * i + 1) = Vj->E(2 * i + 1); \ 3274d5e5563cSSong Gao temp.E(2 * i) = Vk->E(2 * i + 1); \ 3275d5e5563cSSong Gao } \ 3276d5e5563cSSong Gao *Vd = temp; \ 3277d5e5563cSSong Gao } 3278d5e5563cSSong Gao 3279d5e5563cSSong Gao VPACKOD(vpackod_b, 16, B) 3280d5e5563cSSong Gao VPACKOD(vpackod_h, 32, H) 3281d5e5563cSSong Gao VPACKOD(vpackod_w, 64, W) 3282d5e5563cSSong Gao VPACKOD(vpackod_d, 128, D) 3283d5e5563cSSong Gao 3284d5e5563cSSong Gao #define VPICKEV(NAME, BIT, E) \ 328504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3286d5e5563cSSong Gao { \ 3287ad292148SSong Gao int i, j, ofs; \ 3288ad292148SSong Gao VReg temp = {}; \ 328904711da1SSong Gao VReg *Vd = (VReg *)vd; \ 329004711da1SSong Gao VReg *Vj = (VReg *)vj; \ 329104711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3292ad292148SSong Gao int oprsz = simd_oprsz(desc); \ 3293d5e5563cSSong Gao \ 3294ad292148SSong Gao ofs = LSX_LEN / BIT; \ 3295ad292148SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 3296ad292148SSong Gao for (j = 0; j < ofs; j++) { \ 3297ad292148SSong Gao temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \ 3298ad292148SSong Gao temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i)); \ 3299ad292148SSong Gao } \ 3300d5e5563cSSong Gao } \ 3301d5e5563cSSong Gao *Vd = temp; \ 3302d5e5563cSSong Gao } 3303d5e5563cSSong Gao 3304d5e5563cSSong Gao VPICKEV(vpickev_b, 16, B) 3305d5e5563cSSong Gao VPICKEV(vpickev_h, 32, H) 3306d5e5563cSSong Gao VPICKEV(vpickev_w, 64, W) 3307d5e5563cSSong Gao VPICKEV(vpickev_d, 128, D) 3308d5e5563cSSong Gao 3309d5e5563cSSong Gao #define VPICKOD(NAME, BIT, E) \ 331004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, 
uint32_t desc) \ 3311d5e5563cSSong Gao { \ 3312ad292148SSong Gao int i, j, ofs; \ 3313ad292148SSong Gao VReg temp = {}; \ 331404711da1SSong Gao VReg *Vd = (VReg *)vd; \ 331504711da1SSong Gao VReg *Vj = (VReg *)vj; \ 331604711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3317ad292148SSong Gao int oprsz = simd_oprsz(desc); \ 3318d5e5563cSSong Gao \ 3319ad292148SSong Gao ofs = LSX_LEN / BIT; \ 3320ad292148SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 3321ad292148SSong Gao for (j = 0; j < ofs; j++) { \ 3322ad292148SSong Gao temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \ 3323ad292148SSong Gao temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1); \ 3324ad292148SSong Gao } \ 3325d5e5563cSSong Gao } \ 3326d5e5563cSSong Gao *Vd = temp; \ 3327d5e5563cSSong Gao } 3328d5e5563cSSong Gao 3329d5e5563cSSong Gao VPICKOD(vpickod_b, 16, B) 3330d5e5563cSSong Gao VPICKOD(vpickod_h, 32, H) 3331d5e5563cSSong Gao VPICKOD(vpickod_w, 64, W) 3332d5e5563cSSong Gao VPICKOD(vpickod_d, 128, D) 3333e93dd431SSong Gao 3334e93dd431SSong Gao #define VILVL(NAME, BIT, E) \ 333504711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3336e93dd431SSong Gao { \ 3337ad292148SSong Gao int i, j, ofs; \ 3338ad292148SSong Gao VReg temp = {}; \ 333904711da1SSong Gao VReg *Vd = (VReg *)vd; \ 334004711da1SSong Gao VReg *Vj = (VReg *)vj; \ 334104711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3342ad292148SSong Gao int oprsz = simd_oprsz(desc); \ 3343e93dd431SSong Gao \ 3344ad292148SSong Gao ofs = LSX_LEN / BIT; \ 3345ad292148SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 3346ad292148SSong Gao for (j = 0; j < ofs; j++) { \ 3347ad292148SSong Gao temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \ 3348ad292148SSong Gao temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i); \ 3349ad292148SSong Gao } \ 3350e93dd431SSong Gao } \ 3351e93dd431SSong Gao *Vd = temp; \ 3352e93dd431SSong Gao } 3353e93dd431SSong Gao 3354e93dd431SSong Gao VILVL(vilvl_b, 16, B) 3355e93dd431SSong Gao 
VILVL(vilvl_h, 32, H) 3356e93dd431SSong Gao VILVL(vilvl_w, 64, W) 3357e93dd431SSong Gao VILVL(vilvl_d, 128, D) 3358e93dd431SSong Gao 3359e93dd431SSong Gao #define VILVH(NAME, BIT, E) \ 336004711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3361e93dd431SSong Gao { \ 3362ad292148SSong Gao int i, j, ofs; \ 3363ad292148SSong Gao VReg temp = {}; \ 336404711da1SSong Gao VReg *Vd = (VReg *)vd; \ 336504711da1SSong Gao VReg *Vj = (VReg *)vj; \ 336604711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3367ad292148SSong Gao int oprsz = simd_oprsz(desc); \ 3368e93dd431SSong Gao \ 3369ad292148SSong Gao ofs = LSX_LEN / BIT; \ 3370ad292148SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 3371ad292148SSong Gao for (j = 0; j < ofs; j++) { \ 3372ad292148SSong Gao temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \ 3373ad292148SSong Gao temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1)); \ 3374ad292148SSong Gao } \ 3375e93dd431SSong Gao } \ 3376e93dd431SSong Gao *Vd = temp; \ 3377e93dd431SSong Gao } 3378e93dd431SSong Gao 3379e93dd431SSong Gao VILVH(vilvh_b, 16, B) 3380e93dd431SSong Gao VILVH(vilvh_h, 32, H) 3381e93dd431SSong Gao VILVH(vilvh_w, 64, W) 3382e93dd431SSong Gao VILVH(vilvh_d, 128, D) 3383e93dd431SSong Gao 3384513e88a2SSong Gao #define SHF_POS(i, imm) (((i) & 0xfc) + (((imm) >> (2 * ((i) & 0x03))) & 0x03)) 3385513e88a2SSong Gao 3386eb48ab22SSong Gao void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc) 3387e93dd431SSong Gao { 3388513e88a2SSong Gao int i, j, m; 3389513e88a2SSong Gao VReg temp = {}; 3390eb48ab22SSong Gao VReg *Vd = (VReg *)vd; 3391eb48ab22SSong Gao VReg *Vj = (VReg *)vj; 3392eb48ab22SSong Gao VReg *Vk = (VReg *)vk; 3393eb48ab22SSong Gao VReg *Va = (VReg *)va; 3394513e88a2SSong Gao int oprsz = simd_oprsz(desc); 3395e93dd431SSong Gao 3396e93dd431SSong Gao m = LSX_LEN / 8; 3397513e88a2SSong Gao for (i = 0; i < (oprsz / 16) * m; i++) { 3398513e88a2SSong Gao j = i < m ? 
0 : 1; 3399e93dd431SSong Gao uint64_t k = (uint8_t)Va->B(i) % (2 * m); 3400513e88a2SSong Gao temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m); 3401e93dd431SSong Gao } 3402e93dd431SSong Gao *Vd = temp; 3403e93dd431SSong Gao } 3404e93dd431SSong Gao 3405e93dd431SSong Gao #define VSHUF(NAME, BIT, E) \ 340604711da1SSong Gao void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3407e93dd431SSong Gao { \ 3408513e88a2SSong Gao int i, j, m; \ 3409513e88a2SSong Gao VReg temp = {}; \ 341004711da1SSong Gao VReg *Vd = (VReg *)vd; \ 341104711da1SSong Gao VReg *Vj = (VReg *)vj; \ 341204711da1SSong Gao VReg *Vk = (VReg *)vk; \ 3413513e88a2SSong Gao int oprsz = simd_oprsz(desc); \ 3414e93dd431SSong Gao \ 3415e93dd431SSong Gao m = LSX_LEN / BIT; \ 3416513e88a2SSong Gao for (i = 0; i < (oprsz / 16) * m; i++) { \ 3417513e88a2SSong Gao j = i < m ? 0 : 1; \ 3418e93dd431SSong Gao uint64_t k = ((uint8_t)Vd->E(i)) % (2 * m); \ 3419513e88a2SSong Gao temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \ 3420e93dd431SSong Gao } \ 3421e93dd431SSong Gao *Vd = temp; \ 3422e93dd431SSong Gao } 3423e93dd431SSong Gao 3424e93dd431SSong Gao VSHUF(vshuf_h, 16, H) 3425e93dd431SSong Gao VSHUF(vshuf_w, 32, W) 3426e93dd431SSong Gao VSHUF(vshuf_d, 64, D) 3427e93dd431SSong Gao 3428e93dd431SSong Gao #define VSHUF4I(NAME, BIT, E) \ 3429329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3430e93dd431SSong Gao { \ 3431513e88a2SSong Gao int i, j, max; \ 3432513e88a2SSong Gao VReg temp = {}; \ 3433329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 3434329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 3435513e88a2SSong Gao int oprsz = simd_oprsz(desc); \ 3436e93dd431SSong Gao \ 3437513e88a2SSong Gao max = LSX_LEN / BIT; \ 3438513e88a2SSong Gao for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3439513e88a2SSong Gao j = i < max ? 
1 : 2; \ 3440513e88a2SSong Gao temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \ 3441e93dd431SSong Gao } \ 3442e93dd431SSong Gao *Vd = temp; \ 3443e93dd431SSong Gao } 3444e93dd431SSong Gao 3445e93dd431SSong Gao VSHUF4I(vshuf4i_b, 8, B) 3446e93dd431SSong Gao VSHUF4I(vshuf4i_h, 16, H) 3447e93dd431SSong Gao VSHUF4I(vshuf4i_w, 32, W) 3448e93dd431SSong Gao 3449329517d5SSong Gao void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3450e93dd431SSong Gao { 3451513e88a2SSong Gao int i; 3452513e88a2SSong Gao VReg temp = {}; 3453329517d5SSong Gao VReg *Vd = (VReg *)vd; 3454329517d5SSong Gao VReg *Vj = (VReg *)vj; 3455513e88a2SSong Gao int oprsz = simd_oprsz(desc); 3456e93dd431SSong Gao 3457513e88a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 3458513e88a2SSong Gao temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i); 3459513e88a2SSong Gao temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i); 3460513e88a2SSong Gao } 3461513e88a2SSong Gao *Vd = temp; 3462513e88a2SSong Gao } 3463513e88a2SSong Gao 3464513e88a2SSong Gao void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc) 3465513e88a2SSong Gao { 3466513e88a2SSong Gao int i, m; 3467513e88a2SSong Gao VReg temp = {}; 3468513e88a2SSong Gao VReg *Vd = (VReg *)vd; 3469513e88a2SSong Gao VReg *Vj = (VReg *)vj; 3470513e88a2SSong Gao VReg *Vk = (VReg *)vk; 3471513e88a2SSong Gao 3472513e88a2SSong Gao m = LASX_LEN / 32; 3473513e88a2SSong Gao for (i = 0; i < m ; i++) { 3474513e88a2SSong Gao uint64_t k = (uint8_t)Vk->W(i) % 8; 3475513e88a2SSong Gao temp.W(i) = Vj->W(k); 3476513e88a2SSong Gao } 3477e93dd431SSong Gao *Vd = temp; 3478e93dd431SSong Gao } 3479e93dd431SSong Gao 3480329517d5SSong Gao void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3481e93dd431SSong Gao { 3482513e88a2SSong Gao int i; 3483513e88a2SSong Gao VReg temp = {}; 3484513e88a2SSong Gao VReg *Vd = (VReg *)vd; 3485513e88a2SSong Gao VReg *Vj = (VReg *)vj; 3486513e88a2SSong Gao int oprsz = 
simd_oprsz(desc); 3487513e88a2SSong Gao 3488513e88a2SSong Gao for (i = 0; i < oprsz / 16; i++) { 3489513e88a2SSong Gao temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i); 3490513e88a2SSong Gao temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i); 3491513e88a2SSong Gao temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i); 3492513e88a2SSong Gao temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i); 3493513e88a2SSong Gao } 3494513e88a2SSong Gao *Vd = temp; 3495513e88a2SSong Gao } 3496513e88a2SSong Gao 3497513e88a2SSong Gao void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3498513e88a2SSong Gao { 3499513e88a2SSong Gao VReg temp = {}; 3500513e88a2SSong Gao VReg *Vd = (VReg *)vd; 3501513e88a2SSong Gao VReg *Vj = (VReg *)vj; 3502513e88a2SSong Gao 3503513e88a2SSong Gao temp.D(0) = Vj->D(imm & 0x3); 3504513e88a2SSong Gao temp.D(1) = Vj->D((imm >> 2) & 0x3); 3505513e88a2SSong Gao temp.D(2) = Vj->D((imm >> 4) & 0x3); 3506513e88a2SSong Gao temp.D(3) = Vj->D((imm >> 6) & 0x3); 3507513e88a2SSong Gao *Vd = temp; 3508513e88a2SSong Gao } 3509513e88a2SSong Gao 3510513e88a2SSong Gao void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3511513e88a2SSong Gao { 3512513e88a2SSong Gao int i; 3513e93dd431SSong Gao VReg temp; 3514329517d5SSong Gao VReg *Vd = (VReg *)vd; 3515329517d5SSong Gao VReg *Vj = (VReg *)vj; 3516e93dd431SSong Gao 3517513e88a2SSong Gao for (i = 0; i < 2; i++, imm >>= 4) { 3518513e88a2SSong Gao temp.Q(i) = (imm & 2 ? 
Vd: Vj)->Q(imm & 1); 3519513e88a2SSong Gao } 3520e93dd431SSong Gao *Vd = temp; 3521e93dd431SSong Gao } 3522e93dd431SSong Gao 3523e93dd431SSong Gao #define VEXTRINS(NAME, BIT, E, MASK) \ 3524329517d5SSong Gao void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3525e93dd431SSong Gao { \ 3526513e88a2SSong Gao int i, ins, extr, max; \ 3527329517d5SSong Gao VReg *Vd = (VReg *)vd; \ 3528329517d5SSong Gao VReg *Vj = (VReg *)vj; \ 3529513e88a2SSong Gao int oprsz = simd_oprsz(desc); \ 3530e93dd431SSong Gao \ 3531513e88a2SSong Gao max = LSX_LEN / BIT; \ 3532e93dd431SSong Gao ins = (imm >> 4) & MASK; \ 3533e93dd431SSong Gao extr = imm & MASK; \ 3534513e88a2SSong Gao for (i = 0; i < oprsz / 16; i++) { \ 3535513e88a2SSong Gao Vd->E(ins + i * max) = Vj->E(extr + i * max); \ 3536513e88a2SSong Gao } \ 3537e93dd431SSong Gao } 3538e93dd431SSong Gao 3539e93dd431SSong Gao VEXTRINS(vextrins_b, 8, B, 0xf) 3540e93dd431SSong Gao VEXTRINS(vextrins_h, 16, H, 0x7) 3541e93dd431SSong Gao VEXTRINS(vextrins_w, 32, W, 0x3) 3542e93dd431SSong Gao VEXTRINS(vextrins_d, 64, D, 0x1) 3543