/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * QEMU LoongArch vector helper functions.
 *
 * Copyright (c) 2022-2023 Loongson Technology Corporation Limited
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "internals.h"
#include "tcg/tcg.h"
#include "vec.h"
#include "tcg/tcg-gvec-desc.h"

#define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    typedef __typeof(Vd->E1(0)) TD; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i)); \
    } \
}

DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD)
DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD)
DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD)

void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
                              int128_makes64(Vk->D(2 * i)));
    }
}

DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB)
DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB)
DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB)

void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
                              int128_makes64(Vk->D(2 * i)));
    }
}

DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD)
DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD)
DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD)

void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
                              int128_make64(Vk->UD(2 * i)));
    }
}

DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB)
DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB)
DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB)

void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
                              int128_make64(Vk->UD(2 * i)));
    }
}

#define DO_EVEN(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    typedef __typeof(Vd->E1(0)) TD; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
    } \
}

#define DO_ODD(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    typedef __typeof(Vd->E1(0)) TD; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i + 1)); \
    } \
}

void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)),
                              int128_makes64(Vk->D(2 * i)));
    }
}

DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD)
DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD)
DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD)

void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
                              int128_makes64(Vk->D(2 * i + 1)));
    }
}

DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD)
DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD)
DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD)

void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)),
                              int128_makes64(Vk->D(2 * i)));
    }
}

DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB)
DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB)
DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB)

void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
                              int128_makes64(Vk->D(2 * i + 1)));
    }
}

DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB)
DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB)
DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB)

void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
                              int128_make64(Vk->UD(2 * i)));
    }
}

DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD)
DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD)
DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD)

void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
                              int128_make64(Vk->UD(2 * i + 1)));
    }
}

DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD)
DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD)
DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD)

void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)),
                              int128_make64(Vk->UD(2 * i)));
    }
}

DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB)
DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB)
DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB)

void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
                              int128_make64(Vk->UD(2 * i + 1)));
    }
}

DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB)
DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB)
DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB)

#define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    typedef __typeof(Vd->ES1(0)) TDS; \
    typedef __typeof(Vd->EU1(0)) TDU; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i), (TDS)Vk->ES2(2 * i)); \
    } \
}

#define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    typedef __typeof(Vd->ES1(0)) TDS; \
    typedef __typeof(Vd->EU1(0)) TDU; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i + 1), (TDS)Vk->ES2(2 * i + 1)); \
    } \
}

void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
                              int128_makes64(Vk->D(2 * i)));
    }
}

DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD)
DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD)
DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD)

void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
                              int128_makes64(Vk->D(2 * i + 1)));
    }
}

DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD)
DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD)
DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD)

#define DO_3OP(NAME, BIT, E, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \
    } \
}

DO_3OP(vavg_b, 8, B, DO_VAVG)
DO_3OP(vavg_h, 16, H, DO_VAVG)
DO_3OP(vavg_w, 32, W, DO_VAVG)
DO_3OP(vavg_d, 64, D, DO_VAVG)
DO_3OP(vavgr_b, 8, B, DO_VAVGR)
DO_3OP(vavgr_h, 16, H, DO_VAVGR)
DO_3OP(vavgr_w, 32, W, DO_VAVGR)
DO_3OP(vavgr_d, 64, D, DO_VAVGR)
DO_3OP(vavg_bu, 8, UB, DO_VAVG)
DO_3OP(vavg_hu, 16, UH, DO_VAVG)
DO_3OP(vavg_wu, 32, UW, DO_VAVG)
DO_3OP(vavg_du, 64, UD, DO_VAVG)
DO_3OP(vavgr_bu, 8, UB, DO_VAVGR)
DO_3OP(vavgr_hu, 16, UH, DO_VAVGR)
DO_3OP(vavgr_wu, 32, UW, DO_VAVGR)
DO_3OP(vavgr_du, 64, UD, DO_VAVGR)

DO_3OP(vabsd_b, 8, B, DO_VABSD)
DO_3OP(vabsd_h, 16, H, DO_VABSD)
DO_3OP(vabsd_w, 32, W, DO_VABSD)
DO_3OP(vabsd_d, 64, D, DO_VABSD)
DO_3OP(vabsd_bu, 8, UB, DO_VABSD)
DO_3OP(vabsd_hu, 16, UH, DO_VABSD)
DO_3OP(vabsd_wu, 32, UW, DO_VABSD)
DO_3OP(vabsd_du, 64, UD, DO_VABSD)

#define DO_VADDA(NAME, BIT, E) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \
    } \
}

DO_VADDA(vadda_b, 8, B)
DO_VADDA(vadda_h, 16, H)
DO_VADDA(vadda_w, 32, W)
DO_VADDA(vadda_d, 64, D)

#define VMINMAXI(NAME, BIT, E, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    typedef __typeof(Vd->E(0)) TD; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \
    } \
}

VMINMAXI(vmini_b, 8, B, DO_MIN)
VMINMAXI(vmini_h, 16, H, DO_MIN)
VMINMAXI(vmini_w, 32, W, DO_MIN)
VMINMAXI(vmini_d, 64, D, DO_MIN)
VMINMAXI(vmaxi_b, 8, B, DO_MAX)
VMINMAXI(vmaxi_h, 16, H, DO_MAX)
VMINMAXI(vmaxi_w, 32, W, DO_MAX)
VMINMAXI(vmaxi_d, 64, D, DO_MAX)
VMINMAXI(vmini_bu, 8, UB, DO_MIN)
VMINMAXI(vmini_hu, 16, UH, DO_MIN)
VMINMAXI(vmini_wu, 32, UW, DO_MIN)
VMINMAXI(vmini_du, 64, UD, DO_MIN)
VMINMAXI(vmaxi_bu, 8, UB, DO_MAX)
VMINMAXI(vmaxi_hu, 16, UH, DO_MAX)
VMINMAXI(vmaxi_wu, 32, UW, DO_MAX)
VMINMAXI(vmaxi_du, 64, UD, DO_MAX)

#define DO_VMUH(NAME, BIT, E1, E2, DO_OP) \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
{ \
    int i; \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg *Vk = (VReg *)vk; \
    typedef __typeof(Vd->E1(0)) T; \
    int oprsz = simd_oprsz(desc); \
\
    for (i = 0; i < oprsz / (BIT / 8); i++) { \
        Vd->E2(i) = ((T)Vj->E2(i)) * ((T)Vk->E2(i)) >> BIT; \
    } \
}

void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    uint64_t l, h;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 8; i++) {
        muls64(&l, &h, Vj->D(i), Vk->D(i));
        Vd->D(i) = h;
    }
}

DO_VMUH(vmuh_b, 8, H, B, DO_MUH)
DO_VMUH(vmuh_h, 16, W, H, DO_MUH)
DO_VMUH(vmuh_w, 32, D, W, DO_MUH)

void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i;
    uint64_t l, h;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 8; i++) {
        mulu64(&l, &h, Vj->D(i), Vk->D(i));
        Vd->D(i) = h;
    }
}

DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH)
DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH)
DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH)

DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL)
DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL)
DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL)

DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL)
DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL)
DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL)

DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL)
DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL)
DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL)
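/*
 * Note on the widening multiplies: vmulwev_* use the even-numbered source
 * elements and vmulwod_* the odd-numbered ones, widening each product into
 * the next larger element type via DO_EVEN/DO_ODD above.  As an illustrative
 * sketch (assuming DO_MUL(a, b) is the plain product defined in vec.h),
 * DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL) expands roughly to:
 *
 *     for (i = 0; i < oprsz / 2; i++) {
 *         Vd->H(i) = (int16_t)Vj->B(2 * i + 1) * (int16_t)Vk->B(2 * i + 1);
 *     }
 */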
500 501 DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL) 502 DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL) 503 DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL) 504 505 DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) 506 DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) 507 DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) 508 509 DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) 510 DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) 511 DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) 512 513 #define VMADDSUB(NAME, BIT, E, DO_OP) \ 514 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 515 { \ 516 int i; \ 517 VReg *Vd = (VReg *)vd; \ 518 VReg *Vj = (VReg *)vj; \ 519 VReg *Vk = (VReg *)vk; \ 520 int oprsz = simd_oprsz(desc); \ 521 \ 522 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 523 Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i)); \ 524 } \ 525 } 526 527 VMADDSUB(vmadd_b, 8, B, DO_MADD) 528 VMADDSUB(vmadd_h, 16, H, DO_MADD) 529 VMADDSUB(vmadd_w, 32, W, DO_MADD) 530 VMADDSUB(vmadd_d, 64, D, DO_MADD) 531 VMADDSUB(vmsub_b, 8, B, DO_MSUB) 532 VMADDSUB(vmsub_h, 16, H, DO_MSUB) 533 VMADDSUB(vmsub_w, 32, W, DO_MSUB) 534 VMADDSUB(vmsub_d, 64, D, DO_MSUB) 535 536 #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \ 537 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 538 { \ 539 int i; \ 540 VReg *Vd = (VReg *)vd; \ 541 VReg *Vj = (VReg *)vj; \ 542 VReg *Vk = (VReg *)vk; \ 543 typedef __typeof(Vd->E1(0)) TD; \ 544 int oprsz = simd_oprsz(desc); \ 545 \ 546 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 547 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \ 548 } \ 549 } 550 551 VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL) 552 VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL) 553 VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL) 554 VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL) 555 VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL) 556 VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL) 557 558 #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \ 559 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 560 { \ 561 int i; \ 562 VReg *Vd = (VReg *)vd; \ 563 VReg *Vj = (VReg *)vj; \ 564 VReg *Vk = (VReg *)vk; \ 565 typedef __typeof(Vd->E1(0)) TD; \ 566 int oprsz = simd_oprsz(desc); \ 567 \ 568 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 569 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \ 570 (TD)Vk->E2(2 * i + 1)); \ 571 } \ 572 } 573 574 VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL) 575 VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL) 576 VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL) 577 VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL) 578 VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL) 579 VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL) 580 581 #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 582 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 583 { \ 584 int i; \ 585 VReg *Vd = (VReg *)vd; \ 586 VReg *Vj = (VReg *)vj; \ 587 VReg *Vk = (VReg *)vk; \ 588 typedef __typeof(Vd->ES1(0)) TS1; \ 589 typedef __typeof(Vd->EU1(0)) TU1; \ 590 int oprsz = simd_oprsz(desc); \ 591 \ 592 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 593 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \ 594 (TS1)Vk->ES2(2 * i)); \ 595 } \ 596 } 597 598 VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) 599 VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) 600 VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) 601 602 #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 603 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 604 { \ 605 int i; \ 606 VReg *Vd = (VReg *)vd; \ 607 VReg *Vj = (VReg *)vj; \ 608 VReg *Vk = 
(VReg *)vk; \ 609 typedef __typeof(Vd->ES1(0)) TS1; \ 610 typedef __typeof(Vd->EU1(0)) TU1; \ 611 int oprsz = simd_oprsz(desc); \ 612 \ 613 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 614 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \ 615 (TS1)Vk->ES2(2 * i + 1)); \ 616 } \ 617 } 618 619 VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) 620 VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) 621 VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) 622 623 #define VDIV(NAME, BIT, E, DO_OP) \ 624 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 625 { \ 626 int i; \ 627 VReg *Vd = (VReg *)vd; \ 628 VReg *Vj = (VReg *)vj; \ 629 VReg *Vk = (VReg *)vk; \ 630 int oprsz = simd_oprsz(desc); \ 631 \ 632 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 633 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ 634 } \ 635 } 636 637 VDIV(vdiv_b, 8, B, DO_DIV) 638 VDIV(vdiv_h, 16, H, DO_DIV) 639 VDIV(vdiv_w, 32, W, DO_DIV) 640 VDIV(vdiv_d, 64, D, DO_DIV) 641 VDIV(vdiv_bu, 8, UB, DO_DIVU) 642 VDIV(vdiv_hu, 16, UH, DO_DIVU) 643 VDIV(vdiv_wu, 32, UW, DO_DIVU) 644 VDIV(vdiv_du, 64, UD, DO_DIVU) 645 VDIV(vmod_b, 8, B, DO_REM) 646 VDIV(vmod_h, 16, H, DO_REM) 647 VDIV(vmod_w, 32, W, DO_REM) 648 VDIV(vmod_d, 64, D, DO_REM) 649 VDIV(vmod_bu, 8, UB, DO_REMU) 650 VDIV(vmod_hu, 16, UH, DO_REMU) 651 VDIV(vmod_wu, 32, UW, DO_REMU) 652 VDIV(vmod_du, 64, UD, DO_REMU) 653 654 #define VSAT_S(NAME, BIT, E) \ 655 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ 656 { \ 657 int i; \ 658 VReg *Vd = (VReg *)vd; \ 659 VReg *Vj = (VReg *)vj; \ 660 typedef __typeof(Vd->E(0)) TD; \ 661 int oprsz = simd_oprsz(desc); \ 662 \ 663 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 664 Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : \ 665 Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i); \ 666 } \ 667 } 668 669 VSAT_S(vsat_b, 8, B) 670 VSAT_S(vsat_h, 16, H) 671 VSAT_S(vsat_w, 32, W) 672 VSAT_S(vsat_d, 64, D) 673 674 #define VSAT_U(NAME, BIT, E) \ 675 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ 676 { \ 677 int i; \ 678 VReg *Vd = (VReg *)vd; \ 679 VReg *Vj = (VReg *)vj; \ 680 typedef __typeof(Vd->E(0)) TD; \ 681 int oprsz = simd_oprsz(desc); \ 682 \ 683 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 684 Vd->E(i) = Vj->E(i) > (TD)max ? 
(TD)max : Vj->E(i); \ 685 } \ 686 } 687 688 VSAT_U(vsat_bu, 8, UB) 689 VSAT_U(vsat_hu, 16, UH) 690 VSAT_U(vsat_wu, 32, UW) 691 VSAT_U(vsat_du, 64, UD) 692 693 #define VEXTH(NAME, BIT, E1, E2) \ 694 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 695 { \ 696 int i, j, ofs; \ 697 VReg *Vd = (VReg *)vd; \ 698 VReg *Vj = (VReg *)vj; \ 699 int oprsz = simd_oprsz(desc); \ 700 \ 701 ofs = LSX_LEN / BIT; \ 702 for (i = 0; i < oprsz / 16; i++) { \ 703 for (j = 0; j < ofs; j++) { \ 704 Vd->E1(j + i * ofs) = Vj->E2(j + ofs + ofs * 2 * i); \ 705 } \ 706 } \ 707 } 708 709 void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc) 710 { 711 int i; 712 VReg *Vd = (VReg *)vd; 713 VReg *Vj = (VReg *)vj; 714 int oprsz = simd_oprsz(desc); 715 716 for (i = 0; i < oprsz / 16; i++) { 717 Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1)); 718 } 719 } 720 721 void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc) 722 { 723 int i; 724 VReg *Vd = (VReg *)vd; 725 VReg *Vj = (VReg *)vj; 726 int oprsz = simd_oprsz(desc); 727 728 for (i = 0; i < oprsz / 16; i++) { 729 Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1)); 730 } 731 } 732 733 VEXTH(vexth_h_b, 16, H, B) 734 VEXTH(vexth_w_h, 32, W, H) 735 VEXTH(vexth_d_w, 64, D, W) 736 VEXTH(vexth_hu_bu, 16, UH, UB) 737 VEXTH(vexth_wu_hu, 32, UW, UH) 738 VEXTH(vexth_du_wu, 64, UD, UW) 739 740 #define VEXT2XV(NAME, BIT, E1, E2) \ 741 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 742 { \ 743 int i; \ 744 VReg temp = {}; \ 745 VReg *Vd = (VReg *)vd; \ 746 VReg *Vj = (VReg *)vj; \ 747 int oprsz = simd_oprsz(desc); \ 748 \ 749 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 750 temp.E1(i) = Vj->E2(i); \ 751 } \ 752 *Vd = temp; \ 753 } 754 755 VEXT2XV(vext2xv_h_b, 16, H, B) 756 VEXT2XV(vext2xv_w_b, 32, W, B) 757 VEXT2XV(vext2xv_d_b, 64, D, B) 758 VEXT2XV(vext2xv_w_h, 32, W, H) 759 VEXT2XV(vext2xv_d_h, 64, D, H) 760 VEXT2XV(vext2xv_d_w, 64, D, W) 761 VEXT2XV(vext2xv_hu_bu, 16, UH, UB) 762 VEXT2XV(vext2xv_wu_bu, 32, UW, UB) 763 VEXT2XV(vext2xv_du_bu, 64, UD, UB) 764 VEXT2XV(vext2xv_wu_hu, 32, UW, UH) 765 VEXT2XV(vext2xv_du_hu, 64, UD, UH) 766 VEXT2XV(vext2xv_du_wu, 64, UD, UW) 767 768 DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV) 769 DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV) 770 DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV) 771 DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV) 772 773 static uint64_t do_vmskltz_b(int64_t val) 774 { 775 uint64_t m = 0x8080808080808080ULL; 776 uint64_t c = val & m; 777 c |= c << 7; 778 c |= c << 14; 779 c |= c << 28; 780 return c >> 56; 781 } 782 783 void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc) 784 { 785 int i; 786 uint16_t temp = 0; 787 VReg *Vd = (VReg *)vd; 788 VReg *Vj = (VReg *)vj; 789 int oprsz = simd_oprsz(desc); 790 791 for (i = 0; i < oprsz / 16; i++) { 792 temp = 0; 793 temp = do_vmskltz_b(Vj->D(2 * i)); 794 temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); 795 Vd->D(2 * i) = temp; 796 Vd->D(2 * i + 1) = 0; 797 } 798 } 799 800 static uint64_t do_vmskltz_h(int64_t val) 801 { 802 uint64_t m = 0x8000800080008000ULL; 803 uint64_t c = val & m; 804 c |= c << 15; 805 c |= c << 30; 806 return c >> 60; 807 } 808 809 void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc) 810 { 811 int i; 812 uint16_t temp = 0; 813 VReg *Vd = (VReg *)vd; 814 VReg *Vj = (VReg *)vj; 815 int oprsz = simd_oprsz(desc); 816 817 for (i = 0; i < oprsz / 16; i++) { 818 temp = 0; 819 temp = do_vmskltz_h(Vj->D(2 * i)); 820 temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4); 821 Vd->D(2 * i) = temp; 822 Vd->D(2 * i + 1) = 0; 823 } 824 } 825 826 static uint64_t do_vmskltz_w(int64_t val) 827 { 828 
uint64_t m = 0x8000000080000000ULL; 829 uint64_t c = val & m; 830 c |= c << 31; 831 return c >> 62; 832 } 833 834 void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc) 835 { 836 int i; 837 uint16_t temp = 0; 838 VReg *Vd = (VReg *)vd; 839 VReg *Vj = (VReg *)vj; 840 int oprsz = simd_oprsz(desc); 841 842 for (i = 0; i < oprsz / 16; i++) { 843 temp = 0; 844 temp = do_vmskltz_w(Vj->D(2 * i)); 845 temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2); 846 Vd->D(2 * i) = temp; 847 Vd->D(2 * i + 1) = 0; 848 } 849 } 850 851 static uint64_t do_vmskltz_d(int64_t val) 852 { 853 return (uint64_t)val >> 63; 854 } 855 void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc) 856 { 857 int i; 858 uint16_t temp = 0; 859 VReg *Vd = (VReg *)vd; 860 VReg *Vj = (VReg *)vj; 861 int oprsz = simd_oprsz(desc); 862 863 for (i = 0; i < oprsz / 16; i++) { 864 temp = 0; 865 temp = do_vmskltz_d(Vj->D(2 * i)); 866 temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1); 867 Vd->D(2 * i) = temp; 868 Vd->D(2 * i + 1) = 0; 869 } 870 } 871 872 void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc) 873 { 874 int i; 875 uint16_t temp = 0; 876 VReg *Vd = (VReg *)vd; 877 VReg *Vj = (VReg *)vj; 878 int oprsz = simd_oprsz(desc); 879 880 for (i = 0; i < oprsz / 16; i++) { 881 temp = 0; 882 temp = do_vmskltz_b(Vj->D(2 * i)); 883 temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); 884 Vd->D(2 * i) = (uint16_t)(~temp); 885 Vd->D(2 * i + 1) = 0; 886 } 887 } 888 889 static uint64_t do_vmskez_b(uint64_t a) 890 { 891 uint64_t m = 0x7f7f7f7f7f7f7f7fULL; 892 uint64_t c = ~(((a & m) + m) | a | m); 893 c |= c << 7; 894 c |= c << 14; 895 c |= c << 28; 896 return c >> 56; 897 } 898 899 void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc) 900 { 901 int i; 902 uint16_t temp = 0; 903 VReg *Vd = (VReg *)vd; 904 VReg *Vj = (VReg *)vj; 905 int oprsz = simd_oprsz(desc); 906 907 for (i = 0; i < oprsz / 16; i++) { 908 temp = 0; 909 temp = do_vmskez_b(Vj->D(2 * i)); 910 temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8); 911 Vd->D(2 * i) = (uint16_t)(~temp); 912 Vd->D(2 * i + 1) = 0; 913 } 914 } 915 916 void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) 917 { 918 int i; 919 VReg *Vd = (VReg *)vd; 920 VReg *Vj = (VReg *)vj; 921 922 for (i = 0; i < simd_oprsz(desc); i++) { 923 Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm); 924 } 925 } 926 927 #define VSLLWIL(NAME, BIT, E1, E2) \ 928 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 929 { \ 930 int i, j, ofs; \ 931 VReg temp = {}; \ 932 VReg *Vd = (VReg *)vd; \ 933 VReg *Vj = (VReg *)vj; \ 934 int oprsz = simd_oprsz(desc); \ 935 typedef __typeof(temp.E1(0)) TD; \ 936 \ 937 ofs = LSX_LEN / BIT; \ 938 for (i = 0; i < oprsz / 16; i++) { \ 939 for (j = 0; j < ofs; j++) { \ 940 temp.E1(j + ofs * i) = (TD)Vj->E2(j + ofs * 2 * i) << (imm % BIT); \ 941 } \ 942 } \ 943 *Vd = temp; \ 944 } 945 946 947 void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc) 948 { 949 int i; 950 VReg *Vd = (VReg *)vd; 951 VReg *Vj = (VReg *)vj; 952 int oprsz = simd_oprsz(desc); 953 954 for (i = 0; i < oprsz / 16; i++) { 955 Vd->Q(i) = int128_makes64(Vj->D(2 * i)); 956 } 957 } 958 959 void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc) 960 { 961 int i; 962 VReg *Vd = (VReg *)vd; 963 VReg *Vj = (VReg *)vj; 964 int oprsz = simd_oprsz(desc); 965 966 for (i = 0; i < oprsz / 16; i++) { 967 Vd->Q(i) = int128_make64(Vj->UD(2 * i)); 968 } 969 } 970 971 VSLLWIL(vsllwil_h_b, 16, H, B) 972 VSLLWIL(vsllwil_w_h, 32, W, H) 973 VSLLWIL(vsllwil_d_w, 64, D, W) 974 VSLLWIL(vsllwil_hu_bu, 16, UH, UB) 975 VSLLWIL(vsllwil_wu_hu, 32, 
UW, UH) 976 VSLLWIL(vsllwil_du_wu, 64, UD, UW) 977 978 #define do_vsrlr(E, T) \ 979 static T do_vsrlr_ ##E(T s1, int sh) \ 980 { \ 981 if (sh == 0) { \ 982 return s1; \ 983 } else { \ 984 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \ 985 } \ 986 } 987 988 do_vsrlr(B, uint8_t) 989 do_vsrlr(H, uint16_t) 990 do_vsrlr(W, uint32_t) 991 do_vsrlr(D, uint64_t) 992 993 #define VSRLR(NAME, BIT, T, E) \ 994 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 995 { \ 996 int i; \ 997 VReg *Vd = (VReg *)vd; \ 998 VReg *Vj = (VReg *)vj; \ 999 VReg *Vk = (VReg *)vk; \ 1000 int oprsz = simd_oprsz(desc); \ 1001 \ 1002 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1003 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ 1004 } \ 1005 } 1006 1007 VSRLR(vsrlr_b, 8, uint8_t, B) 1008 VSRLR(vsrlr_h, 16, uint16_t, H) 1009 VSRLR(vsrlr_w, 32, uint32_t, W) 1010 VSRLR(vsrlr_d, 64, uint64_t, D) 1011 1012 #define VSRLRI(NAME, BIT, E) \ 1013 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1014 { \ 1015 int i; \ 1016 VReg *Vd = (VReg *)vd; \ 1017 VReg *Vj = (VReg *)vj; \ 1018 int oprsz = simd_oprsz(desc); \ 1019 \ 1020 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1021 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \ 1022 } \ 1023 } 1024 1025 VSRLRI(vsrlri_b, 8, B) 1026 VSRLRI(vsrlri_h, 16, H) 1027 VSRLRI(vsrlri_w, 32, W) 1028 VSRLRI(vsrlri_d, 64, D) 1029 1030 #define do_vsrar(E, T) \ 1031 static T do_vsrar_ ##E(T s1, int sh) \ 1032 { \ 1033 if (sh == 0) { \ 1034 return s1; \ 1035 } else { \ 1036 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \ 1037 } \ 1038 } 1039 1040 do_vsrar(B, int8_t) 1041 do_vsrar(H, int16_t) 1042 do_vsrar(W, int32_t) 1043 do_vsrar(D, int64_t) 1044 1045 #define VSRAR(NAME, BIT, T, E) \ 1046 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1047 { \ 1048 int i; \ 1049 VReg *Vd = (VReg *)vd; \ 1050 VReg *Vj = (VReg *)vj; \ 1051 VReg *Vk = (VReg *)vk; \ 1052 int oprsz = simd_oprsz(desc); \ 1053 \ 1054 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1055 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ 1056 } \ 1057 } 1058 1059 VSRAR(vsrar_b, 8, uint8_t, B) 1060 VSRAR(vsrar_h, 16, uint16_t, H) 1061 VSRAR(vsrar_w, 32, uint32_t, W) 1062 VSRAR(vsrar_d, 64, uint64_t, D) 1063 1064 #define VSRARI(NAME, BIT, E) \ 1065 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1066 { \ 1067 int i; \ 1068 VReg *Vd = (VReg *)vd; \ 1069 VReg *Vj = (VReg *)vj; \ 1070 int oprsz = simd_oprsz(desc); \ 1071 \ 1072 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1073 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \ 1074 } \ 1075 } 1076 1077 VSRARI(vsrari_b, 8, B) 1078 VSRARI(vsrari_h, 16, H) 1079 VSRARI(vsrari_w, 32, W) 1080 VSRARI(vsrari_d, 64, D) 1081 1082 #define VSRLN(NAME, BIT, E1, E2) \ 1083 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1084 { \ 1085 int i, j, ofs; \ 1086 VReg *Vd = (VReg *)vd; \ 1087 VReg *Vj = (VReg *)vj; \ 1088 VReg *Vk = (VReg *)vk; \ 1089 int oprsz = simd_oprsz(desc); \ 1090 \ 1091 ofs = LSX_LEN / BIT; \ 1092 for (i = 0; i < oprsz / 16; i++) { \ 1093 for (j = 0; j < ofs; j++) { \ 1094 Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ 1095 Vk->E2(j + ofs * i) % BIT); \ 1096 } \ 1097 Vd->D(2 * i + 1) = 0; \ 1098 } \ 1099 } 1100 1101 VSRLN(vsrln_b_h, 16, B, UH) 1102 VSRLN(vsrln_h_w, 32, H, UW) 1103 VSRLN(vsrln_w_d, 64, W, UD) 1104 1105 #define VSRAN(NAME, BIT, E1, E2, E3) \ 1106 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1107 { \ 1108 int i, j, ofs; \ 1109 VReg *Vd = (VReg *)vd; \ 1110 VReg *Vj 
= (VReg *)vj; \ 1111 VReg *Vk = (VReg *)vk; \ 1112 int oprsz = simd_oprsz(desc); \ 1113 \ 1114 ofs = LSX_LEN / BIT; \ 1115 for (i = 0; i < oprsz / 16; i++) { \ 1116 for (j = 0; j < ofs; j++) { \ 1117 Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ 1118 Vk->E3(j + ofs * i) % BIT); \ 1119 } \ 1120 Vd->D(2 * i + 1) = 0; \ 1121 } \ 1122 } 1123 1124 VSRAN(vsran_b_h, 16, B, H, UH) 1125 VSRAN(vsran_h_w, 32, H, W, UW) 1126 VSRAN(vsran_w_d, 64, W, D, UD) 1127 1128 #define VSRLNI(NAME, BIT, E1, E2) \ 1129 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1130 { \ 1131 int i, j, ofs; \ 1132 VReg temp = {}; \ 1133 VReg *Vd = (VReg *)vd; \ 1134 VReg *Vj = (VReg *)vj; \ 1135 int oprsz = simd_oprsz(desc); \ 1136 \ 1137 ofs = LSX_LEN / BIT; \ 1138 for (i = 0; i < oprsz / 16; i++) { \ 1139 for (j = 0; j < ofs; j++) { \ 1140 temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \ 1141 temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \ 1142 imm); \ 1143 } \ 1144 } \ 1145 *Vd = temp; \ 1146 } 1147 1148 void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1149 { 1150 int i; 1151 VReg temp = {}; 1152 VReg *Vd = (VReg *)vd; 1153 VReg *Vj = (VReg *)vj; 1154 1155 for (i = 0; i < 2; i++) { 1156 temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128)); 1157 temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128)); 1158 } 1159 *Vd = temp; 1160 } 1161 1162 VSRLNI(vsrlni_b_h, 16, B, UH) 1163 VSRLNI(vsrlni_h_w, 32, H, UW) 1164 VSRLNI(vsrlni_w_d, 64, W, UD) 1165 1166 #define VSRANI(NAME, BIT, E1, E2) \ 1167 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1168 { \ 1169 int i, j, ofs; \ 1170 VReg temp = {}; \ 1171 VReg *Vd = (VReg *)vd; \ 1172 VReg *Vj = (VReg *)vj; \ 1173 int oprsz = simd_oprsz(desc); \ 1174 \ 1175 ofs = LSX_LEN / BIT; \ 1176 for (i = 0; i < oprsz / 16; i++) { \ 1177 for (j = 0; j < ofs; j++) { \ 1178 temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \ 1179 temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \ 1180 imm); \ 1181 } \ 1182 } \ 1183 *Vd = temp; \ 1184 } 1185 1186 void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1187 { 1188 int i; 1189 VReg temp = {}; 1190 VReg *Vd = (VReg *)vd; 1191 VReg *Vj = (VReg *)vj; 1192 1193 for (i = 0; i < 2; i++) { 1194 temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128)); 1195 temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128)); 1196 } 1197 *Vd = temp; 1198 } 1199 1200 VSRANI(vsrani_b_h, 16, B, H) 1201 VSRANI(vsrani_h_w, 32, H, W) 1202 VSRANI(vsrani_w_d, 64, W, D) 1203 1204 #define VSRLRN(NAME, BIT, E1, E2, E3) \ 1205 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1206 { \ 1207 int i, j, ofs; \ 1208 VReg *Vd = (VReg *)vd; \ 1209 VReg *Vj = (VReg *)vj; \ 1210 VReg *Vk = (VReg *)vk; \ 1211 int oprsz = simd_oprsz(desc); \ 1212 \ 1213 ofs = LSX_LEN / BIT; \ 1214 for (i = 0; i < oprsz / 16; i++) { \ 1215 for (j = 0; j < ofs; j++) { \ 1216 Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i), \ 1217 Vk->E3(j + ofs * i) % BIT); \ 1218 } \ 1219 Vd->D(2 * i + 1) = 0; \ 1220 } \ 1221 } 1222 1223 VSRLRN(vsrlrn_b_h, 16, B, H, UH) 1224 VSRLRN(vsrlrn_h_w, 32, H, W, UW) 1225 VSRLRN(vsrlrn_w_d, 64, W, D, UD) 1226 1227 #define VSRARN(NAME, BIT, E1, E2, E3) \ 1228 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1229 { \ 1230 int i, j, ofs; \ 1231 VReg *Vd = (VReg *)vd; \ 1232 VReg *Vj = (VReg *)vj; \ 1233 VReg *Vk = (VReg *)vk; \ 1234 int oprsz = 
simd_oprsz(desc); \ 1235 \ 1236 ofs = LSX_LEN / BIT; \ 1237 for (i = 0; i < oprsz / 16; i++) { \ 1238 for (j = 0; j < ofs; j++) { \ 1239 Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), \ 1240 Vk->E3(j + ofs * i) % BIT); \ 1241 } \ 1242 Vd->D(2 * i + 1) = 0; \ 1243 } \ 1244 } 1245 1246 VSRARN(vsrarn_b_h, 16, B, H, UH) 1247 VSRARN(vsrarn_h_w, 32, H, W, UW) 1248 VSRARN(vsrarn_w_d, 64, W, D, UD) 1249 1250 #define VSRLRNI(NAME, BIT, E1, E2) \ 1251 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1252 { \ 1253 int i, j, ofs; \ 1254 VReg temp = {}; \ 1255 VReg *Vd = (VReg *)vd; \ 1256 VReg *Vj = (VReg *)vj; \ 1257 int oprsz = simd_oprsz(desc); \ 1258 \ 1259 ofs = LSX_LEN / BIT; \ 1260 for (i = 0; i < oprsz / 16; i++) { \ 1261 for (j = 0; j < ofs; j++) { \ 1262 temp.E1(j + ofs * 2 * i) = do_vsrlr_ ## E2(Vj->E2(j + ofs * i), imm); \ 1263 temp.E1(j + ofs * (2 * i + 1)) = do_vsrlr_ ## E2(Vd->E2(j + ofs * i), \ 1264 imm); \ 1265 } \ 1266 } \ 1267 *Vd = temp; \ 1268 } 1269 1270 void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1271 { 1272 int i; 1273 VReg temp = {}; 1274 VReg *Vd = (VReg *)vd; 1275 VReg *Vj = (VReg *)vj; 1276 Int128 r[4]; 1277 int oprsz = simd_oprsz(desc); 1278 1279 for (i = 0; i < oprsz / 16; i++) { 1280 if (imm == 0) { 1281 temp.D(2 * i) = int128_getlo(Vj->Q(i)); 1282 temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); 1283 } else { 1284 r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)), 1285 int128_one()); 1286 r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)), 1287 int128_one()); 1288 temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i), 1289 imm), r[2 * i])); 1290 temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i), 1291 imm), r[ 2 * i + 1])); 1292 } 1293 } 1294 *Vd = temp; 1295 } 1296 1297 VSRLRNI(vsrlrni_b_h, 16, B, H) 1298 VSRLRNI(vsrlrni_h_w, 32, H, W) 1299 VSRLRNI(vsrlrni_w_d, 64, W, D) 1300 1301 #define VSRARNI(NAME, BIT, E1, E2) \ 1302 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1303 { \ 1304 int i, j, ofs; \ 1305 VReg temp = {}; \ 1306 VReg *Vd = (VReg *)vd; \ 1307 VReg *Vj = (VReg *)vj; \ 1308 int oprsz = simd_oprsz(desc); \ 1309 \ 1310 ofs = LSX_LEN / BIT; \ 1311 for (i = 0; i < oprsz / 16; i++) { \ 1312 for (j = 0; j < ofs; j++) { \ 1313 temp.E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), imm); \ 1314 temp.E1(j + ofs * (2 * i + 1)) = do_vsrar_ ## E2(Vd->E2(j + ofs * i), \ 1315 imm); \ 1316 } \ 1317 } \ 1318 *Vd = temp; \ 1319 } 1320 1321 void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1322 { 1323 int i; 1324 VReg temp = {}; 1325 VReg *Vd = (VReg *)vd; 1326 VReg *Vj = (VReg *)vj; 1327 Int128 r[4]; 1328 int oprsz = simd_oprsz(desc); 1329 1330 for (i = 0; i < oprsz / 16; i++) { 1331 if (imm == 0) { 1332 temp.D(2 * i) = int128_getlo(Vj->Q(i)); 1333 temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); 1334 } else { 1335 r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)), 1336 int128_one()); 1337 r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)), 1338 int128_one()); 1339 temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i), 1340 imm), r[2 * i])); 1341 temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i), 1342 imm), r[2 * i + 1])); 1343 } 1344 } 1345 *Vd = temp; 1346 } 1347 1348 VSRARNI(vsrarni_b_h, 16, B, H) 1349 VSRARNI(vsrarni_h_w, 32, H, W) 1350 VSRARNI(vsrarni_w_d, 64, W, D) 1351 1352 #define SSRLNS(NAME, T1, T2, T3) \ 1353 static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \ 1354 
{ \ 1355 T1 shft_res; \ 1356 if (sa == 0) { \ 1357 shft_res = e2; \ 1358 } else { \ 1359 shft_res = (((T1)e2) >> sa); \ 1360 } \ 1361 T3 mask; \ 1362 mask = (1ull << sh) -1; \ 1363 if (shft_res > mask) { \ 1364 return mask; \ 1365 } else { \ 1366 return shft_res; \ 1367 } \ 1368 } 1369 1370 SSRLNS(B, uint16_t, int16_t, uint8_t) 1371 SSRLNS(H, uint32_t, int32_t, uint16_t) 1372 SSRLNS(W, uint64_t, int64_t, uint32_t) 1373 1374 #define VSSRLN(NAME, BIT, E1, E2, E3) \ 1375 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1376 { \ 1377 int i, j, ofs; \ 1378 VReg *Vd = (VReg *)vd; \ 1379 VReg *Vj = (VReg *)vj; \ 1380 VReg *Vk = (VReg *)vk; \ 1381 int oprsz = simd_oprsz(desc); \ 1382 \ 1383 ofs = LSX_LEN / BIT; \ 1384 for (i = 0; i < oprsz / 16; i++) { \ 1385 for (j = 0; j < ofs; j++) { \ 1386 Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \ 1387 Vk->E3(j + ofs * i) % BIT, \ 1388 BIT / 2 - 1); \ 1389 } \ 1390 Vd->D(2 * i + 1) = 0; \ 1391 } \ 1392 } 1393 1394 VSSRLN(vssrln_b_h, 16, B, H, UH) 1395 VSSRLN(vssrln_h_w, 32, H, W, UW) 1396 VSSRLN(vssrln_w_d, 64, W, D, UD) 1397 1398 #define SSRANS(E, T1, T2) \ 1399 static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \ 1400 { \ 1401 T1 shft_res; \ 1402 if (sa == 0) { \ 1403 shft_res = e2; \ 1404 } else { \ 1405 shft_res = e2 >> sa; \ 1406 } \ 1407 T2 mask; \ 1408 mask = (1ll << sh) - 1; \ 1409 if (shft_res > mask) { \ 1410 return mask; \ 1411 } else if (shft_res < -(mask + 1)) { \ 1412 return ~mask; \ 1413 } else { \ 1414 return shft_res; \ 1415 } \ 1416 } 1417 1418 SSRANS(B, int16_t, int8_t) 1419 SSRANS(H, int32_t, int16_t) 1420 SSRANS(W, int64_t, int32_t) 1421 1422 #define VSSRAN(NAME, BIT, E1, E2, E3) \ 1423 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1424 { \ 1425 int i, j, ofs; \ 1426 VReg *Vd = (VReg *)vd; \ 1427 VReg *Vj = (VReg *)vj; \ 1428 VReg *Vk = (VReg *)vk; \ 1429 int oprsz = simd_oprsz(desc); \ 1430 \ 1431 ofs = LSX_LEN / BIT; \ 1432 for (i = 0; i < oprsz / 16; i++) { \ 1433 for (j = 0; j < ofs; j++) { \ 1434 Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ 1435 Vk->E3(j + ofs * i) % BIT, \ 1436 BIT / 2 - 1); \ 1437 } \ 1438 Vd->D(2 * i + 1) = 0; \ 1439 } \ 1440 } 1441 1442 VSSRAN(vssran_b_h, 16, B, H, UH) 1443 VSSRAN(vssran_h_w, 32, H, W, UW) 1444 VSSRAN(vssran_w_d, 64, W, D, UD) 1445 1446 #define SSRLNU(E, T1, T2, T3) \ 1447 static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \ 1448 { \ 1449 T1 shft_res; \ 1450 if (sa == 0) { \ 1451 shft_res = e2; \ 1452 } else { \ 1453 shft_res = (((T1)e2) >> sa); \ 1454 } \ 1455 T2 mask; \ 1456 mask = (1ull << sh) - 1; \ 1457 if (shft_res > mask) { \ 1458 return mask; \ 1459 } else { \ 1460 return shft_res; \ 1461 } \ 1462 } 1463 1464 SSRLNU(B, uint16_t, uint8_t, int16_t) 1465 SSRLNU(H, uint32_t, uint16_t, int32_t) 1466 SSRLNU(W, uint64_t, uint32_t, int64_t) 1467 1468 #define VSSRLNU(NAME, BIT, E1, E2, E3) \ 1469 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1470 { \ 1471 int i, j, ofs; \ 1472 VReg *Vd = (VReg *)vd; \ 1473 VReg *Vj = (VReg *)vj; \ 1474 VReg *Vk = (VReg *)vk; \ 1475 int oprsz = simd_oprsz(desc); \ 1476 \ 1477 ofs = LSX_LEN / BIT; \ 1478 for (i = 0; i < oprsz / 16; i++) { \ 1479 for (j = 0; j < ofs; j++) { \ 1480 Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ 1481 Vk->E3(j + ofs * i) % BIT, \ 1482 BIT / 2); \ 1483 } \ 1484 Vd->D(2 * i + 1) = 0; \ 1485 } \ 1486 } 1487 1488 VSSRLNU(vssrln_bu_h, 16, B, H, UH) 1489 VSSRLNU(vssrln_hu_w, 32, H, W, UW) 1490 VSSRLNU(vssrln_wu_d, 64, W, D, UD) 
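/*
 * The SSRLNS/SSRANS/SSRLNU helpers above (and SSRANU below) implement the
 * "shift right, then saturate to the narrower type" step shared by the
 * vssr{l,a}n* instructions: the source element is shifted right by sa
 * (sa == 0 keeps the value) and then clamped against an sh-bit mask.
 * A small worked example, assuming the b_h variant where sh is
 * BIT / 2 - 1 = 7: do_ssrlns_B(0x1234, 4, 7) computes shft_res = 0x123 and
 * mask = 0x7f, and since 0x123 > 0x7f the saturated result is 0x7f.
 */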
1491 1492 #define SSRANU(E, T1, T2, T3) \ 1493 static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \ 1494 { \ 1495 T1 shft_res; \ 1496 if (sa == 0) { \ 1497 shft_res = e2; \ 1498 } else { \ 1499 shft_res = e2 >> sa; \ 1500 } \ 1501 if (e2 < 0) { \ 1502 shft_res = 0; \ 1503 } \ 1504 T2 mask; \ 1505 mask = (1ull << sh) - 1; \ 1506 if (shft_res > mask) { \ 1507 return mask; \ 1508 } else { \ 1509 return shft_res; \ 1510 } \ 1511 } 1512 1513 SSRANU(B, uint16_t, uint8_t, int16_t) 1514 SSRANU(H, uint32_t, uint16_t, int32_t) 1515 SSRANU(W, uint64_t, uint32_t, int64_t) 1516 1517 #define VSSRANU(NAME, BIT, E1, E2, E3) \ 1518 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1519 { \ 1520 int i, j, ofs; \ 1521 VReg *Vd = (VReg *)vd; \ 1522 VReg *Vj = (VReg *)vj; \ 1523 VReg *Vk = (VReg *)vk; \ 1524 int oprsz = simd_oprsz(desc); \ 1525 \ 1526 ofs = LSX_LEN / BIT; \ 1527 for (i = 0; i < oprsz / 16; i++) { \ 1528 for (j = 0; j < ofs; j++) { \ 1529 Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ 1530 Vk->E3(j + ofs * i) % BIT, \ 1531 BIT / 2); \ 1532 } \ 1533 Vd->D(2 * i + 1) = 0; \ 1534 } \ 1535 } 1536 1537 VSSRANU(vssran_bu_h, 16, B, H, UH) 1538 VSSRANU(vssran_hu_w, 32, H, W, UW) 1539 VSSRANU(vssran_wu_d, 64, W, D, UD) 1540 1541 #define VSSRLNI(NAME, BIT, E1, E2) \ 1542 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1543 { \ 1544 int i, j, ofs; \ 1545 VReg temp = {}; \ 1546 VReg *Vd = (VReg *)vd; \ 1547 VReg *Vj = (VReg *)vj; \ 1548 int oprsz = simd_oprsz(desc); \ 1549 \ 1550 ofs = LSX_LEN / BIT; \ 1551 for (i = 0; i < oprsz / 16; i++) { \ 1552 for (j = 0; j < ofs; j++) { \ 1553 temp.E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \ 1554 imm, BIT / 2 - 1); \ 1555 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlns_ ## E1(Vd->E2(j + ofs * i), \ 1556 imm, BIT / 2 - 1); \ 1557 } \ 1558 } \ 1559 *Vd = temp; \ 1560 } 1561 1562 static void do_vssrlni_q(VReg *Vd, VReg *Vj, 1563 uint64_t imm, int idx, Int128 mask) 1564 { 1565 Int128 shft_res1, shft_res2; 1566 1567 if (imm == 0) { 1568 shft_res1 = Vj->Q(idx); 1569 shft_res2 = Vd->Q(idx); 1570 } else { 1571 shft_res1 = int128_urshift(Vj->Q(idx), imm); 1572 shft_res2 = int128_urshift(Vd->Q(idx), imm); 1573 } 1574 1575 if (int128_ult(mask, shft_res1)) { 1576 Vd->D(idx * 2) = int128_getlo(mask); 1577 }else { 1578 Vd->D(idx * 2) = int128_getlo(shft_res1); 1579 } 1580 1581 if (int128_ult(mask, shft_res2)) { 1582 Vd->D(idx * 2 + 1) = int128_getlo(mask); 1583 }else { 1584 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 1585 } 1586 } 1587 1588 void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1589 { 1590 int i; 1591 Int128 mask; 1592 VReg *Vd = (VReg *)vd; 1593 VReg *Vj = (VReg *)vj; 1594 int oprsz = simd_oprsz(desc); 1595 1596 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 1597 1598 for (i = 0; i < oprsz / 16; i++) { 1599 do_vssrlni_q(Vd, Vj, imm, i, mask); 1600 } 1601 } 1602 1603 VSSRLNI(vssrlni_b_h, 16, B, H) 1604 VSSRLNI(vssrlni_h_w, 32, H, W) 1605 VSSRLNI(vssrlni_w_d, 64, W, D) 1606 1607 #define VSSRANI(NAME, BIT, E1, E2) \ 1608 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1609 { \ 1610 int i, j, ofs; \ 1611 VReg temp = {}; \ 1612 VReg *Vd = (VReg *)vd; \ 1613 VReg *Vj = (VReg *)vj; \ 1614 int oprsz = simd_oprsz(desc); \ 1615 \ 1616 ofs = LSX_LEN / BIT; \ 1617 for (i = 0; i < oprsz / 16; i++) { \ 1618 for (j = 0; j < ofs; j++) { \ 1619 temp.E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ 1620 imm, BIT / 2 - 1); \ 1621 temp.E1(j + 
ofs * (2 * i + 1)) = do_ssrans_ ## E1(Vd->E2(j + ofs * i), \ 1622 imm, BIT / 2 - 1); \ 1623 } \ 1624 } \ 1625 *Vd = temp; \ 1626 } 1627 1628 static void do_vssrani_d_q(VReg *Vd, VReg *Vj, 1629 uint64_t imm, int idx, Int128 mask, Int128 min) 1630 { 1631 Int128 shft_res1, shft_res2; 1632 1633 if (imm == 0) { 1634 shft_res1 = Vj->Q(idx); 1635 shft_res2 = Vd->Q(idx); 1636 } else { 1637 shft_res1 = int128_rshift(Vj->Q(idx), imm); 1638 shft_res2 = int128_rshift(Vd->Q(idx), imm); 1639 } 1640 1641 if (int128_gt(shft_res1, mask)) { 1642 Vd->D(idx * 2) = int128_getlo(mask); 1643 } else if (int128_lt(shft_res1, int128_neg(min))) { 1644 Vd->D(idx * 2) = int128_getlo(min); 1645 } else { 1646 Vd->D(idx * 2) = int128_getlo(shft_res1); 1647 } 1648 1649 if (int128_gt(shft_res2, mask)) { 1650 Vd->D(idx * 2 + 1) = int128_getlo(mask); 1651 } else if (int128_lt(shft_res2, int128_neg(min))) { 1652 Vd->D(idx * 2 + 1) = int128_getlo(min); 1653 } else { 1654 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 1655 } 1656 } 1657 1658 void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1659 { 1660 int i; 1661 Int128 mask, min; 1662 VReg *Vd = (VReg *)vd; 1663 VReg *Vj = (VReg *)vj; 1664 int oprsz = simd_oprsz(desc); 1665 1666 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 1667 min = int128_lshift(int128_one(), 63); 1668 1669 for (i = 0; i < oprsz / 16; i++) { 1670 do_vssrani_d_q(Vd, Vj, imm, i, mask, min); 1671 } 1672 } 1673 1674 1675 VSSRANI(vssrani_b_h, 16, B, H) 1676 VSSRANI(vssrani_h_w, 32, H, W) 1677 VSSRANI(vssrani_w_d, 64, W, D) 1678 1679 #define VSSRLNUI(NAME, BIT, E1, E2) \ 1680 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1681 { \ 1682 int i, j, ofs; \ 1683 VReg temp = {}; \ 1684 VReg *Vd = (VReg *)vd; \ 1685 VReg *Vj = (VReg *)vj; \ 1686 int oprsz = simd_oprsz(desc); \ 1687 \ 1688 ofs = LSX_LEN / BIT; \ 1689 for (i = 0; i < oprsz / 16; i++) { \ 1690 for (j = 0; j < ofs; j++) { \ 1691 temp.E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ 1692 imm, BIT / 2); \ 1693 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlnu_ ## E1(Vd->E2(j + ofs * i), \ 1694 imm, BIT / 2); \ 1695 } \ 1696 } \ 1697 *Vd = temp; \ 1698 } 1699 1700 void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1701 { 1702 int i; 1703 Int128 mask; 1704 VReg *Vd = (VReg *)vd; 1705 VReg *Vj = (VReg *)vj; 1706 int oprsz = simd_oprsz(desc); 1707 1708 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 1709 1710 for (i = 0; i < oprsz / 16; i++) { 1711 do_vssrlni_q(Vd, Vj, imm, i, mask); 1712 } 1713 } 1714 1715 VSSRLNUI(vssrlni_bu_h, 16, B, H) 1716 VSSRLNUI(vssrlni_hu_w, 32, H, W) 1717 VSSRLNUI(vssrlni_wu_d, 64, W, D) 1718 1719 #define VSSRANUI(NAME, BIT, E1, E2) \ 1720 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1721 { \ 1722 int i, j, ofs; \ 1723 VReg temp = {}; \ 1724 VReg *Vd = (VReg *)vd; \ 1725 VReg *Vj = (VReg *)vj; \ 1726 int oprsz = simd_oprsz(desc); \ 1727 \ 1728 ofs = LSX_LEN / BIT; \ 1729 for (i = 0; i < oprsz / 16; i++) { \ 1730 for (j = 0; j < ofs; j++) { \ 1731 temp.E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ 1732 imm, BIT / 2); \ 1733 temp.E1(j + ofs * (2 * i + 1)) = do_ssranu_ ## E1(Vd->E2(j + ofs * i), \ 1734 imm, BIT / 2); \ 1735 } \ 1736 } \ 1737 *Vd = temp; \ 1738 } 1739 1740 static void do_vssrani_du_q(VReg *Vd, VReg *Vj, 1741 uint64_t imm, int idx, Int128 mask) 1742 { 1743 Int128 shft_res1, shft_res2; 1744 1745 if (imm == 0) { 1746 shft_res1 = Vj->Q(idx); 1747 shft_res2 = Vd->Q(idx); 1748 } 
else { 1749 shft_res1 = int128_rshift(Vj->Q(idx), imm); 1750 shft_res2 = int128_rshift(Vd->Q(idx), imm); 1751 } 1752 1753 if (int128_lt(Vj->Q(idx), int128_zero())) { 1754 shft_res1 = int128_zero(); 1755 } 1756 1757 if (int128_lt(Vd->Q(idx), int128_zero())) { 1758 shft_res2 = int128_zero(); 1759 } 1760 if (int128_ult(mask, shft_res1)) { 1761 Vd->D(idx * 2) = int128_getlo(mask); 1762 }else { 1763 Vd->D(idx * 2) = int128_getlo(shft_res1); 1764 } 1765 1766 if (int128_ult(mask, shft_res2)) { 1767 Vd->D(idx * 2 + 1) = int128_getlo(mask); 1768 }else { 1769 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 1770 } 1771 1772 } 1773 1774 void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1775 { 1776 int i; 1777 Int128 mask; 1778 VReg *Vd = (VReg *)vd; 1779 VReg *Vj = (VReg *)vj; 1780 int oprsz = simd_oprsz(desc); 1781 1782 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 1783 1784 for (i = 0; i < oprsz / 16; i++) { 1785 do_vssrani_du_q(Vd, Vj, imm, i, mask); 1786 } 1787 } 1788 1789 VSSRANUI(vssrani_bu_h, 16, B, H) 1790 VSSRANUI(vssrani_hu_w, 32, H, W) 1791 VSSRANUI(vssrani_wu_d, 64, W, D) 1792 1793 #define SSRLRNS(E1, E2, T1, T2, T3) \ 1794 static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \ 1795 { \ 1796 T1 shft_res; \ 1797 \ 1798 shft_res = do_vsrlr_ ## E2(e2, sa); \ 1799 T1 mask; \ 1800 mask = (1ull << sh) - 1; \ 1801 if (shft_res > mask) { \ 1802 return mask; \ 1803 } else { \ 1804 return shft_res; \ 1805 } \ 1806 } 1807 1808 SSRLRNS(B, H, uint16_t, int16_t, uint8_t) 1809 SSRLRNS(H, W, uint32_t, int32_t, uint16_t) 1810 SSRLRNS(W, D, uint64_t, int64_t, uint32_t) 1811 1812 #define VSSRLRN(NAME, BIT, E1, E2, E3) \ 1813 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1814 { \ 1815 int i, j, ofs; \ 1816 VReg *Vd = (VReg *)vd; \ 1817 VReg *Vj = (VReg *)vj; \ 1818 VReg *Vk = (VReg *)vk; \ 1819 int oprsz = simd_oprsz(desc); \ 1820 \ 1821 ofs = LSX_LEN / BIT; \ 1822 for (i = 0; i < oprsz / 16; i++) { \ 1823 for (j = 0; j < ofs; j++) { \ 1824 Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ 1825 Vk->E3(j + ofs * i) % BIT, \ 1826 BIT / 2 - 1); \ 1827 } \ 1828 Vd->D(2 * i + 1) = 0; \ 1829 } \ 1830 } 1831 1832 VSSRLRN(vssrlrn_b_h, 16, B, H, UH) 1833 VSSRLRN(vssrlrn_h_w, 32, H, W, UW) 1834 VSSRLRN(vssrlrn_w_d, 64, W, D, UD) 1835 1836 #define SSRARNS(E1, E2, T1, T2) \ 1837 static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \ 1838 { \ 1839 T1 shft_res; \ 1840 \ 1841 shft_res = do_vsrar_ ## E2(e2, sa); \ 1842 T2 mask; \ 1843 mask = (1ll << sh) - 1; \ 1844 if (shft_res > mask) { \ 1845 return mask; \ 1846 } else if (shft_res < -(mask +1)) { \ 1847 return ~mask; \ 1848 } else { \ 1849 return shft_res; \ 1850 } \ 1851 } 1852 1853 SSRARNS(B, H, int16_t, int8_t) 1854 SSRARNS(H, W, int32_t, int16_t) 1855 SSRARNS(W, D, int64_t, int32_t) 1856 1857 #define VSSRARN(NAME, BIT, E1, E2, E3) \ 1858 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1859 { \ 1860 int i, j, ofs; \ 1861 VReg *Vd = (VReg *)vd; \ 1862 VReg *Vj = (VReg *)vj; \ 1863 VReg *Vk = (VReg *)vk; \ 1864 int oprsz = simd_oprsz(desc); \ 1865 \ 1866 ofs = LSX_LEN / BIT; \ 1867 for (i = 0; i < oprsz / 16; i++) { \ 1868 for (j = 0; j < ofs; j++) { \ 1869 Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ 1870 Vk->E3(j + ofs * i) % BIT, \ 1871 BIT/ 2 - 1); \ 1872 } \ 1873 Vd->D(2 * i + 1) = 0; \ 1874 } \ 1875 } 1876 1877 VSSRARN(vssrarn_b_h, 16, B, H, UH) 1878 VSSRARN(vssrarn_h_w, 32, H, W, UW) 1879 VSSRARN(vssrarn_w_d, 64, W, D, UD) 1880 1881 #define SSRLRNU(E1, 
E2, T1, T2, T3) \ 1882 static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \ 1883 { \ 1884 T1 shft_res; \ 1885 \ 1886 shft_res = do_vsrlr_ ## E2(e2, sa); \ 1887 \ 1888 T2 mask; \ 1889 mask = (1ull << sh) - 1; \ 1890 if (shft_res > mask) { \ 1891 return mask; \ 1892 } else { \ 1893 return shft_res; \ 1894 } \ 1895 } 1896 1897 SSRLRNU(B, H, uint16_t, uint8_t, int16_t) 1898 SSRLRNU(H, W, uint32_t, uint16_t, int32_t) 1899 SSRLRNU(W, D, uint64_t, uint32_t, int64_t) 1900 1901 #define VSSRLRNU(NAME, BIT, E1, E2, E3) \ 1902 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1903 { \ 1904 int i, j, ofs; \ 1905 VReg *Vd = (VReg *)vd; \ 1906 VReg *Vj = (VReg *)vj; \ 1907 VReg *Vk = (VReg *)vk; \ 1908 int oprsz = simd_oprsz(desc); \ 1909 \ 1910 ofs = LSX_LEN / BIT; \ 1911 for (i = 0; i < oprsz / 16; i++) { \ 1912 for (j = 0; j < ofs; j++) { \ 1913 Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ 1914 Vk->E3(j + ofs * i) % BIT, \ 1915 BIT / 2); \ 1916 } \ 1917 Vd->D(2 * i + 1) = 0; \ 1918 } \ 1919 } 1920 1921 VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH) 1922 VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW) 1923 VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD) 1924 1925 #define SSRARNU(E1, E2, T1, T2, T3) \ 1926 static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \ 1927 { \ 1928 T1 shft_res; \ 1929 \ 1930 if (e2 < 0) { \ 1931 shft_res = 0; \ 1932 } else { \ 1933 shft_res = do_vsrar_ ## E2(e2, sa); \ 1934 } \ 1935 T2 mask; \ 1936 mask = (1ull << sh) - 1; \ 1937 if (shft_res > mask) { \ 1938 return mask; \ 1939 } else { \ 1940 return shft_res; \ 1941 } \ 1942 } 1943 1944 SSRARNU(B, H, uint16_t, uint8_t, int16_t) 1945 SSRARNU(H, W, uint32_t, uint16_t, int32_t) 1946 SSRARNU(W, D, uint64_t, uint32_t, int64_t) 1947 1948 #define VSSRARNU(NAME, BIT, E1, E2, E3) \ 1949 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1950 { \ 1951 int i, j, ofs; \ 1952 VReg *Vd = (VReg *)vd; \ 1953 VReg *Vj = (VReg *)vj; \ 1954 VReg *Vk = (VReg *)vk; \ 1955 int oprsz = simd_oprsz(desc); \ 1956 \ 1957 ofs = LSX_LEN / BIT; \ 1958 for (i = 0; i < oprsz / 16; i++) { \ 1959 for (j = 0; j < ofs; j++) { \ 1960 Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ 1961 Vk->E3(j + ofs * i) % BIT, \ 1962 BIT / 2); \ 1963 } \ 1964 Vd->D(2 * i + 1) = 0; \ 1965 } \ 1966 } 1967 1968 VSSRARNU(vssrarn_bu_h, 16, B, H, UH) 1969 VSSRARNU(vssrarn_hu_w, 32, H, W, UW) 1970 VSSRARNU(vssrarn_wu_d, 64, W, D, UD) 1971 1972 #define VSSRLRNI(NAME, BIT, E1, E2) \ 1973 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1974 { \ 1975 int i, j, ofs; \ 1976 VReg temp = {}; \ 1977 VReg *Vd = (VReg *)vd; \ 1978 VReg *Vj = (VReg *)vj; \ 1979 int oprsz = simd_oprsz(desc); \ 1980 \ 1981 ofs = LSX_LEN / BIT; \ 1982 for (i = 0; i < oprsz / 16; i++) { \ 1983 for (j = 0; j < ofs; j++) { \ 1984 temp.E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ 1985 imm, BIT / 2 - 1); \ 1986 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrns_ ## E1(Vd->E2(j + ofs * i), \ 1987 imm, BIT / 2 - 1); \ 1988 } \ 1989 } \ 1990 *Vd = temp; \ 1991 } 1992 1993 static void do_vssrlrni_q(VReg *Vd, VReg * Vj, 1994 uint64_t imm, int idx, Int128 mask) 1995 { 1996 Int128 shft_res1, shft_res2, r1, r2; 1997 if (imm == 0) { 1998 shft_res1 = Vj->Q(idx); 1999 shft_res2 = Vd->Q(idx); 2000 } else { 2001 r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one()); 2002 r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one()); 2003 shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1)); 2004 shft_res2 = 
(int128_add(int128_urshift(Vd->Q(idx), imm), r2)); 2005 } 2006 2007 if (int128_ult(mask, shft_res1)) { 2008 Vd->D(idx * 2) = int128_getlo(mask); 2009 }else { 2010 Vd->D(idx * 2) = int128_getlo(shft_res1); 2011 } 2012 2013 if (int128_ult(mask, shft_res2)) { 2014 Vd->D(idx * 2 + 1) = int128_getlo(mask); 2015 }else { 2016 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 2017 } 2018 } 2019 2020 void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2021 { 2022 int i; 2023 Int128 mask; 2024 VReg *Vd = (VReg *)vd; 2025 VReg *Vj = (VReg *)vj; 2026 int oprsz = simd_oprsz(desc); 2027 2028 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 2029 2030 for (i = 0; i < oprsz / 16; i++) { 2031 do_vssrlrni_q(Vd, Vj, imm, i, mask); 2032 } 2033 } 2034 2035 VSSRLRNI(vssrlrni_b_h, 16, B, H) 2036 VSSRLRNI(vssrlrni_h_w, 32, H, W) 2037 VSSRLRNI(vssrlrni_w_d, 64, W, D) 2038 2039 #define VSSRARNI(NAME, BIT, E1, E2) \ 2040 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2041 { \ 2042 int i, j, ofs; \ 2043 VReg temp = {}; \ 2044 VReg *Vd = (VReg *)vd; \ 2045 VReg *Vj = (VReg *)vj; \ 2046 int oprsz = simd_oprsz(desc); \ 2047 \ 2048 ofs = LSX_LEN / BIT; \ 2049 for (i = 0; i < oprsz / 16; i++) { \ 2050 for (j = 0; j < ofs; j++) { \ 2051 temp.E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ 2052 imm, BIT / 2 - 1); \ 2053 temp.E1(j + ofs * (2 * i + 1)) = do_ssrarns_ ## E1(Vd->E2(j + ofs * i), \ 2054 imm, BIT / 2 - 1); \ 2055 } \ 2056 } \ 2057 *Vd = temp; \ 2058 } 2059 2060 static void do_vssrarni_d_q(VReg *Vd, VReg *Vj, 2061 uint64_t imm, int idx, Int128 mask1, Int128 mask2) 2062 { 2063 Int128 shft_res1, shft_res2, r1, r2; 2064 2065 if (imm == 0) { 2066 shft_res1 = Vj->Q(idx); 2067 shft_res2 = Vd->Q(idx); 2068 } else { 2069 r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); 2070 r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); 2071 shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); 2072 shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); 2073 } 2074 if (int128_gt(shft_res1, mask1)) { 2075 Vd->D(idx * 2) = int128_getlo(mask1); 2076 } else if (int128_lt(shft_res1, int128_neg(mask2))) { 2077 Vd->D(idx * 2) = int128_getlo(mask2); 2078 } else { 2079 Vd->D(idx * 2) = int128_getlo(shft_res1); 2080 } 2081 2082 if (int128_gt(shft_res2, mask1)) { 2083 Vd->D(idx * 2 + 1) = int128_getlo(mask1); 2084 } else if (int128_lt(shft_res2, int128_neg(mask2))) { 2085 Vd->D(idx * 2 + 1) = int128_getlo(mask2); 2086 } else { 2087 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 2088 } 2089 } 2090 2091 void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2092 { 2093 int i; 2094 Int128 mask1, mask2; 2095 VReg *Vd = (VReg *)vd; 2096 VReg *Vj = (VReg *)vj; 2097 int oprsz = simd_oprsz(desc); 2098 2099 mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 2100 mask2 = int128_lshift(int128_one(), 63); 2101 2102 for (i = 0; i < oprsz / 16; i++) { 2103 do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2); 2104 } 2105 } 2106 2107 VSSRARNI(vssrarni_b_h, 16, B, H) 2108 VSSRARNI(vssrarni_h_w, 32, H, W) 2109 VSSRARNI(vssrarni_w_d, 64, W, D) 2110 2111 #define VSSRLRNUI(NAME, BIT, E1, E2) \ 2112 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2113 { \ 2114 int i, j, ofs; \ 2115 VReg temp = {}; \ 2116 VReg *Vd = (VReg *)vd; \ 2117 VReg *Vj = (VReg *)vj; \ 2118 int oprsz = simd_oprsz(desc); \ 2119 \ 2120 ofs = LSX_LEN / BIT; \ 2121 for (i = 0; i < oprsz / 16; i++) { \ 2122 for (j = 0; j < ofs; j++) 
{ \ 2123 temp.E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ 2124 imm, BIT / 2); \ 2125 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrnu_ ## E1(Vd->E2(j + ofs * i), \ 2126 imm, BIT / 2); \ 2127 } \ 2128 } \ 2129 *Vd = temp; \ 2130 } 2131 2132 void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2133 { 2134 int i; 2135 Int128 mask; 2136 VReg *Vd = (VReg *)vd; 2137 VReg *Vj = (VReg *)vj; 2138 int oprsz = simd_oprsz(desc); 2139 2140 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 2141 2142 for (i = 0; i < oprsz / 16; i++) { 2143 do_vssrlrni_q(Vd, Vj, imm, i, mask); 2144 } 2145 } 2146 2147 VSSRLRNUI(vssrlrni_bu_h, 16, B, H) 2148 VSSRLRNUI(vssrlrni_hu_w, 32, H, W) 2149 VSSRLRNUI(vssrlrni_wu_d, 64, W, D) 2150 2151 #define VSSRARNUI(NAME, BIT, E1, E2) \ 2152 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2153 { \ 2154 int i, j, ofs; \ 2155 VReg temp = {}; \ 2156 VReg *Vd = (VReg *)vd; \ 2157 VReg *Vj = (VReg *)vj; \ 2158 int oprsz = simd_oprsz(desc); \ 2159 \ 2160 ofs = LSX_LEN / BIT; \ 2161 for (i = 0; i < oprsz / 16; i++) { \ 2162 for (j = 0; j < ofs; j++) { \ 2163 temp.E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ 2164 imm, BIT / 2); \ 2165 temp.E1(j + ofs * (2 * i + 1)) = do_ssrarnu_ ## E1(Vd->E2(j + ofs * i), \ 2166 imm, BIT / 2); \ 2167 } \ 2168 } \ 2169 *Vd = temp; \ 2170 } 2171 2172 static void do_vssrarni_du_q(VReg *Vd, VReg *Vj, 2173 uint64_t imm, int idx, Int128 mask1, Int128 mask2) 2174 { 2175 Int128 shft_res1, shft_res2, r1, r2; 2176 2177 if (imm == 0) { 2178 shft_res1 = Vj->Q(idx); 2179 shft_res2 = Vd->Q(idx); 2180 } else { 2181 r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); 2182 r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); 2183 shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); 2184 shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); 2185 } 2186 2187 if (int128_lt(Vj->Q(idx), int128_zero())) { 2188 shft_res1 = int128_zero(); 2189 } 2190 if (int128_lt(Vd->Q(idx), int128_zero())) { 2191 shft_res2 = int128_zero(); 2192 } 2193 2194 if (int128_gt(shft_res1, mask1)) { 2195 Vd->D(idx * 2) = int128_getlo(mask1); 2196 } else if (int128_lt(shft_res1, int128_neg(mask2))) { 2197 Vd->D(idx * 2) = int128_getlo(mask2); 2198 } else { 2199 Vd->D(idx * 2) = int128_getlo(shft_res1); 2200 } 2201 2202 if (int128_gt(shft_res2, mask1)) { 2203 Vd->D(idx * 2 + 1) = int128_getlo(mask1); 2204 } else if (int128_lt(shft_res2, int128_neg(mask2))) { 2205 Vd->D(idx * 2 + 1) = int128_getlo(mask2); 2206 } else { 2207 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 2208 } 2209 } 2210 2211 void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2212 { 2213 int i; 2214 Int128 mask1, mask2; 2215 VReg *Vd = (VReg *)vd; 2216 VReg *Vj = (VReg *)vj; 2217 int oprsz = simd_oprsz(desc); 2218 2219 mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 2220 mask2 = int128_lshift(int128_one(), 64); 2221 2222 for (i = 0; i < oprsz / 16; i++) { 2223 do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2); 2224 } 2225 } 2226 2227 VSSRARNUI(vssrarni_bu_h, 16, B, H) 2228 VSSRARNUI(vssrarni_hu_w, 32, H, W) 2229 VSSRARNUI(vssrarni_wu_d, 64, W, D) 2230 2231 #define DO_2OP(NAME, BIT, E, DO_OP) \ 2232 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 2233 { \ 2234 int i; \ 2235 VReg *Vd = (VReg *)vd; \ 2236 VReg *Vj = (VReg *)vj; \ 2237 int oprsz = simd_oprsz(desc); \ 2238 \ 2239 for (i = 0; i < oprsz / (BIT / 8); i++) \ 2240 { \ 2241 Vd->E(i) = DO_OP(Vj->E(i)); 
\ 2242 } \ 2243 } 2244 2245 DO_2OP(vclo_b, 8, UB, DO_CLO_B) 2246 DO_2OP(vclo_h, 16, UH, DO_CLO_H) 2247 DO_2OP(vclo_w, 32, UW, DO_CLO_W) 2248 DO_2OP(vclo_d, 64, UD, DO_CLO_D) 2249 DO_2OP(vclz_b, 8, UB, DO_CLZ_B) 2250 DO_2OP(vclz_h, 16, UH, DO_CLZ_H) 2251 DO_2OP(vclz_w, 32, UW, DO_CLZ_W) 2252 DO_2OP(vclz_d, 64, UD, DO_CLZ_D) 2253 2254 #define VPCNT(NAME, BIT, E, FN) \ 2255 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 2256 { \ 2257 int i; \ 2258 VReg *Vd = (VReg *)vd; \ 2259 VReg *Vj = (VReg *)vj; \ 2260 int oprsz = simd_oprsz(desc); \ 2261 \ 2262 for (i = 0; i < oprsz / (BIT / 8); i++) \ 2263 { \ 2264 Vd->E(i) = FN(Vj->E(i)); \ 2265 } \ 2266 } 2267 2268 VPCNT(vpcnt_b, 8, UB, ctpop8) 2269 VPCNT(vpcnt_h, 16, UH, ctpop16) 2270 VPCNT(vpcnt_w, 32, UW, ctpop32) 2271 VPCNT(vpcnt_d, 64, UD, ctpop64) 2272 2273 #define DO_BIT(NAME, BIT, E, DO_OP) \ 2274 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 2275 { \ 2276 int i; \ 2277 VReg *Vd = (VReg *)vd; \ 2278 VReg *Vj = (VReg *)vj; \ 2279 VReg *Vk = (VReg *)vk; \ 2280 int oprsz = simd_oprsz(desc); \ 2281 \ 2282 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2283 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT); \ 2284 } \ 2285 } 2286 2287 DO_BIT(vbitclr_b, 8, UB, DO_BITCLR) 2288 DO_BIT(vbitclr_h, 16, UH, DO_BITCLR) 2289 DO_BIT(vbitclr_w, 32, UW, DO_BITCLR) 2290 DO_BIT(vbitclr_d, 64, UD, DO_BITCLR) 2291 DO_BIT(vbitset_b, 8, UB, DO_BITSET) 2292 DO_BIT(vbitset_h, 16, UH, DO_BITSET) 2293 DO_BIT(vbitset_w, 32, UW, DO_BITSET) 2294 DO_BIT(vbitset_d, 64, UD, DO_BITSET) 2295 DO_BIT(vbitrev_b, 8, UB, DO_BITREV) 2296 DO_BIT(vbitrev_h, 16, UH, DO_BITREV) 2297 DO_BIT(vbitrev_w, 32, UW, DO_BITREV) 2298 DO_BIT(vbitrev_d, 64, UD, DO_BITREV) 2299 2300 #define DO_BITI(NAME, BIT, E, DO_OP) \ 2301 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2302 { \ 2303 int i; \ 2304 VReg *Vd = (VReg *)vd; \ 2305 VReg *Vj = (VReg *)vj; \ 2306 int oprsz = simd_oprsz(desc); \ 2307 \ 2308 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2309 Vd->E(i) = DO_OP(Vj->E(i), imm); \ 2310 } \ 2311 } 2312 2313 DO_BITI(vbitclri_b, 8, UB, DO_BITCLR) 2314 DO_BITI(vbitclri_h, 16, UH, DO_BITCLR) 2315 DO_BITI(vbitclri_w, 32, UW, DO_BITCLR) 2316 DO_BITI(vbitclri_d, 64, UD, DO_BITCLR) 2317 DO_BITI(vbitseti_b, 8, UB, DO_BITSET) 2318 DO_BITI(vbitseti_h, 16, UH, DO_BITSET) 2319 DO_BITI(vbitseti_w, 32, UW, DO_BITSET) 2320 DO_BITI(vbitseti_d, 64, UD, DO_BITSET) 2321 DO_BITI(vbitrevi_b, 8, UB, DO_BITREV) 2322 DO_BITI(vbitrevi_h, 16, UH, DO_BITREV) 2323 DO_BITI(vbitrevi_w, 32, UW, DO_BITREV) 2324 DO_BITI(vbitrevi_d, 64, UD, DO_BITREV) 2325 2326 #define VFRSTP(NAME, BIT, MASK, E) \ 2327 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 2328 { \ 2329 int i, j, m, ofs; \ 2330 VReg *Vd = (VReg *)vd; \ 2331 VReg *Vj = (VReg *)vj; \ 2332 VReg *Vk = (VReg *)vk; \ 2333 int oprsz = simd_oprsz(desc); \ 2334 \ 2335 ofs = LSX_LEN / BIT; \ 2336 for (i = 0; i < oprsz / 16; i++) { \ 2337 m = Vk->E(i * ofs) & MASK; \ 2338 for (j = 0; j < ofs; j++) { \ 2339 if (Vj->E(j + ofs * i) < 0) { \ 2340 break; \ 2341 } \ 2342 } \ 2343 Vd->E(m + i * ofs) = j; \ 2344 } \ 2345 } 2346 2347 VFRSTP(vfrstp_b, 8, 0xf, B) 2348 VFRSTP(vfrstp_h, 16, 0x7, H) 2349 2350 #define VFRSTPI(NAME, BIT, E) \ 2351 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2352 { \ 2353 int i, j, m, ofs; \ 2354 VReg *Vd = (VReg *)vd; \ 2355 VReg *Vj = (VReg *)vj; \ 2356 int oprsz = simd_oprsz(desc); \ 2357 \ 2358 ofs = LSX_LEN / BIT; \ 2359 m = imm % ofs; \ 2360 for (i = 0; i < oprsz / 16; i++) { \ 2361 
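        /* find the index of the first negative element in this 128-bit group */ \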
for (j = 0; j < ofs; j++) { \ 2362 if (Vj->E(j + ofs * i) < 0) { \ 2363 break; \ 2364 } \ 2365 } \ 2366 Vd->E(m + i * ofs) = j; \ 2367 } \ 2368 } 2369 2370 VFRSTPI(vfrstpi_b, 8, B) 2371 VFRSTPI(vfrstpi_h, 16, H) 2372 2373 static void vec_update_fcsr0_mask(CPULoongArchState *env, 2374 uintptr_t pc, int mask) 2375 { 2376 int flags = get_float_exception_flags(&env->fp_status); 2377 2378 set_float_exception_flags(0, &env->fp_status); 2379 2380 flags &= ~mask; 2381 2382 if (flags) { 2383 flags = ieee_ex_to_loongarch(flags); 2384 UPDATE_FP_CAUSE(env->fcsr0, flags); 2385 } 2386 2387 if (GET_FP_ENABLES(env->fcsr0) & flags) { 2388 do_raise_exception(env, EXCCODE_FPE, pc); 2389 } else { 2390 UPDATE_FP_FLAGS(env->fcsr0, flags); 2391 } 2392 } 2393 2394 static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc) 2395 { 2396 vec_update_fcsr0_mask(env, pc, 0); 2397 } 2398 2399 static inline void vec_clear_cause(CPULoongArchState *env) 2400 { 2401 SET_FP_CAUSE(env->fcsr0, 0); 2402 } 2403 2404 #define DO_3OP_F(NAME, BIT, E, FN) \ 2405 void HELPER(NAME)(void *vd, void *vj, void *vk, \ 2406 CPULoongArchState *env, uint32_t desc) \ 2407 { \ 2408 int i; \ 2409 VReg *Vd = (VReg *)vd; \ 2410 VReg *Vj = (VReg *)vj; \ 2411 VReg *Vk = (VReg *)vk; \ 2412 int oprsz = simd_oprsz(desc); \ 2413 \ 2414 vec_clear_cause(env); \ 2415 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2416 Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ 2417 vec_update_fcsr0(env, GETPC()); \ 2418 } \ 2419 } 2420 2421 DO_3OP_F(vfadd_s, 32, UW, float32_add) 2422 DO_3OP_F(vfadd_d, 64, UD, float64_add) 2423 DO_3OP_F(vfsub_s, 32, UW, float32_sub) 2424 DO_3OP_F(vfsub_d, 64, UD, float64_sub) 2425 DO_3OP_F(vfmul_s, 32, UW, float32_mul) 2426 DO_3OP_F(vfmul_d, 64, UD, float64_mul) 2427 DO_3OP_F(vfdiv_s, 32, UW, float32_div) 2428 DO_3OP_F(vfdiv_d, 64, UD, float64_div) 2429 DO_3OP_F(vfmax_s, 32, UW, float32_maxnum) 2430 DO_3OP_F(vfmax_d, 64, UD, float64_maxnum) 2431 DO_3OP_F(vfmin_s, 32, UW, float32_minnum) 2432 DO_3OP_F(vfmin_d, 64, UD, float64_minnum) 2433 DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag) 2434 DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag) 2435 DO_3OP_F(vfmina_s, 32, UW, float32_minnummag) 2436 DO_3OP_F(vfmina_d, 64, UD, float64_minnummag) 2437 2438 #define DO_4OP_F(NAME, BIT, E, FN, flags) \ 2439 void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \ 2440 CPULoongArchState *env, uint32_t desc) \ 2441 { \ 2442 int i; \ 2443 VReg *Vd = (VReg *)vd; \ 2444 VReg *Vj = (VReg *)vj; \ 2445 VReg *Vk = (VReg *)vk; \ 2446 VReg *Va = (VReg *)va; \ 2447 int oprsz = simd_oprsz(desc); \ 2448 \ 2449 vec_clear_cause(env); \ 2450 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2451 Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \ 2452 vec_update_fcsr0(env, GETPC()); \ 2453 } \ 2454 } 2455 2456 DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0) 2457 DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0) 2458 DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c) 2459 DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c) 2460 DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result) 2461 DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result) 2462 DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd, 2463 float_muladd_negate_c | float_muladd_negate_result) 2464 DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd, 2465 float_muladd_negate_c | float_muladd_negate_result) 2466 2467 #define DO_2OP_F(NAME, BIT, E, FN) \ 2468 void HELPER(NAME)(void *vd, void *vj, \ 2469 CPULoongArchState *env, uint32_t desc) \ 2470 
{ \ 2471 int i; \ 2472 VReg *Vd = (VReg *)vd; \ 2473 VReg *Vj = (VReg *)vj; \ 2474 int oprsz = simd_oprsz(desc); \ 2475 \ 2476 vec_clear_cause(env); \ 2477 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2478 Vd->E(i) = FN(env, Vj->E(i)); \ 2479 } \ 2480 } 2481 2482 #define FLOGB(BIT, T) \ 2483 static T do_flogb_## BIT(CPULoongArchState *env, T fj) \ 2484 { \ 2485 T fp, fd; \ 2486 float_status *status = &env->fp_status; \ 2487 FloatRoundMode old_mode = get_float_rounding_mode(status); \ 2488 \ 2489 set_float_rounding_mode(float_round_down, status); \ 2490 fp = float ## BIT ##_log2(fj, status); \ 2491 fd = float ## BIT ##_round_to_int(fp, status); \ 2492 set_float_rounding_mode(old_mode, status); \ 2493 vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \ 2494 return fd; \ 2495 } 2496 2497 FLOGB(32, uint32_t) 2498 FLOGB(64, uint64_t) 2499 2500 #define FCLASS(NAME, BIT, E, FN) \ 2501 void HELPER(NAME)(void *vd, void *vj, \ 2502 CPULoongArchState *env, uint32_t desc) \ 2503 { \ 2504 int i; \ 2505 VReg *Vd = (VReg *)vd; \ 2506 VReg *Vj = (VReg *)vj; \ 2507 int oprsz = simd_oprsz(desc); \ 2508 \ 2509 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2510 Vd->E(i) = FN(env, Vj->E(i)); \ 2511 } \ 2512 } 2513 2514 FCLASS(vfclass_s, 32, UW, helper_fclass_s) 2515 FCLASS(vfclass_d, 64, UD, helper_fclass_d) 2516 2517 #define FSQRT(BIT, T) \ 2518 static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \ 2519 { \ 2520 T fd; \ 2521 fd = float ## BIT ##_sqrt(fj, &env->fp_status); \ 2522 vec_update_fcsr0(env, GETPC()); \ 2523 return fd; \ 2524 } 2525 2526 FSQRT(32, uint32_t) 2527 FSQRT(64, uint64_t) 2528 2529 #define FRECIP(BIT, T) \ 2530 static T do_frecip_## BIT(CPULoongArchState *env, T fj) \ 2531 { \ 2532 T fd; \ 2533 fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \ 2534 vec_update_fcsr0(env, GETPC()); \ 2535 return fd; \ 2536 } 2537 2538 FRECIP(32, uint32_t) 2539 FRECIP(64, uint64_t) 2540 2541 #define FRSQRT(BIT, T) \ 2542 static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \ 2543 { \ 2544 T fd, fp; \ 2545 fp = float ## BIT ##_sqrt(fj, &env->fp_status); \ 2546 fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \ 2547 vec_update_fcsr0(env, GETPC()); \ 2548 return fd; \ 2549 } 2550 2551 FRSQRT(32, uint32_t) 2552 FRSQRT(64, uint64_t) 2553 2554 DO_2OP_F(vflogb_s, 32, UW, do_flogb_32) 2555 DO_2OP_F(vflogb_d, 64, UD, do_flogb_64) 2556 DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32) 2557 DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64) 2558 DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32) 2559 DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64) 2560 DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32) 2561 DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64) 2562 2563 static uint32_t float16_cvt_float32(uint16_t h, float_status *status) 2564 { 2565 return float16_to_float32(h, true, status); 2566 } 2567 static uint64_t float32_cvt_float64(uint32_t s, float_status *status) 2568 { 2569 return float32_to_float64(s, status); 2570 } 2571 2572 static uint16_t float32_cvt_float16(uint32_t s, float_status *status) 2573 { 2574 return float32_to_float16(s, true, status); 2575 } 2576 static uint32_t float64_cvt_float32(uint64_t d, float_status *status) 2577 { 2578 return float64_to_float32(d, status); 2579 } 2580 2581 void HELPER(vfcvtl_s_h)(void *vd, void *vj, 2582 CPULoongArchState *env, uint32_t desc) 2583 { 2584 int i, j, ofs; 2585 VReg temp = {}; 2586 VReg *Vd = (VReg *)vd; 2587 VReg *Vj = (VReg *)vj; 2588 int oprsz = simd_oprsz(desc); 2589 2590 ofs = LSX_LEN / 32; 2591 vec_clear_cause(env); 2592 for (i = 0; i < oprsz / 16; i++) { 
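        /* vfcvtl_s_h: widen the low FP16 half of each 128-bit group to FP32 */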
        for (j = 0; j < ofs; j++) {
            temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * 2 * i),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvtl_d_s)(void *vd, void *vj,
                        CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 64;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvth_s_h)(void *vd, void *vj,
                        CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 32;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvth_d_s)(void *vd, void *vj,
                        CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 64;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
                       CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 32;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i),
                                                                 &env->fp_status);
            temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i),
                                                           &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
                       CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 64;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i),
                                                                 &env->fp_status);
            temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i),
                                                           &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfrint_s)(void *vd, void *vj,
                      CPULoongArchState *env, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    vec_clear_cause(env);
    for (i = 0; i < oprsz / 4; i++) {
        Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
        vec_update_fcsr0(env,
GETPC()); 2725 } 2726 } 2727 2728 void HELPER(vfrint_d)(void *vd, void *vj, 2729 CPULoongArchState *env, uint32_t desc) 2730 { 2731 int i; 2732 VReg *Vd = (VReg *)vd; 2733 VReg *Vj = (VReg *)vj; 2734 int oprsz = simd_oprsz(desc); 2735 2736 vec_clear_cause(env); 2737 for (i = 0; i < oprsz / 8; i++) { 2738 Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status); 2739 vec_update_fcsr0(env, GETPC()); 2740 } 2741 } 2742 2743 #define FCVT_2OP(NAME, BIT, E, MODE) \ 2744 void HELPER(NAME)(void *vd, void *vj, \ 2745 CPULoongArchState *env, uint32_t desc) \ 2746 { \ 2747 int i; \ 2748 VReg *Vd = (VReg *)vd; \ 2749 VReg *Vj = (VReg *)vj; \ 2750 int oprsz = simd_oprsz(desc); \ 2751 \ 2752 vec_clear_cause(env); \ 2753 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2754 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ 2755 set_float_rounding_mode(MODE, &env->fp_status); \ 2756 Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \ 2757 set_float_rounding_mode(old_mode, &env->fp_status); \ 2758 vec_update_fcsr0(env, GETPC()); \ 2759 } \ 2760 } 2761 2762 FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even) 2763 FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even) 2764 FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero) 2765 FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero) 2766 FCVT_2OP(vfrintrp_s, 32, UW, float_round_up) 2767 FCVT_2OP(vfrintrp_d, 64, UD, float_round_up) 2768 FCVT_2OP(vfrintrm_s, 32, UW, float_round_down) 2769 FCVT_2OP(vfrintrm_d, 64, UD, float_round_down) 2770 2771 #define FTINT(NAME, FMT1, FMT2, T1, T2, MODE) \ 2772 static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj) \ 2773 { \ 2774 T2 fd; \ 2775 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ 2776 \ 2777 set_float_rounding_mode(MODE, &env->fp_status); \ 2778 fd = do_## FMT1 ##_to_## FMT2(env, fj); \ 2779 set_float_rounding_mode(old_mode, &env->fp_status); \ 2780 return fd; \ 2781 } 2782 2783 #define DO_FTINT(FMT1, FMT2, T1, T2) \ 2784 static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \ 2785 { \ 2786 T2 fd; \ 2787 \ 2788 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ 2789 if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \ 2790 if (FMT1 ##_is_any_nan(fj)) { \ 2791 fd = 0; \ 2792 } \ 2793 } \ 2794 vec_update_fcsr0(env, GETPC()); \ 2795 return fd; \ 2796 } 2797 2798 DO_FTINT(float32, int32, uint32_t, uint32_t) 2799 DO_FTINT(float64, int64, uint64_t, uint64_t) 2800 DO_FTINT(float32, uint32, uint32_t, uint32_t) 2801 DO_FTINT(float64, uint64, uint64_t, uint64_t) 2802 DO_FTINT(float64, int32, uint64_t, uint32_t) 2803 DO_FTINT(float32, int64, uint32_t, uint64_t) 2804 2805 FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even) 2806 FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even) 2807 FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up) 2808 FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up) 2809 FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero) 2810 FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero) 2811 FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down) 2812 FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down) 2813 2814 DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s) 2815 DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d) 2816 DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s) 2817 DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d) 2818 DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s) 2819 
DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d) 2820 DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s) 2821 DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d) 2822 DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32) 2823 DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64) 2824 2825 FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero) 2826 FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero) 2827 2828 DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s) 2829 DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d) 2830 DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32) 2831 DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64) 2832 2833 FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down) 2834 FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up) 2835 FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero) 2836 FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even) 2837 2838 #define FTINT_W_D(NAME, FN) \ 2839 void HELPER(NAME)(void *vd, void *vj, void *vk, \ 2840 CPULoongArchState *env, uint32_t desc) \ 2841 { \ 2842 int i, j, ofs; \ 2843 VReg temp = {}; \ 2844 VReg *Vd = (VReg *)vd; \ 2845 VReg *Vj = (VReg *)vj; \ 2846 VReg *Vk = (VReg *)vk; \ 2847 int oprsz = simd_oprsz(desc); \ 2848 \ 2849 ofs = LSX_LEN / 64; \ 2850 vec_clear_cause(env); \ 2851 for (i = 0; i < oprsz / 16; i++) { \ 2852 for (j = 0; j < ofs; j++) { \ 2853 temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \ 2854 temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i)); \ 2855 } \ 2856 } \ 2857 *Vd = temp; \ 2858 } 2859 2860 FTINT_W_D(vftint_w_d, do_float64_to_int32) 2861 FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d) 2862 FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d) 2863 FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d) 2864 FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d) 2865 2866 FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down) 2867 FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up) 2868 FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) 2869 FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) 2870 FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down) 2871 FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up) 2872 FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) 2873 FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) 2874 2875 #define FTINTL_L_S(NAME, FN) \ 2876 void HELPER(NAME)(void *vd, void *vj, \ 2877 CPULoongArchState *env, uint32_t desc) \ 2878 { \ 2879 int i, j, ofs; \ 2880 VReg temp; \ 2881 VReg *Vd = (VReg *)vd; \ 2882 VReg *Vj = (VReg *)vj; \ 2883 int oprsz = simd_oprsz(desc); \ 2884 \ 2885 ofs = LSX_LEN / 64; \ 2886 vec_clear_cause(env); \ 2887 for (i = 0; i < oprsz / 16; i++) { \ 2888 for (j = 0; j < ofs; j++) { \ 2889 temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \ 2890 } \ 2891 } \ 2892 *Vd = temp; \ 2893 } 2894 2895 FTINTL_L_S(vftintl_l_s, do_float32_to_int64) 2896 FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s) 2897 FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s) 2898 FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s) 2899 FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s) 2900 2901 #define FTINTH_L_S(NAME, FN) \ 2902 void HELPER(NAME)(void *vd, void *vj, \ 2903 CPULoongArchState *env, uint32_t desc) \ 2904 { \ 2905 int i, j, ofs; \ 2906 VReg temp = {}; \ 2907 VReg *Vd = (VReg *)vd; \ 2908 VReg *Vj = (VReg *)vj; \ 2909 int oprsz = simd_oprsz(desc); \ 2910 \ 2911 ofs = LSX_LEN / 64; \ 2912 
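    /* convert the two high FP32 elements of each 128-bit group */       \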
vec_clear_cause(env); \ 2913 for (i = 0; i < oprsz / 16; i++) { \ 2914 for (j = 0; j < ofs; j++) { \ 2915 temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \ 2916 } \ 2917 } \ 2918 *Vd = temp; \ 2919 } 2920 2921 FTINTH_L_S(vftinth_l_s, do_float32_to_int64) 2922 FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s) 2923 FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s) 2924 FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s) 2925 FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s) 2926 2927 #define FFINT(NAME, FMT1, FMT2, T1, T2) \ 2928 static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \ 2929 { \ 2930 T2 fd; \ 2931 \ 2932 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ 2933 vec_update_fcsr0(env, GETPC()); \ 2934 return fd; \ 2935 } 2936 2937 FFINT(s_w, int32, float32, int32_t, uint32_t) 2938 FFINT(d_l, int64, float64, int64_t, uint64_t) 2939 FFINT(s_wu, uint32, float32, uint32_t, uint32_t) 2940 FFINT(d_lu, uint64, float64, uint64_t, uint64_t) 2941 2942 DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w) 2943 DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l) 2944 DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu) 2945 DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu) 2946 2947 void HELPER(vffintl_d_w)(void *vd, void *vj, 2948 CPULoongArchState *env, uint32_t desc) 2949 { 2950 int i, j, ofs; 2951 VReg temp = {}; 2952 VReg *Vd = (VReg *)vd; 2953 VReg *Vj = (VReg *)vj; 2954 int oprsz = simd_oprsz(desc); 2955 2956 ofs = LSX_LEN / 64; 2957 vec_clear_cause(env); 2958 for (i = 0; i < oprsz / 16; i++) { 2959 for (j = 0; j < ofs; j++) { 2960 temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i), 2961 &env->fp_status); 2962 } 2963 vec_update_fcsr0(env, GETPC()); 2964 } 2965 *Vd = temp; 2966 } 2967 2968 void HELPER(vffinth_d_w)(void *vd, void *vj, 2969 CPULoongArchState *env, uint32_t desc) 2970 { 2971 int i, j, ofs; 2972 VReg temp = {}; 2973 VReg *Vd = (VReg *)vd; 2974 VReg *Vj = (VReg *)vj; 2975 int oprsz = simd_oprsz(desc); 2976 2977 ofs = LSX_LEN / 64; 2978 vec_clear_cause(env); 2979 for (i = 0; i < oprsz /16; i++) { 2980 for (j = 0; j < ofs; j++) { 2981 temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)), 2982 &env->fp_status); 2983 } 2984 vec_update_fcsr0(env, GETPC()); 2985 } 2986 *Vd = temp; 2987 } 2988 2989 void HELPER(vffint_s_l)(void *vd, void *vj, void *vk, 2990 CPULoongArchState *env, uint32_t desc) 2991 { 2992 int i, j, ofs; 2993 VReg temp = {}; 2994 VReg *Vd = (VReg *)vd; 2995 VReg *Vj = (VReg *)vj; 2996 VReg *Vk = (VReg *)vk; 2997 int oprsz = simd_oprsz(desc); 2998 2999 ofs = LSX_LEN / 64; 3000 vec_clear_cause(env); 3001 for (i = 0; i < oprsz / 16; i++) { 3002 for (j = 0; j < ofs; j++) { 3003 temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i), 3004 &env->fp_status); 3005 temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i), 3006 &env->fp_status); 3007 } 3008 vec_update_fcsr0(env, GETPC()); 3009 } 3010 *Vd = temp; 3011 } 3012 3013 #define VCMPI(NAME, BIT, E, DO_OP) \ 3014 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3015 { \ 3016 int i; \ 3017 VReg *Vd = (VReg *)vd; \ 3018 VReg *Vj = (VReg *)vj; \ 3019 typedef __typeof(Vd->E(0)) TD; \ 3020 int oprsz = simd_oprsz(desc); \ 3021 \ 3022 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3023 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ 3024 } \ 3025 } 3026 3027 VCMPI(vseqi_b, 8, B, VSEQ) 3028 VCMPI(vseqi_h, 16, H, VSEQ) 3029 VCMPI(vseqi_w, 32, W, VSEQ) 3030 VCMPI(vseqi_d, 64, D, VSEQ) 3031 VCMPI(vslei_b, 8, B, VSLE) 3032 VCMPI(vslei_h, 16, H, VSLE) 3033 VCMPI(vslei_w, 32, W, VSLE) 3034 VCMPI(vslei_d, 64, D, VSLE) 3035 
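/*
 * Unsigned variants: E is an unsigned element accessor, so both the element
 * and the (TD)imm immediate are compared as unsigned values.
 */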
VCMPI(vslei_bu, 8, UB, VSLE) 3036 VCMPI(vslei_hu, 16, UH, VSLE) 3037 VCMPI(vslei_wu, 32, UW, VSLE) 3038 VCMPI(vslei_du, 64, UD, VSLE) 3039 VCMPI(vslti_b, 8, B, VSLT) 3040 VCMPI(vslti_h, 16, H, VSLT) 3041 VCMPI(vslti_w, 32, W, VSLT) 3042 VCMPI(vslti_d, 64, D, VSLT) 3043 VCMPI(vslti_bu, 8, UB, VSLT) 3044 VCMPI(vslti_hu, 16, UH, VSLT) 3045 VCMPI(vslti_wu, 32, UW, VSLT) 3046 VCMPI(vslti_du, 64, UD, VSLT) 3047 3048 static uint64_t vfcmp_common(CPULoongArchState *env, 3049 FloatRelation cmp, uint32_t flags) 3050 { 3051 uint64_t ret = 0; 3052 3053 switch (cmp) { 3054 case float_relation_less: 3055 ret = (flags & FCMP_LT); 3056 break; 3057 case float_relation_equal: 3058 ret = (flags & FCMP_EQ); 3059 break; 3060 case float_relation_greater: 3061 ret = (flags & FCMP_GT); 3062 break; 3063 case float_relation_unordered: 3064 ret = (flags & FCMP_UN); 3065 break; 3066 default: 3067 g_assert_not_reached(); 3068 } 3069 3070 if (ret) { 3071 ret = -1; 3072 } 3073 3074 return ret; 3075 } 3076 3077 #define VFCMP(NAME, BIT, E, FN) \ 3078 void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ 3079 uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \ 3080 { \ 3081 int i; \ 3082 VReg t; \ 3083 VReg *Vd = &(env->fpr[vd].vreg); \ 3084 VReg *Vj = &(env->fpr[vj].vreg); \ 3085 VReg *Vk = &(env->fpr[vk].vreg); \ 3086 \ 3087 vec_clear_cause(env); \ 3088 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3089 FloatRelation cmp; \ 3090 cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ 3091 t.E(i) = vfcmp_common(env, cmp, flags); \ 3092 vec_update_fcsr0(env, GETPC()); \ 3093 } \ 3094 *Vd = t; \ 3095 } 3096 3097 VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet) 3098 VFCMP(vfcmp_s_s, 32, UW, float32_compare) 3099 VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet) 3100 VFCMP(vfcmp_s_d, 64, UD, float64_compare) 3101 3102 void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3103 { 3104 int i; 3105 VReg *Vd = (VReg *)vd; 3106 VReg *Vj = (VReg *)vj; 3107 3108 for (i = 0; i < simd_oprsz(desc); i++) { 3109 Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm); 3110 } 3111 } 3112 3113 /* Copy from target/arm/tcg/sve_helper.c */ 3114 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 3115 { 3116 int bits = 8 << esz; 3117 uint64_t ones = dup_const(esz, 1); 3118 uint64_t signs = ones << (bits - 1); 3119 uint64_t cmp0, cmp1; 3120 3121 cmp1 = dup_const(esz, n); 3122 cmp0 = cmp1 ^ m0; 3123 cmp1 = cmp1 ^ m1; 3124 cmp0 = (cmp0 - ones) & ~cmp0; 3125 cmp1 = (cmp1 - ones) & ~cmp1; 3126 return (cmp0 | cmp1) & signs; 3127 } 3128 3129 #define SETANYEQZ(NAME, MO) \ 3130 void HELPER(NAME)(CPULoongArchState *env, \ 3131 uint32_t oprsz, uint32_t cd, uint32_t vj) \ 3132 { \ 3133 VReg *Vj = &(env->fpr[vj].vreg); \ 3134 \ 3135 env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \ 3136 if (oprsz == 32) { \ 3137 env->cf[cd & 0x7] = env->cf[cd & 0x7] || \ 3138 do_match2(0, Vj->D(2), Vj->D(3), MO); \ 3139 } \ 3140 } 3141 3142 SETANYEQZ(vsetanyeqz_b, MO_8) 3143 SETANYEQZ(vsetanyeqz_h, MO_16) 3144 SETANYEQZ(vsetanyeqz_w, MO_32) 3145 SETANYEQZ(vsetanyeqz_d, MO_64) 3146 3147 #define SETALLNEZ(NAME, MO) \ 3148 void HELPER(NAME)(CPULoongArchState *env, \ 3149 uint32_t oprsz, uint32_t cd, uint32_t vj) \ 3150 { \ 3151 VReg *Vj = &(env->fpr[vj].vreg); \ 3152 \ 3153 env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO); \ 3154 if (oprsz == 32) { \ 3155 env->cf[cd & 0x7] = env->cf[cd & 0x7] && \ 3156 !do_match2(0, Vj->D(2), Vj->D(3), MO); \ 3157 } \ 3158 } 3159 3160 SETALLNEZ(vsetallnez_b, MO_8) 3161 
SETALLNEZ(vsetallnez_h, MO_16) 3162 SETALLNEZ(vsetallnez_w, MO_32) 3163 SETALLNEZ(vsetallnez_d, MO_64) 3164 3165 #define XVINSVE0(NAME, E, MASK) \ 3166 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3167 { \ 3168 VReg *Vd = (VReg *)vd; \ 3169 VReg *Vj = (VReg *)vj; \ 3170 Vd->E(imm & MASK) = Vj->E(0); \ 3171 } 3172 3173 XVINSVE0(xvinsve0_w, W, 0x7) 3174 XVINSVE0(xvinsve0_d, D, 0x3) 3175 3176 #define XVPICKVE(NAME, E, BIT, MASK) \ 3177 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3178 { \ 3179 int i; \ 3180 VReg *Vd = (VReg *)vd; \ 3181 VReg *Vj = (VReg *)vj; \ 3182 int oprsz = simd_oprsz(desc); \ 3183 \ 3184 Vd->E(0) = Vj->E(imm & MASK); \ 3185 for (i = 1; i < oprsz / (BIT / 8); i++) { \ 3186 Vd->E(i) = 0; \ 3187 } \ 3188 } 3189 3190 XVPICKVE(xvpickve_w, W, 32, 0x7) 3191 XVPICKVE(xvpickve_d, D, 64, 0x3) 3192 3193 #define VPACKEV(NAME, BIT, E) \ 3194 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3195 { \ 3196 int i; \ 3197 VReg temp = {}; \ 3198 VReg *Vd = (VReg *)vd; \ 3199 VReg *Vj = (VReg *)vj; \ 3200 VReg *Vk = (VReg *)vk; \ 3201 int oprsz = simd_oprsz(desc); \ 3202 \ 3203 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3204 temp.E(2 * i + 1) = Vj->E(2 * i); \ 3205 temp.E(2 *i) = Vk->E(2 * i); \ 3206 } \ 3207 *Vd = temp; \ 3208 } 3209 3210 VPACKEV(vpackev_b, 16, B) 3211 VPACKEV(vpackev_h, 32, H) 3212 VPACKEV(vpackev_w, 64, W) 3213 VPACKEV(vpackev_d, 128, D) 3214 3215 #define VPACKOD(NAME, BIT, E) \ 3216 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3217 { \ 3218 int i; \ 3219 VReg temp = {}; \ 3220 VReg *Vd = (VReg *)vd; \ 3221 VReg *Vj = (VReg *)vj; \ 3222 VReg *Vk = (VReg *)vk; \ 3223 int oprsz = simd_oprsz(desc); \ 3224 \ 3225 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3226 temp.E(2 * i + 1) = Vj->E(2 * i + 1); \ 3227 temp.E(2 * i) = Vk->E(2 * i + 1); \ 3228 } \ 3229 *Vd = temp; \ 3230 } 3231 3232 VPACKOD(vpackod_b, 16, B) 3233 VPACKOD(vpackod_h, 32, H) 3234 VPACKOD(vpackod_w, 64, W) 3235 VPACKOD(vpackod_d, 128, D) 3236 3237 #define VPICKEV(NAME, BIT, E) \ 3238 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3239 { \ 3240 int i, j, ofs; \ 3241 VReg temp = {}; \ 3242 VReg *Vd = (VReg *)vd; \ 3243 VReg *Vj = (VReg *)vj; \ 3244 VReg *Vk = (VReg *)vk; \ 3245 int oprsz = simd_oprsz(desc); \ 3246 \ 3247 ofs = LSX_LEN / BIT; \ 3248 for (i = 0; i < oprsz / 16; i++) { \ 3249 for (j = 0; j < ofs; j++) { \ 3250 temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \ 3251 temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i)); \ 3252 } \ 3253 } \ 3254 *Vd = temp; \ 3255 } 3256 3257 VPICKEV(vpickev_b, 16, B) 3258 VPICKEV(vpickev_h, 32, H) 3259 VPICKEV(vpickev_w, 64, W) 3260 VPICKEV(vpickev_d, 128, D) 3261 3262 #define VPICKOD(NAME, BIT, E) \ 3263 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3264 { \ 3265 int i, j, ofs; \ 3266 VReg temp = {}; \ 3267 VReg *Vd = (VReg *)vd; \ 3268 VReg *Vj = (VReg *)vj; \ 3269 VReg *Vk = (VReg *)vk; \ 3270 int oprsz = simd_oprsz(desc); \ 3271 \ 3272 ofs = LSX_LEN / BIT; \ 3273 for (i = 0; i < oprsz / 16; i++) { \ 3274 for (j = 0; j < ofs; j++) { \ 3275 temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \ 3276 temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1); \ 3277 } \ 3278 } \ 3279 *Vd = temp; \ 3280 } 3281 3282 VPICKOD(vpickod_b, 16, B) 3283 VPICKOD(vpickod_h, 32, H) 3284 VPICKOD(vpickod_w, 64, W) 3285 VPICKOD(vpickod_d, 128, D) 3286 3287 #define VILVL(NAME, BIT, E) \ 3288 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t 
desc) \ 3289 { \ 3290 int i, j, ofs; \ 3291 VReg temp = {}; \ 3292 VReg *Vd = (VReg *)vd; \ 3293 VReg *Vj = (VReg *)vj; \ 3294 VReg *Vk = (VReg *)vk; \ 3295 int oprsz = simd_oprsz(desc); \ 3296 \ 3297 ofs = LSX_LEN / BIT; \ 3298 for (i = 0; i < oprsz / 16; i++) { \ 3299 for (j = 0; j < ofs; j++) { \ 3300 temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \ 3301 temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i); \ 3302 } \ 3303 } \ 3304 *Vd = temp; \ 3305 } 3306 3307 VILVL(vilvl_b, 16, B) 3308 VILVL(vilvl_h, 32, H) 3309 VILVL(vilvl_w, 64, W) 3310 VILVL(vilvl_d, 128, D) 3311 3312 #define VILVH(NAME, BIT, E) \ 3313 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3314 { \ 3315 int i, j, ofs; \ 3316 VReg temp = {}; \ 3317 VReg *Vd = (VReg *)vd; \ 3318 VReg *Vj = (VReg *)vj; \ 3319 VReg *Vk = (VReg *)vk; \ 3320 int oprsz = simd_oprsz(desc); \ 3321 \ 3322 ofs = LSX_LEN / BIT; \ 3323 for (i = 0; i < oprsz / 16; i++) { \ 3324 for (j = 0; j < ofs; j++) { \ 3325 temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \ 3326 temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1)); \ 3327 } \ 3328 } \ 3329 *Vd = temp; \ 3330 } 3331 3332 VILVH(vilvh_b, 16, B) 3333 VILVH(vilvh_h, 32, H) 3334 VILVH(vilvh_w, 64, W) 3335 VILVH(vilvh_d, 128, D) 3336 3337 void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc) 3338 { 3339 int i, j, m; 3340 VReg temp = {}; 3341 VReg *Vd = (VReg *)vd; 3342 VReg *Vj = (VReg *)vj; 3343 VReg *Vk = (VReg *)vk; 3344 VReg *Va = (VReg *)va; 3345 int oprsz = simd_oprsz(desc); 3346 3347 m = LSX_LEN / 8; 3348 for (i = 0; i < (oprsz / 16) * m; i++) { 3349 j = i < m ? 0 : 1; 3350 uint64_t k = (uint8_t)Va->B(i) % (2 * m); 3351 temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m); 3352 } 3353 *Vd = temp; 3354 } 3355 3356 #define VSHUF(NAME, BIT, E) \ 3357 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3358 { \ 3359 int i, j, m; \ 3360 VReg temp = {}; \ 3361 VReg *Vd = (VReg *)vd; \ 3362 VReg *Vj = (VReg *)vj; \ 3363 VReg *Vk = (VReg *)vk; \ 3364 int oprsz = simd_oprsz(desc); \ 3365 \ 3366 m = LSX_LEN / BIT; \ 3367 for (i = 0; i < (oprsz / 16) * m; i++) { \ 3368 j = i < m ? 0 : 1; \ 3369 uint64_t k = ((uint8_t)Vd->E(i)) % (2 * m); \ 3370 temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \ 3371 } \ 3372 *Vd = temp; \ 3373 } 3374 3375 VSHUF(vshuf_h, 16, H) 3376 VSHUF(vshuf_w, 32, W) 3377 VSHUF(vshuf_d, 64, D) 3378 3379 #define VSHUF4I(NAME, BIT, E) \ 3380 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3381 { \ 3382 int i, j, max; \ 3383 VReg temp = {}; \ 3384 VReg *Vd = (VReg *)vd; \ 3385 VReg *Vj = (VReg *)vj; \ 3386 int oprsz = simd_oprsz(desc); \ 3387 \ 3388 max = LSX_LEN / BIT; \ 3389 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3390 j = i < max ? 1 : 2; \ 3391 temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \ 3392 } \ 3393 *Vd = temp; \ 3394 } 3395 3396 VSHUF4I(vshuf4i_b, 8, B) 3397 VSHUF4I(vshuf4i_h, 16, H) 3398 VSHUF4I(vshuf4i_w, 32, W) 3399 3400 void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3401 { 3402 int i; 3403 VReg temp = {}; 3404 VReg *Vd = (VReg *)vd; 3405 VReg *Vj = (VReg *)vj; 3406 int oprsz = simd_oprsz(desc); 3407 3408 for (i = 0; i < oprsz / 16; i++) { 3409 temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i); 3410 temp.D(2 * i + 1) = (imm & 8 ? 
                              Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
    }
    *Vd = temp;
}

void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i, m;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;

    m = LASX_LEN / 32;
    for (i = 0; i < m; i++) {
        uint64_t k = (uint8_t)Vk->W(i) % 8;
        temp.W(i) = Vj->W(k);
    }
    *Vd = temp;
}

void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
{
    int i;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
        temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
        temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
        temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
    }
    *Vd = temp;
}

void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
{
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;

    temp.D(0) = Vj->D(imm & 0x3);
    temp.D(1) = Vj->D((imm >> 2) & 0x3);
    temp.D(2) = Vj->D((imm >> 4) & 0x3);
    temp.D(3) = Vj->D((imm >> 6) & 0x3);
    *Vd = temp;
}

void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
{
    int i;
    VReg temp;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;

    for (i = 0; i < 2; i++, imm >>= 4) {
        temp.Q(i) = (imm & 2 ? Vd : Vj)->Q(imm & 1);
    }
    *Vd = temp;
}

#define VEXTRINS(NAME, BIT, E, MASK)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)           \
{                                                                            \
    int i, ins, extr, max;                                                   \
    VReg *Vd = (VReg *)vd;                                                   \
    VReg *Vj = (VReg *)vj;                                                   \
    int oprsz = simd_oprsz(desc);                                            \
                                                                             \
    max = LSX_LEN / BIT;                                                     \
    ins = (imm >> 4) & MASK;                                                 \
    extr = imm & MASK;                                                       \
    for (i = 0; i < oprsz / 16; i++) {                                       \
        Vd->E(ins + i * max) = Vj->E(extr + i * max);                        \
    }                                                                        \
}

VEXTRINS(vextrins_b, 8, B, 0xf)
VEXTRINS(vextrins_h, 16, H, 0x7)
VEXTRINS(vextrins_w, 32, W, 0x3)
VEXTRINS(vextrins_d, 64, D, 0x1)