/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-temp-internal.h"
#include "tcg/tcg-op-common.h"
#include "tcg/tcg-op-gvec-common.h"
#include "tcg/tcg-gvec-desc.h"
#include "tcg-has.h"

#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t max_align;

    switch (oprsz) {
    case 8:
    case 16:
    case 32:
        tcg_debug_assert(oprsz <= maxsz);
        break;
    default:
        tcg_debug_assert(oprsz == maxsz);
        break;
    }
    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));

    max_align = maxsz >= 16 ? 15 : 7;
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands. */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands. */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands. */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    check_size_align(oprsz, maxsz, 0);

    /*
     * We want to check that 'data' will fit into SIMD_DATA_BITS.
     * However, some callers want to treat the data as a signed
     * value (which they can later get back with simd_data())
     * and some want to treat it as an unsigned value.
     * So here we assert only that the data will fit into the
     * field in at least one way.  This means that some invalid
     * values from the caller will not be detected, e.g. if the
     * caller wants to handle the value as a signed integer but
     * incorrectly passes us 1 << (SIMD_DATA_BITS - 1).
     */
    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) ||
                     data == extract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;

    /*
     * We have just asserted in check_size_align that either
     * oprsz is {8,16,32} or matches maxsz.  Encode the final
     * case with '2', as that would otherwise map to 24.
     */
    if (oprsz == maxsz) {
        oprsz = 2;
    }

    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
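/*
 * Worked example (illustrative only, derived from the code above):
 * simd_desc(16, 64, 5) encodes the operation size as 16 / 8 - 1 = 1,
 * the maximum size as 64 / 8 - 1 = 7, and the data field as 5;
 * simd_desc(24, 24, 0), an SVE-style size that is not 8, 16 or 32,
 * takes the "oprsz == maxsz" branch and stores 2 in the operation-size
 * field.  Helpers recover the values with simd_oprsz(), simd_maxsz()
 * and simd_data() from tcg-gvec-desc.h.
 */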
/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}
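/*
 * Illustrative note (not part of the original API documentation): the
 * out-of-line wrappers above only add the operand offsets to tcg_env
 * and emit a single helper call.  For example,
 *
 *     tcg_gen_gvec_3_ool(dofs, aofs, bofs, 16, 16, 0, gen_helper_foo);
 *
 * where gen_helper_foo is a hypothetical helper declared as
 * DEF_HELPER_FLAGS_4(foo, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32),
 * generates code that at run time invokes helper_foo() with pointers
 * to env + dofs, env + aofs, env + bofs and the descriptor
 * simd_desc(16, 16, 0).
 */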
/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);
    tcg_gen_addi_ptr(a4, tcg_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}

/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);
    tcg_gen_addi_ptr(a4, tcg_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}
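/*
 * Worked example (illustrative only): an SVE-style oprsz of 80 bytes
 * implemented in 32-byte (V256) units gives q = 2 and r = 16, so the
 * tail adds ctpop32(16) = 1 more operation; 3 <= MAX_UNROLL, so the
 * inline expansion is accepted.  With 8-byte units q would be 10, so
 * check_size_impl() returns false and the caller must use a larger
 * unit or fall back to an out-of-line helper.
 */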
static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE. */
void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}
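/*
 * Worked example (illustrative only): dup_const(MO_16, 0x1234) yields
 * 0x1234123412341234, and tcg_gen_dup_i32(MO_8, out, in) with in == 0x5a
 * leaves 0x5a5a5a5a in out; the full-width cases (MO_32 for i32, MO_64
 * for i64) reduce to plain moves.
 */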
/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If LIST is NULL, assume that the real operation to be performed
 * is required by all backends.  Otherwise, make sure that the operations
 * in LIST can be performed on elements of size VECE in the selected type.
 * Do not select V64 if PREFER_I64 is true.  Return 0 if no vector type
 * is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /*
     * Recall that ARM SVE allows vector sizes that are not a
     * power of 2, but always a multiple of 16.  The intent is
     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
     * It is hard to imagine a case in which v256 is supported
     * but v128 is not, but check anyway.
     * In addition, expand_clr needs to handle a multiple of 8.
     */
    if (TCG_TARGET_HAS_v256 &&
        check_size_impl(size, 32) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
        (!(size & 16) ||
         (TCG_TARGET_HAS_v128 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V256;
    }
    if (TCG_TARGET_HAS_v128 &&
        check_size_impl(size, 16) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    tcg_debug_assert(oprsz >= 8);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
     * are misaligned wrt the maximum vector size, so do that first.
     */
    if (dofs & 8) {
        tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
        i += 8;
    }

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
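/*
 * Illustrative example: with TCG_TARGET_HAS_v256 and TCG_TARGET_HAS_v128
 * both true (and the required vecop list supported), an 80-byte
 * operation selects TCG_TYPE_V256, and do_dup_store then emits two
 * 32-byte stores followed by one 16-byte store; if maxsz were larger
 * than 80, expand_clr would go on to zero the remaining tail.
 */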
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz. */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_ebb_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                tcg_gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_ebb_new_i32();
                tcg_gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_ebb_new_i64();
            tcg_gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, tcg_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, tcg_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_ebb_new_ptr();
    tcg_gen_addi_ptr(t_ptr, tcg_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_constant_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            t_val = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            t_32 = tcg_temp_ebb_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, tcg_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i32(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, tcg_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}
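/*
 * Illustrative note: for oprsz == 8, the loop in expand_2_i32 above
 * emits two ld_i32 / fni / st_i32 groups, one at offset 0 and one at
 * offset 4; larger sizes simply repeat the pattern, which is why
 * check_size_impl() bounds the number of inline iterations.
 */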
/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        tcg_gen_ld_i32(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        tcg_gen_ld_i32(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t0, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, tcg_env, aofs + i);
        tcg_gen_ld_i32(t2, tcg_env, bofs + i);
        tcg_gen_ld_i32(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t cofs, uint32_t oprsz, int32_t c,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
                                      int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, tcg_env, aofs + i);
        tcg_gen_ld_i32(t2, tcg_env, bofs + i);
        tcg_gen_ld_i32(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3, c);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}
/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, tcg_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, tcg_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        tcg_gen_ld_i64(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        tcg_gen_ld_i64(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t0, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}
/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, tcg_env, aofs + i);
        tcg_gen_ld_i64(t2, tcg_env, bofs + i);
        tcg_gen_ld_i64(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t cofs, uint32_t oprsz, int64_t c,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
                                      int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, tcg_env, aofs + i);
        tcg_gen_ld_i64(t2, tcg_env, bofs + i);
        tcg_gen_ld_i64(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3, c);
        tcg_gen_st_i64(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, tcg_env, dofs + i);
        }
        fni(vece, t1, t0);
        tcg_gen_st_vec(t1, tcg_env, dofs + i);
    }
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, tcg_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, tcg_env, dofs + i);
    }
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, tcg_env, dofs + i);
    }
}
/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        tcg_gen_ld_vec(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, tcg_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, tcg_env, dofs + i);
    }
}

/*
 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t0, tcg_env, aofs + i);
        tcg_gen_ld_vec(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, tcg_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t0, tcg_env, aofs + i);
        }
    }
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t1, tcg_env, aofs + i);
        tcg_gen_ld_vec(t2, tcg_env, bofs + i);
        tcg_gen_ld_vec(t3, tcg_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, tcg_env, aofs + i);
        }
    }
}

/*
 * Expand OPSZ bytes worth of four-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                          uint32_t tysz, TCGType type, int64_t c,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                      TCGv_vec, TCGv_vec, int64_t))
{
    for (uint32_t i = 0; i < oprsz; i += tysz) {
        TCGv_vec t0 = tcg_temp_new_vec(type);
        TCGv_vec t1 = tcg_temp_new_vec(type);
        TCGv_vec t2 = tcg_temp_new_vec(type);
        TCGv_vec t3 = tcg_temp_new_vec(type);

        tcg_gen_ld_vec(t1, tcg_env, aofs + i);
        tcg_gen_ld_vec(t2, tcg_env, bofs + i);
        tcg_gen_ld_vec(t3, tcg_env, cofs + i);
        fni(vece, t0, t1, t2, t3, c);
        tcg_gen_st_vec(t0, tcg_env, dofs + i);
    }
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_constant_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        tcg_gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        tcg_gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c,
                          g->load_dest, g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c,
                          g->load_dest, g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with four vectors and an immediate.  */
void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen4i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
                      32, TCG_TYPE_V256, c, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                      16, TCG_TYPE_V128, c, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                      8, TCG_TYPE_V64, c, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
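/*
 * Usage sketch (illustrative, not taken from any particular target): a
 * front end that keeps four 16-byte guest vector registers contiguously
 * in its CPUArchState at a hypothetical offset vregs_ofs could emit
 * V2 = V0 + V1 over 32-bit lanes with
 *
 *     tcg_gen_gvec_add(MO_32,
 *                      vregs_ofs + 2 * 16,   (dofs)
 *                      vregs_ofs + 0 * 16,   (aofs)
 *                      vregs_ofs + 1 * 16,   (bofs)
 *                      16, 16);              (oprsz, maxsz)
 *
 * The expanders below then pick host vectors, 64/32-bit integer code,
 * or an out-of-line helper as appropriate.
 */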
/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        } else if (vece <= MO_32) {
            TCGv_i32 in = tcg_temp_ebb_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, tcg_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, tcg_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, tcg_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            TCGv_i64 in = tcg_temp_ebb_new_i64();
            tcg_gen_ld_i64(in, tcg_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else if (vece == 4) {
        /* 128-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, tcg_env, aofs);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, tcg_env, dofs + i);
            }
        } else {
            TCGv_i64 in0 = tcg_temp_ebb_new_i64();
            TCGv_i64 in1 = tcg_temp_ebb_new_i64();

            tcg_gen_ld_i64(in0, tcg_env, aofs);
            tcg_gen_ld_i64(in1, tcg_env, aofs + 8);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, tcg_env, dofs + i);
                tcg_gen_st_i64(in1, tcg_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else if (vece == 5) {
        /* 256-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 32);
        tcg_debug_assert(oprsz % 32 == 0);
        if (TCG_TARGET_HAS_v256) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);

            tcg_gen_ld_vec(in, tcg_env, aofs);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in, tcg_env, dofs + i);
            }
        } else if (TCG_TARGET_HAS_v128) {
            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in0, tcg_env, aofs);
            tcg_gen_ld_vec(in1, tcg_env, aofs + 16);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in0, tcg_env, dofs + i);
                tcg_gen_st_vec(in1, tcg_env, dofs + i + 16);
            }
        } else {
            TCGv_i64 in[4];
            int j;

            for (j = 0; j < 4; ++j) {
                in[j] = tcg_temp_ebb_new_i64();
                tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8);
            }
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                for (j = 0; j < 4; ++j) {
                    tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8);
                }
            }
            for (j = 0; j < 4; ++j) {
                tcg_temp_free_i64(in[j]);
            }
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else {
        g_assert_not_reached();
    }
}

void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();
    TCGv_i64 t3 = tcg_temp_ebb_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
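/*
 * Illustrative example of the mask trick, with two MO_8 lanes of a
 * 16-bit value and m == 0x8080: for a == 0x8001 and b == 0x807f,
 * (a & ~m) + (b & ~m) == 0x0080 adds the low seven bits of each lane
 * without any carry crossing a lane boundary, and xoring in
 * (a ^ b) & m == 0x0000 restores the per-lane sign bits, giving 0x0080,
 * i.e. {0x80 + 0x80, 0x01 + 0x7f} == {0x00, 0x80} lane-wise.
 */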
void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
}

void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    TCGv_i32 t2 = tcg_temp_ebb_new_i32();
    TCGv_i32 t3 = tcg_temp_ebb_new_i32();

    tcg_gen_andc_i32(t1, a, m);
    tcg_gen_andc_i32(t2, b, m);
    tcg_gen_xor_i32(t3, a, b);
    tcg_gen_add_i32(d, t1, t2);
    tcg_gen_and_i32(t3, t3, m);
    tcg_gen_xor_i32(d, d, t3);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t3);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
}

void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t1 = tcg_temp_ebb_new_i32();
    TCGv_i32 t2 = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t1, a, ~0xffff);
    tcg_gen_add_i32(t2, a, b);
    tcg_gen_add_i32(t1, t1, b);
    tcg_gen_deposit_i32(d, t1, t2, 0, 16);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_ebb_new_i64();
    TCGv_i64 t2 = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opt_opc = vecop_list_add,
vecop_list_add, 1973 .vece = MO_8 }, 1974 { .fni8 = tcg_gen_vec_add16_i64, 1975 .fniv = tcg_gen_add_vec, 1976 .fno = gen_helper_gvec_adds16, 1977 .opt_opc = vecop_list_add, 1978 .vece = MO_16 }, 1979 { .fni4 = tcg_gen_add_i32, 1980 .fniv = tcg_gen_add_vec, 1981 .fno = gen_helper_gvec_adds32, 1982 .opt_opc = vecop_list_add, 1983 .vece = MO_32 }, 1984 { .fni8 = tcg_gen_add_i64, 1985 .fniv = tcg_gen_add_vec, 1986 .fno = gen_helper_gvec_adds64, 1987 .opt_opc = vecop_list_add, 1988 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1989 .vece = MO_64 }, 1990 }; 1991 1992 tcg_debug_assert(vece <= MO_64); 1993 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1994 } 1995 1996 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1997 int64_t c, uint32_t oprsz, uint32_t maxsz) 1998 { 1999 TCGv_i64 tmp = tcg_constant_i64(c); 2000 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 2001 } 2002 2003 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 2004 2005 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 2006 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2007 { 2008 static const GVecGen2s g[4] = { 2009 { .fni8 = tcg_gen_vec_sub8_i64, 2010 .fniv = tcg_gen_sub_vec, 2011 .fno = gen_helper_gvec_subs8, 2012 .opt_opc = vecop_list_sub, 2013 .vece = MO_8 }, 2014 { .fni8 = tcg_gen_vec_sub16_i64, 2015 .fniv = tcg_gen_sub_vec, 2016 .fno = gen_helper_gvec_subs16, 2017 .opt_opc = vecop_list_sub, 2018 .vece = MO_16 }, 2019 { .fni4 = tcg_gen_sub_i32, 2020 .fniv = tcg_gen_sub_vec, 2021 .fno = gen_helper_gvec_subs32, 2022 .opt_opc = vecop_list_sub, 2023 .vece = MO_32 }, 2024 { .fni8 = tcg_gen_sub_i64, 2025 .fniv = tcg_gen_sub_vec, 2026 .fno = gen_helper_gvec_subs64, 2027 .opt_opc = vecop_list_sub, 2028 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2029 .vece = MO_64 }, 2030 }; 2031 2032 tcg_debug_assert(vece <= MO_64); 2033 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2034 } 2035 2036 /* Perform a vector subtraction using normal subtraction and a mask. 2037 Compare gen_addv_mask above. 
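   Forcing the sign bit of each a lane set and of each b lane clear
   means no lane can borrow from its neighbour; xoring in the mask of
   ~(a ^ b) sign bits then restores the correct sign bit of each
   difference.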
*/ 2038 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 2039 { 2040 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2041 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2042 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2043 2044 tcg_gen_or_i64(t1, a, m); 2045 tcg_gen_andc_i64(t2, b, m); 2046 tcg_gen_eqv_i64(t3, a, b); 2047 tcg_gen_sub_i64(d, t1, t2); 2048 tcg_gen_and_i64(t3, t3, m); 2049 tcg_gen_xor_i64(d, d, t3); 2050 2051 tcg_temp_free_i64(t1); 2052 tcg_temp_free_i64(t2); 2053 tcg_temp_free_i64(t3); 2054 } 2055 2056 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2057 { 2058 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2059 gen_subv_mask(d, a, b, m); 2060 } 2061 2062 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2063 { 2064 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 2065 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2066 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2067 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 2068 2069 tcg_gen_or_i32(t1, a, m); 2070 tcg_gen_andc_i32(t2, b, m); 2071 tcg_gen_eqv_i32(t3, a, b); 2072 tcg_gen_sub_i32(d, t1, t2); 2073 tcg_gen_and_i32(t3, t3, m); 2074 tcg_gen_xor_i32(d, d, t3); 2075 2076 tcg_temp_free_i32(t1); 2077 tcg_temp_free_i32(t2); 2078 tcg_temp_free_i32(t3); 2079 } 2080 2081 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2082 { 2083 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2084 gen_subv_mask(d, a, b, m); 2085 } 2086 2087 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2088 { 2089 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2090 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2091 2092 tcg_gen_andi_i32(t1, b, ~0xffff); 2093 tcg_gen_sub_i32(t2, a, b); 2094 tcg_gen_sub_i32(t1, a, t1); 2095 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 2096 2097 tcg_temp_free_i32(t1); 2098 tcg_temp_free_i32(t2); 2099 } 2100 2101 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2102 { 2103 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2104 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2105 2106 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2107 tcg_gen_sub_i64(t2, a, b); 2108 tcg_gen_sub_i64(t1, a, t1); 2109 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2110 2111 tcg_temp_free_i64(t1); 2112 tcg_temp_free_i64(t2); 2113 } 2114 2115 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 2116 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2117 { 2118 static const GVecGen3 g[4] = { 2119 { .fni8 = tcg_gen_vec_sub8_i64, 2120 .fniv = tcg_gen_sub_vec, 2121 .fno = gen_helper_gvec_sub8, 2122 .opt_opc = vecop_list_sub, 2123 .vece = MO_8 }, 2124 { .fni8 = tcg_gen_vec_sub16_i64, 2125 .fniv = tcg_gen_sub_vec, 2126 .fno = gen_helper_gvec_sub16, 2127 .opt_opc = vecop_list_sub, 2128 .vece = MO_16 }, 2129 { .fni4 = tcg_gen_sub_i32, 2130 .fniv = tcg_gen_sub_vec, 2131 .fno = gen_helper_gvec_sub32, 2132 .opt_opc = vecop_list_sub, 2133 .vece = MO_32 }, 2134 { .fni8 = tcg_gen_sub_i64, 2135 .fniv = tcg_gen_sub_vec, 2136 .fno = gen_helper_gvec_sub64, 2137 .opt_opc = vecop_list_sub, 2138 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2139 .vece = MO_64 }, 2140 }; 2141 2142 tcg_debug_assert(vece <= MO_64); 2143 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2144 } 2145 2146 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 2147 2148 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 2149 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2150 { 2151 static const GVecGen3 g[4] = { 2152 { .fniv = tcg_gen_mul_vec, 2153 .fno = gen_helper_gvec_mul8, 2154 .opt_opc = vecop_list_mul, 2155 .vece = MO_8 }, 2156 { 
.fniv = tcg_gen_mul_vec, 2157 .fno = gen_helper_gvec_mul16, 2158 .opt_opc = vecop_list_mul, 2159 .vece = MO_16 }, 2160 { .fni4 = tcg_gen_mul_i32, 2161 .fniv = tcg_gen_mul_vec, 2162 .fno = gen_helper_gvec_mul32, 2163 .opt_opc = vecop_list_mul, 2164 .vece = MO_32 }, 2165 { .fni8 = tcg_gen_mul_i64, 2166 .fniv = tcg_gen_mul_vec, 2167 .fno = gen_helper_gvec_mul64, 2168 .opt_opc = vecop_list_mul, 2169 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2170 .vece = MO_64 }, 2171 }; 2172 2173 tcg_debug_assert(vece <= MO_64); 2174 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2175 } 2176 2177 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 2178 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2179 { 2180 static const GVecGen2s g[4] = { 2181 { .fniv = tcg_gen_mul_vec, 2182 .fno = gen_helper_gvec_muls8, 2183 .opt_opc = vecop_list_mul, 2184 .vece = MO_8 }, 2185 { .fniv = tcg_gen_mul_vec, 2186 .fno = gen_helper_gvec_muls16, 2187 .opt_opc = vecop_list_mul, 2188 .vece = MO_16 }, 2189 { .fni4 = tcg_gen_mul_i32, 2190 .fniv = tcg_gen_mul_vec, 2191 .fno = gen_helper_gvec_muls32, 2192 .opt_opc = vecop_list_mul, 2193 .vece = MO_32 }, 2194 { .fni8 = tcg_gen_mul_i64, 2195 .fniv = tcg_gen_mul_vec, 2196 .fno = gen_helper_gvec_muls64, 2197 .opt_opc = vecop_list_mul, 2198 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2199 .vece = MO_64 }, 2200 }; 2201 2202 tcg_debug_assert(vece <= MO_64); 2203 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2204 } 2205 2206 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 2207 int64_t c, uint32_t oprsz, uint32_t maxsz) 2208 { 2209 TCGv_i64 tmp = tcg_constant_i64(c); 2210 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 2211 } 2212 2213 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2214 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2215 { 2216 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 2217 static const GVecGen3 g[4] = { 2218 { .fniv = tcg_gen_ssadd_vec, 2219 .fno = gen_helper_gvec_ssadd8, 2220 .opt_opc = vecop_list, 2221 .vece = MO_8 }, 2222 { .fniv = tcg_gen_ssadd_vec, 2223 .fno = gen_helper_gvec_ssadd16, 2224 .opt_opc = vecop_list, 2225 .vece = MO_16 }, 2226 { .fniv = tcg_gen_ssadd_vec, 2227 .fno = gen_helper_gvec_ssadd32, 2228 .opt_opc = vecop_list, 2229 .vece = MO_32 }, 2230 { .fniv = tcg_gen_ssadd_vec, 2231 .fno = gen_helper_gvec_ssadd64, 2232 .opt_opc = vecop_list, 2233 .vece = MO_64 }, 2234 }; 2235 tcg_debug_assert(vece <= MO_64); 2236 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2237 } 2238 2239 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2240 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2241 { 2242 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2243 static const GVecGen3 g[4] = { 2244 { .fniv = tcg_gen_sssub_vec, 2245 .fno = gen_helper_gvec_sssub8, 2246 .opt_opc = vecop_list, 2247 .vece = MO_8 }, 2248 { .fniv = tcg_gen_sssub_vec, 2249 .fno = gen_helper_gvec_sssub16, 2250 .opt_opc = vecop_list, 2251 .vece = MO_16 }, 2252 { .fniv = tcg_gen_sssub_vec, 2253 .fno = gen_helper_gvec_sssub32, 2254 .opt_opc = vecop_list, 2255 .vece = MO_32 }, 2256 { .fniv = tcg_gen_sssub_vec, 2257 .fno = gen_helper_gvec_sssub64, 2258 .opt_opc = vecop_list, 2259 .vece = MO_64 }, 2260 }; 2261 tcg_debug_assert(vece <= MO_64); 2262 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2263 } 2264 2265 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2266 { 2267 TCGv_i32 max = tcg_constant_i32(-1); 2268 tcg_gen_add_i32(d, a, b); 2269 
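    /* The wrapped sum is below either addend exactly when the addition
       overflowed, in which case saturate to the all-ones maximum. */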
tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2270 } 2271 2272 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2273 { 2274 TCGv_i64 max = tcg_constant_i64(-1); 2275 tcg_gen_add_i64(d, a, b); 2276 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2277 } 2278 2279 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2280 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2281 { 2282 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2283 static const GVecGen3 g[4] = { 2284 { .fniv = tcg_gen_usadd_vec, 2285 .fno = gen_helper_gvec_usadd8, 2286 .opt_opc = vecop_list, 2287 .vece = MO_8 }, 2288 { .fniv = tcg_gen_usadd_vec, 2289 .fno = gen_helper_gvec_usadd16, 2290 .opt_opc = vecop_list, 2291 .vece = MO_16 }, 2292 { .fni4 = tcg_gen_usadd_i32, 2293 .fniv = tcg_gen_usadd_vec, 2294 .fno = gen_helper_gvec_usadd32, 2295 .opt_opc = vecop_list, 2296 .vece = MO_32 }, 2297 { .fni8 = tcg_gen_usadd_i64, 2298 .fniv = tcg_gen_usadd_vec, 2299 .fno = gen_helper_gvec_usadd64, 2300 .opt_opc = vecop_list, 2301 .vece = MO_64 } 2302 }; 2303 tcg_debug_assert(vece <= MO_64); 2304 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2305 } 2306 2307 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2308 { 2309 TCGv_i32 min = tcg_constant_i32(0); 2310 tcg_gen_sub_i32(d, a, b); 2311 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2312 } 2313 2314 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2315 { 2316 TCGv_i64 min = tcg_constant_i64(0); 2317 tcg_gen_sub_i64(d, a, b); 2318 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2319 } 2320 2321 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2322 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2323 { 2324 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2325 static const GVecGen3 g[4] = { 2326 { .fniv = tcg_gen_ussub_vec, 2327 .fno = gen_helper_gvec_ussub8, 2328 .opt_opc = vecop_list, 2329 .vece = MO_8 }, 2330 { .fniv = tcg_gen_ussub_vec, 2331 .fno = gen_helper_gvec_ussub16, 2332 .opt_opc = vecop_list, 2333 .vece = MO_16 }, 2334 { .fni4 = tcg_gen_ussub_i32, 2335 .fniv = tcg_gen_ussub_vec, 2336 .fno = gen_helper_gvec_ussub32, 2337 .opt_opc = vecop_list, 2338 .vece = MO_32 }, 2339 { .fni8 = tcg_gen_ussub_i64, 2340 .fniv = tcg_gen_ussub_vec, 2341 .fno = gen_helper_gvec_ussub64, 2342 .opt_opc = vecop_list, 2343 .vece = MO_64 } 2344 }; 2345 tcg_debug_assert(vece <= MO_64); 2346 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2347 } 2348 2349 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2350 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2351 { 2352 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2353 static const GVecGen3 g[4] = { 2354 { .fniv = tcg_gen_smin_vec, 2355 .fno = gen_helper_gvec_smin8, 2356 .opt_opc = vecop_list, 2357 .vece = MO_8 }, 2358 { .fniv = tcg_gen_smin_vec, 2359 .fno = gen_helper_gvec_smin16, 2360 .opt_opc = vecop_list, 2361 .vece = MO_16 }, 2362 { .fni4 = tcg_gen_smin_i32, 2363 .fniv = tcg_gen_smin_vec, 2364 .fno = gen_helper_gvec_smin32, 2365 .opt_opc = vecop_list, 2366 .vece = MO_32 }, 2367 { .fni8 = tcg_gen_smin_i64, 2368 .fniv = tcg_gen_smin_vec, 2369 .fno = gen_helper_gvec_smin64, 2370 .opt_opc = vecop_list, 2371 .vece = MO_64 } 2372 }; 2373 tcg_debug_assert(vece <= MO_64); 2374 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2375 } 2376 2377 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2378 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2379 
{ 2380 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2381 static const GVecGen3 g[4] = { 2382 { .fniv = tcg_gen_umin_vec, 2383 .fno = gen_helper_gvec_umin8, 2384 .opt_opc = vecop_list, 2385 .vece = MO_8 }, 2386 { .fniv = tcg_gen_umin_vec, 2387 .fno = gen_helper_gvec_umin16, 2388 .opt_opc = vecop_list, 2389 .vece = MO_16 }, 2390 { .fni4 = tcg_gen_umin_i32, 2391 .fniv = tcg_gen_umin_vec, 2392 .fno = gen_helper_gvec_umin32, 2393 .opt_opc = vecop_list, 2394 .vece = MO_32 }, 2395 { .fni8 = tcg_gen_umin_i64, 2396 .fniv = tcg_gen_umin_vec, 2397 .fno = gen_helper_gvec_umin64, 2398 .opt_opc = vecop_list, 2399 .vece = MO_64 } 2400 }; 2401 tcg_debug_assert(vece <= MO_64); 2402 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2403 } 2404 2405 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2406 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2407 { 2408 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2409 static const GVecGen3 g[4] = { 2410 { .fniv = tcg_gen_smax_vec, 2411 .fno = gen_helper_gvec_smax8, 2412 .opt_opc = vecop_list, 2413 .vece = MO_8 }, 2414 { .fniv = tcg_gen_smax_vec, 2415 .fno = gen_helper_gvec_smax16, 2416 .opt_opc = vecop_list, 2417 .vece = MO_16 }, 2418 { .fni4 = tcg_gen_smax_i32, 2419 .fniv = tcg_gen_smax_vec, 2420 .fno = gen_helper_gvec_smax32, 2421 .opt_opc = vecop_list, 2422 .vece = MO_32 }, 2423 { .fni8 = tcg_gen_smax_i64, 2424 .fniv = tcg_gen_smax_vec, 2425 .fno = gen_helper_gvec_smax64, 2426 .opt_opc = vecop_list, 2427 .vece = MO_64 } 2428 }; 2429 tcg_debug_assert(vece <= MO_64); 2430 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2431 } 2432 2433 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2434 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2435 { 2436 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2437 static const GVecGen3 g[4] = { 2438 { .fniv = tcg_gen_umax_vec, 2439 .fno = gen_helper_gvec_umax8, 2440 .opt_opc = vecop_list, 2441 .vece = MO_8 }, 2442 { .fniv = tcg_gen_umax_vec, 2443 .fno = gen_helper_gvec_umax16, 2444 .opt_opc = vecop_list, 2445 .vece = MO_16 }, 2446 { .fni4 = tcg_gen_umax_i32, 2447 .fniv = tcg_gen_umax_vec, 2448 .fno = gen_helper_gvec_umax32, 2449 .opt_opc = vecop_list, 2450 .vece = MO_32 }, 2451 { .fni8 = tcg_gen_umax_i64, 2452 .fniv = tcg_gen_umax_vec, 2453 .fno = gen_helper_gvec_umax64, 2454 .opt_opc = vecop_list, 2455 .vece = MO_64 } 2456 }; 2457 tcg_debug_assert(vece <= MO_64); 2458 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2459 } 2460 2461 /* Perform a vector negation using normal negation and a mask. 2462 Compare gen_subv_mask above. 
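   This is gen_subv_mask specialized for a == 0: the (a | m) minuend
   reduces to m, and eqv(a, b) & m reduces to m & ~b.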
*/ 2463 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2464 { 2465 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2466 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2467 2468 tcg_gen_andc_i64(t3, m, b); 2469 tcg_gen_andc_i64(t2, b, m); 2470 tcg_gen_sub_i64(d, m, t2); 2471 tcg_gen_xor_i64(d, d, t3); 2472 2473 tcg_temp_free_i64(t2); 2474 tcg_temp_free_i64(t3); 2475 } 2476 2477 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2478 { 2479 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2480 gen_negv_mask(d, b, m); 2481 } 2482 2483 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2484 { 2485 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2486 gen_negv_mask(d, b, m); 2487 } 2488 2489 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2490 { 2491 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2492 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2493 2494 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2495 tcg_gen_neg_i64(t2, b); 2496 tcg_gen_neg_i64(t1, t1); 2497 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2498 2499 tcg_temp_free_i64(t1); 2500 tcg_temp_free_i64(t2); 2501 } 2502 2503 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2504 uint32_t oprsz, uint32_t maxsz) 2505 { 2506 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2507 static const GVecGen2 g[4] = { 2508 { .fni8 = tcg_gen_vec_neg8_i64, 2509 .fniv = tcg_gen_neg_vec, 2510 .fno = gen_helper_gvec_neg8, 2511 .opt_opc = vecop_list, 2512 .vece = MO_8 }, 2513 { .fni8 = tcg_gen_vec_neg16_i64, 2514 .fniv = tcg_gen_neg_vec, 2515 .fno = gen_helper_gvec_neg16, 2516 .opt_opc = vecop_list, 2517 .vece = MO_16 }, 2518 { .fni4 = tcg_gen_neg_i32, 2519 .fniv = tcg_gen_neg_vec, 2520 .fno = gen_helper_gvec_neg32, 2521 .opt_opc = vecop_list, 2522 .vece = MO_32 }, 2523 { .fni8 = tcg_gen_neg_i64, 2524 .fniv = tcg_gen_neg_vec, 2525 .fno = gen_helper_gvec_neg64, 2526 .opt_opc = vecop_list, 2527 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2528 .vece = MO_64 }, 2529 }; 2530 2531 tcg_debug_assert(vece <= MO_64); 2532 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2533 } 2534 2535 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2536 { 2537 TCGv_i64 t = tcg_temp_ebb_new_i64(); 2538 int nbit = 8 << vece; 2539 2540 /* Create -1 for each negative element. */ 2541 tcg_gen_shri_i64(t, b, nbit - 1); 2542 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2543 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2544 2545 /* 2546 * Invert (via xor -1) and add one. 2547 * Because of the ordering the msb is cleared, 2548 * so we never have carry into the next element. 
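     * (Xoring a negative lane with all-ones clears its msb, so the
     * subsequent increment cannot carry into the next lane.)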
2549 */ 2550 tcg_gen_xor_i64(d, b, t); 2551 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2552 tcg_gen_add_i64(d, d, t); 2553 2554 tcg_temp_free_i64(t); 2555 } 2556 2557 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2558 { 2559 gen_absv_mask(d, b, MO_8); 2560 } 2561 2562 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2563 { 2564 gen_absv_mask(d, b, MO_16); 2565 } 2566 2567 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2568 uint32_t oprsz, uint32_t maxsz) 2569 { 2570 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2571 static const GVecGen2 g[4] = { 2572 { .fni8 = tcg_gen_vec_abs8_i64, 2573 .fniv = tcg_gen_abs_vec, 2574 .fno = gen_helper_gvec_abs8, 2575 .opt_opc = vecop_list, 2576 .vece = MO_8 }, 2577 { .fni8 = tcg_gen_vec_abs16_i64, 2578 .fniv = tcg_gen_abs_vec, 2579 .fno = gen_helper_gvec_abs16, 2580 .opt_opc = vecop_list, 2581 .vece = MO_16 }, 2582 { .fni4 = tcg_gen_abs_i32, 2583 .fniv = tcg_gen_abs_vec, 2584 .fno = gen_helper_gvec_abs32, 2585 .opt_opc = vecop_list, 2586 .vece = MO_32 }, 2587 { .fni8 = tcg_gen_abs_i64, 2588 .fniv = tcg_gen_abs_vec, 2589 .fno = gen_helper_gvec_abs64, 2590 .opt_opc = vecop_list, 2591 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2592 .vece = MO_64 }, 2593 }; 2594 2595 tcg_debug_assert(vece <= MO_64); 2596 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2597 } 2598 2599 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2600 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2601 { 2602 static const GVecGen3 g = { 2603 .fni8 = tcg_gen_and_i64, 2604 .fniv = tcg_gen_and_vec, 2605 .fno = gen_helper_gvec_and, 2606 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2607 }; 2608 2609 if (aofs == bofs) { 2610 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2611 } else { 2612 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2613 } 2614 } 2615 2616 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2617 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2618 { 2619 static const GVecGen3 g = { 2620 .fni8 = tcg_gen_or_i64, 2621 .fniv = tcg_gen_or_vec, 2622 .fno = gen_helper_gvec_or, 2623 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2624 }; 2625 2626 if (aofs == bofs) { 2627 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2628 } else { 2629 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2630 } 2631 } 2632 2633 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2634 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2635 { 2636 static const GVecGen3 g = { 2637 .fni8 = tcg_gen_xor_i64, 2638 .fniv = tcg_gen_xor_vec, 2639 .fno = gen_helper_gvec_xor, 2640 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2641 }; 2642 2643 if (aofs == bofs) { 2644 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2645 } else { 2646 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2647 } 2648 } 2649 2650 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2651 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2652 { 2653 static const GVecGen3 g = { 2654 .fni8 = tcg_gen_andc_i64, 2655 .fniv = tcg_gen_andc_vec, 2656 .fno = gen_helper_gvec_andc, 2657 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2658 }; 2659 2660 if (aofs == bofs) { 2661 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2662 } else { 2663 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2664 } 2665 } 2666 2667 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2668 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2669 { 2670 static const GVecGen3 g = { 2671 .fni8 = tcg_gen_orc_i64, 2672 .fniv = tcg_gen_orc_vec, 2673 
.fno = gen_helper_gvec_orc, 2674 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2675 }; 2676 2677 if (aofs == bofs) { 2678 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2679 } else { 2680 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2681 } 2682 } 2683 2684 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2685 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2686 { 2687 static const GVecGen3 g = { 2688 .fni8 = tcg_gen_nand_i64, 2689 .fniv = tcg_gen_nand_vec, 2690 .fno = gen_helper_gvec_nand, 2691 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2692 }; 2693 2694 if (aofs == bofs) { 2695 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2696 } else { 2697 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2698 } 2699 } 2700 2701 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2702 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2703 { 2704 static const GVecGen3 g = { 2705 .fni8 = tcg_gen_nor_i64, 2706 .fniv = tcg_gen_nor_vec, 2707 .fno = gen_helper_gvec_nor, 2708 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2709 }; 2710 2711 if (aofs == bofs) { 2712 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2713 } else { 2714 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2715 } 2716 } 2717 2718 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2719 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2720 { 2721 static const GVecGen3 g = { 2722 .fni8 = tcg_gen_eqv_i64, 2723 .fniv = tcg_gen_eqv_vec, 2724 .fno = gen_helper_gvec_eqv, 2725 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2726 }; 2727 2728 if (aofs == bofs) { 2729 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2730 } else { 2731 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2732 } 2733 } 2734 2735 static const GVecGen2s gop_ands = { 2736 .fni8 = tcg_gen_and_i64, 2737 .fniv = tcg_gen_and_vec, 2738 .fno = gen_helper_gvec_ands, 2739 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2740 .vece = MO_64 2741 }; 2742 2743 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2744 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2745 { 2746 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2747 tcg_gen_dup_i64(vece, tmp, c); 2748 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2749 tcg_temp_free_i64(tmp); 2750 } 2751 2752 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2753 int64_t c, uint32_t oprsz, uint32_t maxsz) 2754 { 2755 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2756 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2757 } 2758 2759 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs, 2760 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2761 { 2762 static GVecGen2s g = { 2763 .fni8 = tcg_gen_andc_i64, 2764 .fniv = tcg_gen_andc_vec, 2765 .fno = gen_helper_gvec_andcs, 2766 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2767 .vece = MO_64 2768 }; 2769 2770 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2771 tcg_gen_dup_i64(vece, tmp, c); 2772 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g); 2773 tcg_temp_free_i64(tmp); 2774 } 2775 2776 static const GVecGen2s gop_xors = { 2777 .fni8 = tcg_gen_xor_i64, 2778 .fniv = tcg_gen_xor_vec, 2779 .fno = gen_helper_gvec_xors, 2780 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2781 .vece = MO_64 2782 }; 2783 2784 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2785 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2786 { 2787 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2788 tcg_gen_dup_i64(vece, tmp, c); 2789 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2790 tcg_temp_free_i64(tmp); 2791 } 2792 
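/*
 * Editor's note, an illustrative sketch not in the original file: the
 * "*s" forms above replicate a runtime TCGv_i64 scalar across lanes
 * with tcg_gen_dup_i64, while the "*i" forms below replicate an
 * immediate at translation time via dup_const.  A front end wanting
 * "VD = VN ^ 0x1b" on a hypothetical 16-byte register slot (vd_ofs and
 * vn_ofs are made-up tcg_env offsets) might emit:
 *
 *     tcg_gen_gvec_xori(MO_8, vd_ofs, vn_ofs, 0x1b, 16, 16);
 */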
2793 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2794 int64_t c, uint32_t oprsz, uint32_t maxsz) 2795 { 2796 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2797 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2798 } 2799 2800 static const GVecGen2s gop_ors = { 2801 .fni8 = tcg_gen_or_i64, 2802 .fniv = tcg_gen_or_vec, 2803 .fno = gen_helper_gvec_ors, 2804 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2805 .vece = MO_64 2806 }; 2807 2808 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2809 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2810 { 2811 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2812 tcg_gen_dup_i64(vece, tmp, c); 2813 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2814 tcg_temp_free_i64(tmp); 2815 } 2816 2817 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2818 int64_t c, uint32_t oprsz, uint32_t maxsz) 2819 { 2820 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2821 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2822 } 2823 2824 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2825 { 2826 uint64_t mask = dup_const(MO_8, 0xff << c); 2827 tcg_gen_shli_i64(d, a, c); 2828 tcg_gen_andi_i64(d, d, mask); 2829 } 2830 2831 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2832 { 2833 uint64_t mask = dup_const(MO_16, 0xffff << c); 2834 tcg_gen_shli_i64(d, a, c); 2835 tcg_gen_andi_i64(d, d, mask); 2836 } 2837 2838 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2839 { 2840 uint32_t mask = dup_const(MO_8, 0xff << c); 2841 tcg_gen_shli_i32(d, a, c); 2842 tcg_gen_andi_i32(d, d, mask); 2843 } 2844 2845 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2846 { 2847 uint32_t mask = dup_const(MO_16, 0xffff << c); 2848 tcg_gen_shli_i32(d, a, c); 2849 tcg_gen_andi_i32(d, d, mask); 2850 } 2851 2852 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2853 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2854 { 2855 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2856 static const GVecGen2i g[4] = { 2857 { .fni8 = tcg_gen_vec_shl8i_i64, 2858 .fniv = tcg_gen_shli_vec, 2859 .fno = gen_helper_gvec_shl8i, 2860 .opt_opc = vecop_list, 2861 .vece = MO_8 }, 2862 { .fni8 = tcg_gen_vec_shl16i_i64, 2863 .fniv = tcg_gen_shli_vec, 2864 .fno = gen_helper_gvec_shl16i, 2865 .opt_opc = vecop_list, 2866 .vece = MO_16 }, 2867 { .fni4 = tcg_gen_shli_i32, 2868 .fniv = tcg_gen_shli_vec, 2869 .fno = gen_helper_gvec_shl32i, 2870 .opt_opc = vecop_list, 2871 .vece = MO_32 }, 2872 { .fni8 = tcg_gen_shli_i64, 2873 .fniv = tcg_gen_shli_vec, 2874 .fno = gen_helper_gvec_shl64i, 2875 .opt_opc = vecop_list, 2876 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2877 .vece = MO_64 }, 2878 }; 2879 2880 tcg_debug_assert(vece <= MO_64); 2881 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2882 if (shift == 0) { 2883 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2884 } else { 2885 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2886 } 2887 } 2888 2889 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2890 { 2891 uint64_t mask = dup_const(MO_8, 0xff >> c); 2892 tcg_gen_shri_i64(d, a, c); 2893 tcg_gen_andi_i64(d, d, mask); 2894 } 2895 2896 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2897 { 2898 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2899 tcg_gen_shri_i64(d, a, c); 2900 tcg_gen_andi_i64(d, d, mask); 2901 } 2902 2903 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2904 { 2905 uint32_t mask = 
dup_const(MO_8, 0xff >> c); 2906 tcg_gen_shri_i32(d, a, c); 2907 tcg_gen_andi_i32(d, d, mask); 2908 } 2909 2910 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2911 { 2912 uint32_t mask = dup_const(MO_16, 0xffff >> c); 2913 tcg_gen_shri_i32(d, a, c); 2914 tcg_gen_andi_i32(d, d, mask); 2915 } 2916 2917 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2918 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2919 { 2920 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2921 static const GVecGen2i g[4] = { 2922 { .fni8 = tcg_gen_vec_shr8i_i64, 2923 .fniv = tcg_gen_shri_vec, 2924 .fno = gen_helper_gvec_shr8i, 2925 .opt_opc = vecop_list, 2926 .vece = MO_8 }, 2927 { .fni8 = tcg_gen_vec_shr16i_i64, 2928 .fniv = tcg_gen_shri_vec, 2929 .fno = gen_helper_gvec_shr16i, 2930 .opt_opc = vecop_list, 2931 .vece = MO_16 }, 2932 { .fni4 = tcg_gen_shri_i32, 2933 .fniv = tcg_gen_shri_vec, 2934 .fno = gen_helper_gvec_shr32i, 2935 .opt_opc = vecop_list, 2936 .vece = MO_32 }, 2937 { .fni8 = tcg_gen_shri_i64, 2938 .fniv = tcg_gen_shri_vec, 2939 .fno = gen_helper_gvec_shr64i, 2940 .opt_opc = vecop_list, 2941 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2942 .vece = MO_64 }, 2943 }; 2944 2945 tcg_debug_assert(vece <= MO_64); 2946 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2947 if (shift == 0) { 2948 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2949 } else { 2950 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2951 } 2952 } 2953 2954 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2955 { 2956 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2957 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2958 TCGv_i64 s = tcg_temp_ebb_new_i64(); 2959 2960 tcg_gen_shri_i64(d, a, c); 2961 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2962 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2963 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2964 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2965 tcg_temp_free_i64(s); 2966 } 2967 2968 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2969 { 2970 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2971 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2972 TCGv_i64 s = tcg_temp_ebb_new_i64(); 2973 2974 tcg_gen_shri_i64(d, a, c); 2975 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2976 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2977 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2978 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2979 tcg_temp_free_i64(s); 2980 } 2981 2982 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2983 { 2984 uint32_t s_mask = dup_const(MO_8, 0x80 >> c); 2985 uint32_t c_mask = dup_const(MO_8, 0xff >> c); 2986 TCGv_i32 s = tcg_temp_ebb_new_i32(); 2987 2988 tcg_gen_shri_i32(d, a, c); 2989 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 2990 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 2991 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */ 2992 tcg_gen_or_i32(d, d, s); /* include sign extension */ 2993 tcg_temp_free_i32(s); 2994 } 2995 2996 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2997 { 2998 uint32_t s_mask = dup_const(MO_16, 0x8000 >> c); 2999 uint32_t c_mask = dup_const(MO_16, 0xffff >> c); 3000 TCGv_i32 s = tcg_temp_ebb_new_i32(); 3001 3002 tcg_gen_shri_i32(d, a, c); 3003 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 3004 tcg_gen_andi_i32(d, 
d, c_mask); /* clear out bits above sign */ 3005 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 3006 tcg_gen_or_i32(d, d, s); /* include sign extension */ 3007 tcg_temp_free_i32(s); 3008 } 3009 3010 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 3011 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3012 { 3013 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 3014 static const GVecGen2i g[4] = { 3015 { .fni8 = tcg_gen_vec_sar8i_i64, 3016 .fniv = tcg_gen_sari_vec, 3017 .fno = gen_helper_gvec_sar8i, 3018 .opt_opc = vecop_list, 3019 .vece = MO_8 }, 3020 { .fni8 = tcg_gen_vec_sar16i_i64, 3021 .fniv = tcg_gen_sari_vec, 3022 .fno = gen_helper_gvec_sar16i, 3023 .opt_opc = vecop_list, 3024 .vece = MO_16 }, 3025 { .fni4 = tcg_gen_sari_i32, 3026 .fniv = tcg_gen_sari_vec, 3027 .fno = gen_helper_gvec_sar32i, 3028 .opt_opc = vecop_list, 3029 .vece = MO_32 }, 3030 { .fni8 = tcg_gen_sari_i64, 3031 .fniv = tcg_gen_sari_vec, 3032 .fno = gen_helper_gvec_sar64i, 3033 .opt_opc = vecop_list, 3034 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3035 .vece = MO_64 }, 3036 }; 3037 3038 tcg_debug_assert(vece <= MO_64); 3039 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3040 if (shift == 0) { 3041 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3042 } else { 3043 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3044 } 3045 } 3046 3047 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3048 { 3049 uint64_t mask = dup_const(MO_8, 0xff << c); 3050 3051 tcg_gen_shli_i64(d, a, c); 3052 tcg_gen_shri_i64(a, a, 8 - c); 3053 tcg_gen_andi_i64(d, d, mask); 3054 tcg_gen_andi_i64(a, a, ~mask); 3055 tcg_gen_or_i64(d, d, a); 3056 } 3057 3058 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3059 { 3060 uint64_t mask = dup_const(MO_16, 0xffff << c); 3061 3062 tcg_gen_shli_i64(d, a, c); 3063 tcg_gen_shri_i64(a, a, 16 - c); 3064 tcg_gen_andi_i64(d, d, mask); 3065 tcg_gen_andi_i64(a, a, ~mask); 3066 tcg_gen_or_i64(d, d, a); 3067 } 3068 3069 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 3070 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3071 { 3072 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 3073 static const GVecGen2i g[4] = { 3074 { .fni8 = tcg_gen_vec_rotl8i_i64, 3075 .fniv = tcg_gen_rotli_vec, 3076 .fno = gen_helper_gvec_rotl8i, 3077 .opt_opc = vecop_list, 3078 .vece = MO_8 }, 3079 { .fni8 = tcg_gen_vec_rotl16i_i64, 3080 .fniv = tcg_gen_rotli_vec, 3081 .fno = gen_helper_gvec_rotl16i, 3082 .opt_opc = vecop_list, 3083 .vece = MO_16 }, 3084 { .fni4 = tcg_gen_rotli_i32, 3085 .fniv = tcg_gen_rotli_vec, 3086 .fno = gen_helper_gvec_rotl32i, 3087 .opt_opc = vecop_list, 3088 .vece = MO_32 }, 3089 { .fni8 = tcg_gen_rotli_i64, 3090 .fniv = tcg_gen_rotli_vec, 3091 .fno = gen_helper_gvec_rotl64i, 3092 .opt_opc = vecop_list, 3093 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3094 .vece = MO_64 }, 3095 }; 3096 3097 tcg_debug_assert(vece <= MO_64); 3098 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3099 if (shift == 0) { 3100 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3101 } else { 3102 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3103 } 3104 } 3105 3106 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 3107 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3108 { 3109 tcg_debug_assert(vece <= MO_64); 3110 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3111 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 3112 oprsz, maxsz); 3113 } 3114 3115 /* 3116 
* Specialized generation vector shifts by a non-constant scalar. 3117 */ 3118 3119 typedef struct { 3120 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 3121 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 3122 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 3123 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 3124 gen_helper_gvec_2 *fno[4]; 3125 TCGOpcode s_list[2]; 3126 TCGOpcode v_list[2]; 3127 } GVecGen2sh; 3128 3129 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3130 uint32_t oprsz, uint32_t tysz, TCGType type, 3131 TCGv_i32 shift, 3132 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 3133 { 3134 for (uint32_t i = 0; i < oprsz; i += tysz) { 3135 TCGv_vec t0 = tcg_temp_new_vec(type); 3136 TCGv_vec t1 = tcg_temp_new_vec(type); 3137 3138 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 3139 fni(vece, t1, t0, shift); 3140 tcg_gen_st_vec(t1, tcg_env, dofs + i); 3141 } 3142 } 3143 3144 static void 3145 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 3146 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 3147 { 3148 TCGType type; 3149 uint32_t some; 3150 3151 check_size_align(oprsz, maxsz, dofs | aofs); 3152 check_overlap_2(dofs, aofs, maxsz); 3153 3154 /* If the backend has a scalar expansion, great. */ 3155 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 3156 if (type) { 3157 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3158 switch (type) { 3159 case TCG_TYPE_V256: 3160 some = QEMU_ALIGN_DOWN(oprsz, 32); 3161 expand_2sh_vec(vece, dofs, aofs, some, 32, 3162 TCG_TYPE_V256, shift, g->fniv_s); 3163 if (some == oprsz) { 3164 break; 3165 } 3166 dofs += some; 3167 aofs += some; 3168 oprsz -= some; 3169 maxsz -= some; 3170 /* fallthru */ 3171 case TCG_TYPE_V128: 3172 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 3173 TCG_TYPE_V128, shift, g->fniv_s); 3174 break; 3175 case TCG_TYPE_V64: 3176 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 3177 TCG_TYPE_V64, shift, g->fniv_s); 3178 break; 3179 default: 3180 g_assert_not_reached(); 3181 } 3182 tcg_swap_vecop_list(hold_list); 3183 goto clear_tail; 3184 } 3185 3186 /* If the backend supports variable vector shifts, also cool. */ 3187 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 3188 if (type) { 3189 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3190 TCGv_vec v_shift = tcg_temp_new_vec(type); 3191 3192 if (vece == MO_64) { 3193 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3194 tcg_gen_extu_i32_i64(sh64, shift); 3195 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 3196 tcg_temp_free_i64(sh64); 3197 } else { 3198 tcg_gen_dup_i32_vec(vece, v_shift, shift); 3199 } 3200 3201 switch (type) { 3202 case TCG_TYPE_V256: 3203 some = QEMU_ALIGN_DOWN(oprsz, 32); 3204 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 3205 v_shift, false, g->fniv_v); 3206 if (some == oprsz) { 3207 break; 3208 } 3209 dofs += some; 3210 aofs += some; 3211 oprsz -= some; 3212 maxsz -= some; 3213 /* fallthru */ 3214 case TCG_TYPE_V128: 3215 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 3216 v_shift, false, g->fniv_v); 3217 break; 3218 case TCG_TYPE_V64: 3219 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 3220 v_shift, false, g->fniv_v); 3221 break; 3222 default: 3223 g_assert_not_reached(); 3224 } 3225 tcg_temp_free_vec(v_shift); 3226 tcg_swap_vecop_list(hold_list); 3227 goto clear_tail; 3228 } 3229 3230 /* Otherwise fall back to integral... 
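       A per-element i32/i64 loop is used when the element size allows
       it; otherwise an out-of-line helper receives the runtime shift
       count through the descriptor's data field.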
*/ 3231 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3232 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 3233 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3234 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3235 tcg_gen_extu_i32_i64(sh64, shift); 3236 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 3237 tcg_temp_free_i64(sh64); 3238 } else { 3239 TCGv_ptr a0 = tcg_temp_ebb_new_ptr(); 3240 TCGv_ptr a1 = tcg_temp_ebb_new_ptr(); 3241 TCGv_i32 desc = tcg_temp_ebb_new_i32(); 3242 3243 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 3244 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 3245 tcg_gen_addi_ptr(a0, tcg_env, dofs); 3246 tcg_gen_addi_ptr(a1, tcg_env, aofs); 3247 3248 g->fno[vece](a0, a1, desc); 3249 3250 tcg_temp_free_ptr(a0); 3251 tcg_temp_free_ptr(a1); 3252 tcg_temp_free_i32(desc); 3253 return; 3254 } 3255 3256 clear_tail: 3257 if (oprsz < maxsz) { 3258 expand_clr(dofs + oprsz, maxsz - oprsz); 3259 } 3260 } 3261 3262 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 3263 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3264 { 3265 static const GVecGen2sh g = { 3266 .fni4 = tcg_gen_shl_i32, 3267 .fni8 = tcg_gen_shl_i64, 3268 .fniv_s = tcg_gen_shls_vec, 3269 .fniv_v = tcg_gen_shlv_vec, 3270 .fno = { 3271 gen_helper_gvec_shl8i, 3272 gen_helper_gvec_shl16i, 3273 gen_helper_gvec_shl32i, 3274 gen_helper_gvec_shl64i, 3275 }, 3276 .s_list = { INDEX_op_shls_vec, 0 }, 3277 .v_list = { INDEX_op_shlv_vec, 0 }, 3278 }; 3279 3280 tcg_debug_assert(vece <= MO_64); 3281 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3282 } 3283 3284 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 3285 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3286 { 3287 static const GVecGen2sh g = { 3288 .fni4 = tcg_gen_shr_i32, 3289 .fni8 = tcg_gen_shr_i64, 3290 .fniv_s = tcg_gen_shrs_vec, 3291 .fniv_v = tcg_gen_shrv_vec, 3292 .fno = { 3293 gen_helper_gvec_shr8i, 3294 gen_helper_gvec_shr16i, 3295 gen_helper_gvec_shr32i, 3296 gen_helper_gvec_shr64i, 3297 }, 3298 .s_list = { INDEX_op_shrs_vec, 0 }, 3299 .v_list = { INDEX_op_shrv_vec, 0 }, 3300 }; 3301 3302 tcg_debug_assert(vece <= MO_64); 3303 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3304 } 3305 3306 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 3307 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3308 { 3309 static const GVecGen2sh g = { 3310 .fni4 = tcg_gen_sar_i32, 3311 .fni8 = tcg_gen_sar_i64, 3312 .fniv_s = tcg_gen_sars_vec, 3313 .fniv_v = tcg_gen_sarv_vec, 3314 .fno = { 3315 gen_helper_gvec_sar8i, 3316 gen_helper_gvec_sar16i, 3317 gen_helper_gvec_sar32i, 3318 gen_helper_gvec_sar64i, 3319 }, 3320 .s_list = { INDEX_op_sars_vec, 0 }, 3321 .v_list = { INDEX_op_sarv_vec, 0 }, 3322 }; 3323 3324 tcg_debug_assert(vece <= MO_64); 3325 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3326 } 3327 3328 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, 3329 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3330 { 3331 static const GVecGen2sh g = { 3332 .fni4 = tcg_gen_rotl_i32, 3333 .fni8 = tcg_gen_rotl_i64, 3334 .fniv_s = tcg_gen_rotls_vec, 3335 .fniv_v = tcg_gen_rotlv_vec, 3336 .fno = { 3337 gen_helper_gvec_rotl8i, 3338 gen_helper_gvec_rotl16i, 3339 gen_helper_gvec_rotl32i, 3340 gen_helper_gvec_rotl64i, 3341 }, 3342 .s_list = { INDEX_op_rotls_vec, 0 }, 3343 .v_list = { INDEX_op_rotlv_vec, 0 }, 3344 }; 3345 3346 tcg_debug_assert(vece <= MO_64); 3347 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3348 } 3349 3350 void 
tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                   TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i32 tmp = tcg_temp_ebb_new_i32();

    /* A rotate right by N is a rotate left by -N modulo the element width. */
    tcg_gen_neg_i32(tmp, shift);
    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i32(tmp);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where the target front end can easily fold
 * the modulo into its own expansion, vector shifts need the modulo
 * applied here: e.g. for MO_8 a shift count of 9 behaves as a shift
 * by 1.  If the target naturally includes the modulo as part of the
 * operation, great!  If the target has some other behaviour for
 * out-of-range shifts, then it could not use this function anyway,
 * and would need to do its own expansion with custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
3432 */ 3433 3434 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3435 TCGv_vec a, TCGv_vec b) 3436 { 3437 TCGv_vec t = tcg_temp_new_vec_matching(d); 3438 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3439 3440 tcg_gen_and_vec(vece, t, b, m); 3441 tcg_gen_shrv_vec(vece, d, a, t); 3442 tcg_temp_free_vec(t); 3443 } 3444 3445 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3446 { 3447 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3448 3449 tcg_gen_andi_i32(t, b, 31); 3450 tcg_gen_shr_i32(d, a, t); 3451 tcg_temp_free_i32(t); 3452 } 3453 3454 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3455 { 3456 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3457 3458 tcg_gen_andi_i64(t, b, 63); 3459 tcg_gen_shr_i64(d, a, t); 3460 tcg_temp_free_i64(t); 3461 } 3462 3463 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3464 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3465 { 3466 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3467 static const GVecGen3 g[4] = { 3468 { .fniv = tcg_gen_shrv_mod_vec, 3469 .fno = gen_helper_gvec_shr8v, 3470 .opt_opc = vecop_list, 3471 .vece = MO_8 }, 3472 { .fniv = tcg_gen_shrv_mod_vec, 3473 .fno = gen_helper_gvec_shr16v, 3474 .opt_opc = vecop_list, 3475 .vece = MO_16 }, 3476 { .fni4 = tcg_gen_shr_mod_i32, 3477 .fniv = tcg_gen_shrv_mod_vec, 3478 .fno = gen_helper_gvec_shr32v, 3479 .opt_opc = vecop_list, 3480 .vece = MO_32 }, 3481 { .fni8 = tcg_gen_shr_mod_i64, 3482 .fniv = tcg_gen_shrv_mod_vec, 3483 .fno = gen_helper_gvec_shr64v, 3484 .opt_opc = vecop_list, 3485 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3486 .vece = MO_64 }, 3487 }; 3488 3489 tcg_debug_assert(vece <= MO_64); 3490 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3491 } 3492 3493 /* 3494 * Similarly for arithmetic right shifts. 
3495 */ 3496 3497 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3498 TCGv_vec a, TCGv_vec b) 3499 { 3500 TCGv_vec t = tcg_temp_new_vec_matching(d); 3501 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3502 3503 tcg_gen_and_vec(vece, t, b, m); 3504 tcg_gen_sarv_vec(vece, d, a, t); 3505 tcg_temp_free_vec(t); 3506 } 3507 3508 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3509 { 3510 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3511 3512 tcg_gen_andi_i32(t, b, 31); 3513 tcg_gen_sar_i32(d, a, t); 3514 tcg_temp_free_i32(t); 3515 } 3516 3517 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3518 { 3519 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3520 3521 tcg_gen_andi_i64(t, b, 63); 3522 tcg_gen_sar_i64(d, a, t); 3523 tcg_temp_free_i64(t); 3524 } 3525 3526 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3527 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3528 { 3529 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3530 static const GVecGen3 g[4] = { 3531 { .fniv = tcg_gen_sarv_mod_vec, 3532 .fno = gen_helper_gvec_sar8v, 3533 .opt_opc = vecop_list, 3534 .vece = MO_8 }, 3535 { .fniv = tcg_gen_sarv_mod_vec, 3536 .fno = gen_helper_gvec_sar16v, 3537 .opt_opc = vecop_list, 3538 .vece = MO_16 }, 3539 { .fni4 = tcg_gen_sar_mod_i32, 3540 .fniv = tcg_gen_sarv_mod_vec, 3541 .fno = gen_helper_gvec_sar32v, 3542 .opt_opc = vecop_list, 3543 .vece = MO_32 }, 3544 { .fni8 = tcg_gen_sar_mod_i64, 3545 .fniv = tcg_gen_sarv_mod_vec, 3546 .fno = gen_helper_gvec_sar64v, 3547 .opt_opc = vecop_list, 3548 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3549 .vece = MO_64 }, 3550 }; 3551 3552 tcg_debug_assert(vece <= MO_64); 3553 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3554 } 3555 3556 /* 3557 * Similarly for rotates. 
3558 */ 3559 3560 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3561 TCGv_vec a, TCGv_vec b) 3562 { 3563 TCGv_vec t = tcg_temp_new_vec_matching(d); 3564 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3565 3566 tcg_gen_and_vec(vece, t, b, m); 3567 tcg_gen_rotlv_vec(vece, d, a, t); 3568 tcg_temp_free_vec(t); 3569 } 3570 3571 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3572 { 3573 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3574 3575 tcg_gen_andi_i32(t, b, 31); 3576 tcg_gen_rotl_i32(d, a, t); 3577 tcg_temp_free_i32(t); 3578 } 3579 3580 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3581 { 3582 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3583 3584 tcg_gen_andi_i64(t, b, 63); 3585 tcg_gen_rotl_i64(d, a, t); 3586 tcg_temp_free_i64(t); 3587 } 3588 3589 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3590 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3591 { 3592 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3593 static const GVecGen3 g[4] = { 3594 { .fniv = tcg_gen_rotlv_mod_vec, 3595 .fno = gen_helper_gvec_rotl8v, 3596 .opt_opc = vecop_list, 3597 .vece = MO_8 }, 3598 { .fniv = tcg_gen_rotlv_mod_vec, 3599 .fno = gen_helper_gvec_rotl16v, 3600 .opt_opc = vecop_list, 3601 .vece = MO_16 }, 3602 { .fni4 = tcg_gen_rotl_mod_i32, 3603 .fniv = tcg_gen_rotlv_mod_vec, 3604 .fno = gen_helper_gvec_rotl32v, 3605 .opt_opc = vecop_list, 3606 .vece = MO_32 }, 3607 { .fni8 = tcg_gen_rotl_mod_i64, 3608 .fniv = tcg_gen_rotlv_mod_vec, 3609 .fno = gen_helper_gvec_rotl64v, 3610 .opt_opc = vecop_list, 3611 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3612 .vece = MO_64 }, 3613 }; 3614 3615 tcg_debug_assert(vece <= MO_64); 3616 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3617 } 3618 3619 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3620 TCGv_vec a, TCGv_vec b) 3621 { 3622 TCGv_vec t = tcg_temp_new_vec_matching(d); 3623 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3624 3625 tcg_gen_and_vec(vece, t, b, m); 3626 tcg_gen_rotrv_vec(vece, d, a, t); 3627 tcg_temp_free_vec(t); 3628 } 3629 3630 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3631 { 3632 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3633 3634 tcg_gen_andi_i32(t, b, 31); 3635 tcg_gen_rotr_i32(d, a, t); 3636 tcg_temp_free_i32(t); 3637 } 3638 3639 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3640 { 3641 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3642 3643 tcg_gen_andi_i64(t, b, 63); 3644 tcg_gen_rotr_i64(d, a, t); 3645 tcg_temp_free_i64(t); 3646 } 3647 3648 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3649 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3650 { 3651 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3652 static const GVecGen3 g[4] = { 3653 { .fniv = tcg_gen_rotrv_mod_vec, 3654 .fno = gen_helper_gvec_rotr8v, 3655 .opt_opc = vecop_list, 3656 .vece = MO_8 }, 3657 { .fniv = tcg_gen_rotrv_mod_vec, 3658 .fno = gen_helper_gvec_rotr16v, 3659 .opt_opc = vecop_list, 3660 .vece = MO_16 }, 3661 { .fni4 = tcg_gen_rotr_mod_i32, 3662 .fniv = tcg_gen_rotrv_mod_vec, 3663 .fno = gen_helper_gvec_rotr32v, 3664 .opt_opc = vecop_list, 3665 .vece = MO_32 }, 3666 { .fni8 = tcg_gen_rotr_mod_i64, 3667 .fniv = tcg_gen_rotrv_mod_vec, 3668 .fno = gen_helper_gvec_rotr64v, 3669 .opt_opc = vecop_list, 3670 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3671 .vece = MO_64 }, 3672 }; 3673 3674 tcg_debug_assert(vece <= MO_64); 3675 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, 
maxsz, &g[vece]); 3676 } 3677 3678 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 3679 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3680 uint32_t oprsz, TCGCond cond) 3681 { 3682 TCGv_i32 t0 = tcg_temp_ebb_new_i32(); 3683 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 3684 uint32_t i; 3685 3686 for (i = 0; i < oprsz; i += 4) { 3687 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 3688 tcg_gen_ld_i32(t1, tcg_env, bofs + i); 3689 tcg_gen_negsetcond_i32(cond, t0, t0, t1); 3690 tcg_gen_st_i32(t0, tcg_env, dofs + i); 3691 } 3692 tcg_temp_free_i32(t1); 3693 tcg_temp_free_i32(t0); 3694 } 3695 3696 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3697 uint32_t oprsz, TCGCond cond) 3698 { 3699 TCGv_i64 t0 = tcg_temp_ebb_new_i64(); 3700 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 3701 uint32_t i; 3702 3703 for (i = 0; i < oprsz; i += 8) { 3704 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 3705 tcg_gen_ld_i64(t1, tcg_env, bofs + i); 3706 tcg_gen_negsetcond_i64(cond, t0, t0, t1); 3707 tcg_gen_st_i64(t0, tcg_env, dofs + i); 3708 } 3709 tcg_temp_free_i64(t1); 3710 tcg_temp_free_i64(t0); 3711 } 3712 3713 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3714 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3715 TCGType type, TCGCond cond) 3716 { 3717 for (uint32_t i = 0; i < oprsz; i += tysz) { 3718 TCGv_vec t0 = tcg_temp_new_vec(type); 3719 TCGv_vec t1 = tcg_temp_new_vec(type); 3720 TCGv_vec t2 = tcg_temp_new_vec(type); 3721 3722 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 3723 tcg_gen_ld_vec(t1, tcg_env, bofs + i); 3724 tcg_gen_cmp_vec(cond, vece, t2, t0, t1); 3725 tcg_gen_st_vec(t2, tcg_env, dofs + i); 3726 } 3727 } 3728 3729 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3730 uint32_t aofs, uint32_t bofs, 3731 uint32_t oprsz, uint32_t maxsz) 3732 { 3733 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3734 static gen_helper_gvec_3 * const eq_fn[4] = { 3735 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3736 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3737 }; 3738 static gen_helper_gvec_3 * const ne_fn[4] = { 3739 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3740 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3741 }; 3742 static gen_helper_gvec_3 * const lt_fn[4] = { 3743 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3744 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3745 }; 3746 static gen_helper_gvec_3 * const le_fn[4] = { 3747 gen_helper_gvec_le8, gen_helper_gvec_le16, 3748 gen_helper_gvec_le32, gen_helper_gvec_le64 3749 }; 3750 static gen_helper_gvec_3 * const ltu_fn[4] = { 3751 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3752 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3753 }; 3754 static gen_helper_gvec_3 * const leu_fn[4] = { 3755 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3756 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3757 }; 3758 static gen_helper_gvec_3 * const * const fns[16] = { 3759 [TCG_COND_EQ] = eq_fn, 3760 [TCG_COND_NE] = ne_fn, 3761 [TCG_COND_LT] = lt_fn, 3762 [TCG_COND_LE] = le_fn, 3763 [TCG_COND_LTU] = ltu_fn, 3764 [TCG_COND_LEU] = leu_fn, 3765 }; 3766 3767 const TCGOpcode *hold_list; 3768 TCGType type; 3769 uint32_t some; 3770 3771 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3772 check_overlap_3(dofs, aofs, bofs, maxsz); 3773 3774 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3775 do_dup(MO_8, dofs, oprsz, maxsz, 3776 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3777 return; 3778 } 3779 3780 /* 3781 * Implement inline with a vector type, if possible. 
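     * Whichever expansion is chosen, each result lane is written as
     * all-ones when the comparison holds and as zero when it does not.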
3782 * Prefer integer when 64-bit host and 64-bit comparison. 3783 */ 3784 hold_list = tcg_swap_vecop_list(cmp_list); 3785 type = choose_vector_type(cmp_list, vece, oprsz, 3786 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3787 switch (type) { 3788 case TCG_TYPE_V256: 3789 /* Recall that ARM SVE allows vector sizes that are not a 3790 * power of 2, but always a multiple of 16. The intent is 3791 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3792 */ 3793 some = QEMU_ALIGN_DOWN(oprsz, 32); 3794 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3795 if (some == oprsz) { 3796 break; 3797 } 3798 dofs += some; 3799 aofs += some; 3800 bofs += some; 3801 oprsz -= some; 3802 maxsz -= some; 3803 /* fallthru */ 3804 case TCG_TYPE_V128: 3805 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3806 break; 3807 case TCG_TYPE_V64: 3808 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3809 break; 3810 3811 case 0: 3812 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3813 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3814 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3815 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3816 } else { 3817 gen_helper_gvec_3 * const *fn = fns[cond]; 3818 3819 if (fn == NULL) { 3820 uint32_t tmp; 3821 tmp = aofs, aofs = bofs, bofs = tmp; 3822 cond = tcg_swap_cond(cond); 3823 fn = fns[cond]; 3824 assert(fn != NULL); 3825 } 3826 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3827 oprsz = maxsz; 3828 } 3829 break; 3830 3831 default: 3832 g_assert_not_reached(); 3833 } 3834 tcg_swap_vecop_list(hold_list); 3835 3836 if (oprsz < maxsz) { 3837 expand_clr(dofs + oprsz, maxsz - oprsz); 3838 } 3839 } 3840 3841 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3842 uint32_t oprsz, uint32_t tysz, TCGType type, 3843 TCGCond cond, TCGv_vec c) 3844 { 3845 TCGv_vec t0 = tcg_temp_new_vec(type); 3846 TCGv_vec t1 = tcg_temp_new_vec(type); 3847 uint32_t i; 3848 3849 for (i = 0; i < oprsz; i += tysz) { 3850 tcg_gen_ld_vec(t1, tcg_env, aofs + i); 3851 tcg_gen_cmp_vec(cond, vece, t0, t1, c); 3852 tcg_gen_st_vec(t0, tcg_env, dofs + i); 3853 } 3854 } 3855 3856 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs, 3857 uint32_t aofs, TCGv_i64 c, 3858 uint32_t oprsz, uint32_t maxsz) 3859 { 3860 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3861 static gen_helper_gvec_2i * const eq_fn[4] = { 3862 gen_helper_gvec_eqs8, gen_helper_gvec_eqs16, 3863 gen_helper_gvec_eqs32, gen_helper_gvec_eqs64 3864 }; 3865 static gen_helper_gvec_2i * const lt_fn[4] = { 3866 gen_helper_gvec_lts8, gen_helper_gvec_lts16, 3867 gen_helper_gvec_lts32, gen_helper_gvec_lts64 3868 }; 3869 static gen_helper_gvec_2i * const le_fn[4] = { 3870 gen_helper_gvec_les8, gen_helper_gvec_les16, 3871 gen_helper_gvec_les32, gen_helper_gvec_les64 3872 }; 3873 static gen_helper_gvec_2i * const ltu_fn[4] = { 3874 gen_helper_gvec_ltus8, gen_helper_gvec_ltus16, 3875 gen_helper_gvec_ltus32, gen_helper_gvec_ltus64 3876 }; 3877 static gen_helper_gvec_2i * const leu_fn[4] = { 3878 gen_helper_gvec_leus8, gen_helper_gvec_leus16, 3879 gen_helper_gvec_leus32, gen_helper_gvec_leus64 3880 }; 3881 static gen_helper_gvec_2i * const * const fns[16] = { 3882 [TCG_COND_EQ] = eq_fn, 3883 [TCG_COND_LT] = lt_fn, 3884 [TCG_COND_LE] = le_fn, 3885 [TCG_COND_LTU] = ltu_fn, 3886 [TCG_COND_LEU] = leu_fn, 3887 }; 3888 3889 TCGType type; 3890 3891 check_size_align(oprsz, maxsz, dofs | aofs); 3892 check_overlap_2(dofs, aofs, 
maxsz); 3893 3894 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3895 do_dup(MO_8, dofs, oprsz, maxsz, 3896 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3897 return; 3898 } 3899 3900 /* 3901 * Implement inline with a vector type, if possible. 3902 * Prefer integer when 64-bit host and 64-bit comparison. 3903 */ 3904 type = choose_vector_type(cmp_list, vece, oprsz, 3905 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3906 if (type != 0) { 3907 const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list); 3908 TCGv_vec t_vec = tcg_temp_new_vec(type); 3909 uint32_t some; 3910 3911 tcg_gen_dup_i64_vec(vece, t_vec, c); 3912 switch (type) { 3913 case TCG_TYPE_V256: 3914 some = QEMU_ALIGN_DOWN(oprsz, 32); 3915 expand_cmps_vec(vece, dofs, aofs, some, 32, 3916 TCG_TYPE_V256, cond, t_vec); 3917 aofs += some; 3918 dofs += some; 3919 oprsz -= some; 3920 maxsz -= some; 3921 /* fallthru */ 3922 3923 case TCG_TYPE_V128: 3924 some = QEMU_ALIGN_DOWN(oprsz, 16); 3925 expand_cmps_vec(vece, dofs, aofs, some, 16, 3926 TCG_TYPE_V128, cond, t_vec); 3927 break; 3928 3929 case TCG_TYPE_V64: 3930 some = QEMU_ALIGN_DOWN(oprsz, 8); 3931 expand_cmps_vec(vece, dofs, aofs, some, 8, 3932 TCG_TYPE_V64, cond, t_vec); 3933 break; 3934 3935 default: 3936 g_assert_not_reached(); 3937 } 3938 tcg_temp_free_vec(t_vec); 3939 tcg_swap_vecop_list(hold_list); 3940 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3941 TCGv_i64 t0 = tcg_temp_ebb_new_i64(); 3942 uint32_t i; 3943 3944 for (i = 0; i < oprsz; i += 8) { 3945 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 3946 tcg_gen_negsetcond_i64(cond, t0, t0, c); 3947 tcg_gen_st_i64(t0, tcg_env, dofs + i); 3948 } 3949 tcg_temp_free_i64(t0); 3950 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3951 TCGv_i32 t0 = tcg_temp_ebb_new_i32(); 3952 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 3953 uint32_t i; 3954 3955 tcg_gen_extrl_i64_i32(t1, c); 3956 for (i = 0; i < oprsz; i += 4) { 3957 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 3958 tcg_gen_negsetcond_i32(cond, t0, t0, t1); 3959 tcg_gen_st_i32(t0, tcg_env, dofs + i); 3960 } 3961 tcg_temp_free_i32(t0); 3962 tcg_temp_free_i32(t1); 3963 } else { 3964 gen_helper_gvec_2i * const *fn = fns[cond]; 3965 bool inv = false; 3966 3967 if (fn == NULL) { 3968 cond = tcg_invert_cond(cond); 3969 fn = fns[cond]; 3970 assert(fn != NULL); 3971 inv = true; 3972 } 3973 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]); 3974 return; 3975 } 3976 3977 if (oprsz < maxsz) { 3978 expand_clr(dofs + oprsz, maxsz - oprsz); 3979 } 3980 } 3981 3982 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs, 3983 uint32_t aofs, int64_t c, 3984 uint32_t oprsz, uint32_t maxsz) 3985 { 3986 TCGv_i64 tmp = tcg_constant_i64(c); 3987 tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz); 3988 } 3989 3990 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3991 { 3992 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3993 3994 tcg_gen_and_i64(t, b, a); 3995 tcg_gen_andc_i64(d, c, a); 3996 tcg_gen_or_i64(d, d, t); 3997 tcg_temp_free_i64(t); 3998 } 3999 4000 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 4001 uint32_t bofs, uint32_t cofs, 4002 uint32_t oprsz, uint32_t maxsz) 4003 { 4004 static const GVecGen4 g = { 4005 .fni8 = tcg_gen_bitsel_i64, 4006 .fniv = tcg_gen_bitsel_vec, 4007 .fno = gen_helper_gvec_bitsel, 4008 }; 4009 4010 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 4011 } 4012
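/*
 * Editor's addendum: a minimal usage sketch, not part of the original
 * file, showing how the expanders above compose.  The 16-byte operand
 * size and the "scratch_ofs" tcg_env offset are assumptions invented
 * for the example; a real front end would call tcg_gen_gvec_smax()
 * directly.
 */
#if 0
static void gen_example_smax(unsigned vece, uint32_t d_ofs, uint32_t a_ofs,
                             uint32_t b_ofs, uint32_t scratch_ofs)
{
    /* scratch = (a > b) per lane, as all-ones/zero. */
    tcg_gen_gvec_cmp(TCG_COND_GT, vece, scratch_ofs, a_ofs, b_ofs, 16, 16);
    /* d = scratch ? a : b, i.e. the signed maximum of each lane. */
    tcg_gen_gvec_bitsel(vece, d_ofs, scratch_ofs, a_ofs, b_ofs, 16, 16);
}
#endif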