/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"


static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

#define GEN_CMP0(NAME, COND)                                            \
    void NAME(unsigned vece, uint32_t d, uint32_t m,                    \
              uint32_t opr_sz, uint32_t max_sz)                         \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

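/*
 * SSRA: signed shift right and accumulate. The scalar expanders below
 * each handle one 64-bit (or 32-bit) slice of packed elements;
 * gen_gvec_ssra selects between them, the TCGv_vec expansion and the
 * out-of-line helper according to the element size.
 */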
static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit. For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits. With rounding, this produces
         * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

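/*
 * SRSRA: signed rounding shift right and accumulate. These reuse the
 * SRSHR expanders above for the rounded shift and then add the result
 * into the destination.
 */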
static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits. With rounding, this produces
     * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
     * I.e. always zero. With accumulation, this leaves D unchanged.
     */
    if (shift == (8 << vece)) {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

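/*
 * URSHR: unsigned rounding shift right. Same rounding trick as SRSHR
 * above, but using logical shifts.
 */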
static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Unsigned results in zero. With rounding, this produces a
         * copy of the most significant bit.
         */
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

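/*
 * URSRA: unsigned rounding shift right and accumulate. Unlike USRA, a
 * shift by the full element size is not a no-op: the rounding bit (the
 * old most significant bit) is still added in, so that case is handled
 * explicitly in each expander.
 */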
static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);

    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

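/*
 * SLI: shift left and insert. The shifted value replaces the high bits
 * of each destination element while the low SHIFT bits are preserved.
 */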
static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

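/*
 * MLA and MLS: multiply-accumulate and multiply-subtract. The 8-bit and
 * 16-bit scalar cases use the Neon helpers so that the partial products
 * stay confined to their own lanes within the 32-bit slice.
 */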
static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST : test is "if (X & Y != 0)". */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    /*
     * The choice of GE (signed) and GEU (unsigned) are biased toward
     * the instructions of the x86_64 host. For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
    } else {
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

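/*
 * USHL: the shift amount is signed and taken from the low byte of each
 * element of the shift operand; positive counts shift left, negative
 * counts shift right, and counts of the element size or more yield zero.
 */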
void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

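/*
 * For the vector expansion of SSHL, bound the right-shift count with umin
 * so that an out-of-range negative count becomes a shift by esize - 1
 * (all sign bits), zero out-of-range left-shift results with the cmp/andc
 * pair, and finally select between the left- and right-shift results on
 * the sign of the shift byte.
 */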
static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /* Bound rsh so out of bound right shift gets -1. */
    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, max);
    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, max);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift. */
    tcg_gen_andc_vec(vece, lval, lval, tmp);

    /* Select between left and right shift. */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
    } else {
        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
        gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
        gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

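/*
 * The saturating shifts must be able to set FPSR.QC, so they are always
 * expanded out of line with tcg_env passed to the helper.
 */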
void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
        gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
        gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
        gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
        gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
                        tcg_constant_i64(UINT64_MAX), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fni8 = gen_uqadd_d,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_add_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_andc_i64(t1, t2, t1);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fni8 = gen_sqadd_d,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b,
                        tcg_constant_i64(0), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fni8 = gen_uqsub_d,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_sub_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_and_i64(t1, t1, t2);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_sssub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fni8 = gen_sqsub_d,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

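/*
 * Pairwise operations (ADDP, SMAXP, SMINP, UMAXP, UMINP) combine adjacent
 * elements from the two inputs, so there is no simple per-element TCG
 * expansion; they always use the out-of-line helpers.
 */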
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

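/*
 * SHADD: signed halving add, d = (a + b) >> 1 computed without widening:
 * the expanders above shift each operand right by one and then add back
 * the carry out of the two low bits, (a & b) & 1.
 */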
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
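
/*
 * Halving subtract: for both signed and unsigned elements,
 *     (a - b) >> 1  ==  (a >> 1) - (b >> 1) - (~a & b & 1)
 * The borrow term applies exactly when the low bit of b is set while the
 * low bit of a is clear, which is what the ANDC below extracts.
 */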
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
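
/*
 * Rounding halving add: for both signed and unsigned elements,
 *     (a + b + 1) >> 1  ==  (a >> 1) + (b >> 1) + ((a | b) & 1)
 * i.e. the rounding increment folds into the correction term whenever
 * either low bit is set, which is what the OR below extracts.
 */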
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
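
/*
 * Usage sketch (illustrative only, hypothetical offsets): a translator
 * invokes one of the expanders above with CPUARMState vector register
 * offsets and byte sizes taken from the decoded instruction, e.g. for a
 * full 128-bit unsigned rounding halving add on 16-bit elements:
 *
 *     gen_gvec_urhadd(MO_16, dofs, nofs, mofs, 16, 16);
 *
 * where dofs/nofs/mofs are register offsets and the final two arguments
 * are the operation size and maximum vector size in bytes.
 */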