/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"


static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

#define GEN_CMP0(NAME, COND)                                \
    void NAME(unsigned vece, uint32_t d, uint32_t m,        \
              uint32_t opr_sz, uint32_t max_sz)             \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

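/*
 * Shift right and accumulate (SSRA/USRA): d[i] += a[i] >> shift for each
 * element.  The 8- and 16-bit _i64 expanders below operate on all of the
 * elements packed into one 64-bit value, using the tcg_gen_vec_* helpers.
 */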
static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
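/*
 * For example, rounding shift right by 2 of the value 7 computes
 * t = (7 >> 1) & 1 = 1 and d = (7 >> 2) + 1 = 2, which equals the
 * rounded result (7 + (1 << 1)) >> 2.  In general,
 * (a >> sh) + ((a >> (sh - 1)) & 1) == (a + (1 << (sh - 1))) >> sh.
 */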
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits.  With rounding, this produces
         *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.  With rounding, this produces
     *   (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
     * I.e. always zero.  With accumulation, this leaves D unchanged.
     */
    if (shift == (8 << vece)) {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Unsigned results in zero.  With rounding, this produces a
         * copy of the most significant bit.
         */
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

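/*
 * Shift right and insert (SRI): each source element is shifted right by
 * the immediate and written into the low bits of the corresponding
 * destination element, while the top 'shift' bits of the destination
 * are preserved.
 */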
static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);

    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

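/*
 * Shift left and insert (SLI): the mirror image of SRI.  Each source
 * element is shifted left by the immediate and written into the high
 * bits of the destination element, while the low 'shift' bits of the
 * destination are preserved.
 */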
static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

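/*
 * Multiply-accumulate (MLA) and multiply-subtract (MLS):
 * d[i] += a[i] * b[i] and d[i] -= a[i] * b[i].  The scalar expanders
 * below clobber 'a' with the product before folding it into 'd'.
 */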
static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST : test is "if (X & Y != 0)". */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

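/*
 * Variable vector shifts (USHL/SSHL): the shift count is the signed low
 * byte of each element of the shift operand.  Positive counts shift
 * left, negative counts shift right; counts whose magnitude is greater
 * than or equal to the element size give zero, except that an
 * out-of-range right shift of a signed element gives all sign bits.
 */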
void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    /*
     * The choice of GE (signed) and GEU (unsigned) are biased toward
     * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
    } else {
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

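/*
 * For the signed variant the right-shift count is clamped to esize - 1
 * before the arithmetic shift, so an over-large negative count still
 * produces all sign bits rather than an unspecified value.
 */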
void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /* Bound rsh so out of bound right shift gets -1. */
    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, max);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift. */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);

    /* Select between left and right shift. */
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
    } else {
        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
        gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
        gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
        gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
        gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
        gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
        gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

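/*
 * Saturating arithmetic updating the sticky QC flag: each expander
 * computes both the wrapped and the saturated result, XORs them, and
 * ORs any difference into 'qc', so QC becomes non-zero iff saturation
 * occurred in at least one element.
 */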
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
                        tcg_constant_i64(UINT64_MAX), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fni8 = gen_uqadd_d,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_add_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_andc_i64(t1, t2, t1);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fni8 = gen_sqadd_d,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fni8 = gen_uqsub_d,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_sub_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_and_i64(t1, t1, t2);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_sssub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fni8 = gen_sqsub_d,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

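/*
 * Absolute difference (SABD/UABD): d[i] = |a[i] - b[i]|, computed as
 * max(a, b) - min(a, b) in the vector path, or by selecting between
 * a - b and b - a in the scalar path.
 */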
static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

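/*
 * Absolute difference and accumulate (SABA/UABA): d[i] += |a[i] - b[i]|,
 * reusing the absolute-difference expanders above.
 */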
static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

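/*
 * Halving add (SHADD/UHADD): (a + b) >> 1 per element, without losing
 * the carry out of the intermediate sum.  This is computed as
 * (a >> 1) + (b >> 1) + (a & b & 1): the carry contributed by the two
 * dropped low bits is exactly a & b & 1.
 */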
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}
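
/*
 * Halving add: SHADD/UHADD compute (a + b) >> 1 without a wider
 * intermediate, using the identity
 *     (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1)
 * since a carry out of the low bits occurs only when both low bits are
 * set.  The signed forms use arithmetic shifts, the unsigned forms
 * logical shifts; the rest of the expansion is identical.
 */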
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
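
/*
 * Halving subtract: SHSUB/UHSUB compute (a - b) >> 1 using
 *     (a - b) >> 1 == (a >> 1) - (b >> 1) - (~a & b & 1)
 * A borrow out of the low bits occurs only when a's low bit is clear
 * and b's low bit is set, hence the andc of the inputs.
 */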
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
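
/*
 * Rounding halving add: SRHADD/URHADD compute (a + b + 1) >> 1 using
 *     (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1)
 * The rounding increment survives the shift unless both low bits are
 * clear, hence the or of the inputs.
 */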
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}