1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the AArch64TargetLowering class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64ISelLowering.h" 14 #include "AArch64CallingConvention.h" 15 #include "AArch64ExpandImm.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PerfectShuffle.h" 18 #include "AArch64RegisterInfo.h" 19 #include "AArch64Subtarget.h" 20 #include "MCTargetDesc/AArch64AddressingModes.h" 21 #include "Utils/AArch64BaseInfo.h" 22 #include "llvm/ADT/APFloat.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ArrayRef.h" 25 #include "llvm/ADT/STLExtras.h" 26 #include "llvm/ADT/SmallSet.h" 27 #include "llvm/ADT/SmallVector.h" 28 #include "llvm/ADT/Statistic.h" 29 #include "llvm/ADT/StringRef.h" 30 #include "llvm/ADT/Twine.h" 31 #include "llvm/Analysis/LoopInfo.h" 32 #include "llvm/Analysis/MemoryLocation.h" 33 #include "llvm/Analysis/ObjCARCUtil.h" 34 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 35 #include "llvm/Analysis/TargetTransformInfo.h" 36 #include "llvm/Analysis/ValueTracking.h" 37 #include "llvm/Analysis/VectorUtils.h" 38 #include "llvm/CodeGen/Analysis.h" 39 #include "llvm/CodeGen/CallingConvLower.h" 40 #include "llvm/CodeGen/ComplexDeinterleavingPass.h" 41 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 42 #include "llvm/CodeGen/GlobalISel/Utils.h" 43 #include "llvm/CodeGen/ISDOpcodes.h" 44 #include "llvm/CodeGen/MachineBasicBlock.h" 45 #include "llvm/CodeGen/MachineFrameInfo.h" 46 #include "llvm/CodeGen/MachineFunction.h" 47 #include "llvm/CodeGen/MachineInstr.h" 48 #include 
"llvm/CodeGen/MachineInstrBuilder.h" 49 #include "llvm/CodeGen/MachineMemOperand.h" 50 #include "llvm/CodeGen/MachineRegisterInfo.h" 51 #include "llvm/CodeGen/MachineValueType.h" 52 #include "llvm/CodeGen/RuntimeLibcalls.h" 53 #include "llvm/CodeGen/SelectionDAG.h" 54 #include "llvm/CodeGen/SelectionDAGNodes.h" 55 #include "llvm/CodeGen/TargetCallingConv.h" 56 #include "llvm/CodeGen/TargetInstrInfo.h" 57 #include "llvm/CodeGen/TargetOpcodes.h" 58 #include "llvm/CodeGen/ValueTypes.h" 59 #include "llvm/IR/Attributes.h" 60 #include "llvm/IR/Constants.h" 61 #include "llvm/IR/DataLayout.h" 62 #include "llvm/IR/DebugLoc.h" 63 #include "llvm/IR/DerivedTypes.h" 64 #include "llvm/IR/Function.h" 65 #include "llvm/IR/GetElementPtrTypeIterator.h" 66 #include "llvm/IR/GlobalValue.h" 67 #include "llvm/IR/IRBuilder.h" 68 #include "llvm/IR/Instruction.h" 69 #include "llvm/IR/Instructions.h" 70 #include "llvm/IR/IntrinsicInst.h" 71 #include "llvm/IR/Intrinsics.h" 72 #include "llvm/IR/IntrinsicsAArch64.h" 73 #include "llvm/IR/Module.h" 74 #include "llvm/IR/PatternMatch.h" 75 #include "llvm/IR/Type.h" 76 #include "llvm/IR/Use.h" 77 #include "llvm/IR/Value.h" 78 #include "llvm/MC/MCRegisterInfo.h" 79 #include "llvm/Support/AtomicOrdering.h" 80 #include "llvm/Support/Casting.h" 81 #include "llvm/Support/CodeGen.h" 82 #include "llvm/Support/CommandLine.h" 83 #include "llvm/Support/Debug.h" 84 #include "llvm/Support/ErrorHandling.h" 85 #include "llvm/Support/InstructionCost.h" 86 #include "llvm/Support/KnownBits.h" 87 #include "llvm/Support/MathExtras.h" 88 #include "llvm/Support/raw_ostream.h" 89 #include "llvm/Target/TargetMachine.h" 90 #include "llvm/Target/TargetOptions.h" 91 #include "llvm/TargetParser/Triple.h" 92 #include <algorithm> 93 #include <bitset> 94 #include <cassert> 95 #include <cctype> 96 #include <cstdint> 97 #include <cstdlib> 98 #include <iterator> 99 #include <limits> 100 #include <optional> 101 #include <tuple> 102 #include <utility> 103 #include <vector> 104 105 
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

// Pass statistics (visible with -stats).
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
// NOTE(review): deliberately not `static` — presumably referenced from
// another AArch64 backend file; confirm before changing linkage.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

// Enables rewriting of logical-immediate materialization (see
// "Number of times immediates were optimized" statistic above).
static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in future when both implementations will be based off MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

// Allows lowering of ext/trunc sequences to the TBL instruction.
static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
                                    cl::desc("Combine ext and trunc to TBL"),
                                    cl::init(true));

// All of the XOR, OR and CMP use ALU ports, and data dependency will become
// the bottleneck after this transform on high end CPUs. This maximum leaf-node
// limit guards that the cmp+ccmp transform remains profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));

/// Value type used for condition codes.
149 static const MVT MVT_CC = MVT::i32; 150 151 static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, 152 AArch64::X3, AArch64::X4, AArch64::X5, 153 AArch64::X6, AArch64::X7}; 154 static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, 155 AArch64::Q3, AArch64::Q4, AArch64::Q5, 156 AArch64::Q6, AArch64::Q7}; 157 158 ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; } 159 160 ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; } 161 162 static inline EVT getPackedSVEVectorVT(EVT VT) { 163 switch (VT.getSimpleVT().SimpleTy) { 164 default: 165 llvm_unreachable("unexpected element type for vector"); 166 case MVT::i8: 167 return MVT::nxv16i8; 168 case MVT::i16: 169 return MVT::nxv8i16; 170 case MVT::i32: 171 return MVT::nxv4i32; 172 case MVT::i64: 173 return MVT::nxv2i64; 174 case MVT::f16: 175 return MVT::nxv8f16; 176 case MVT::f32: 177 return MVT::nxv4f32; 178 case MVT::f64: 179 return MVT::nxv2f64; 180 case MVT::bf16: 181 return MVT::nxv8bf16; 182 } 183 } 184 185 // NOTE: Currently there's only a need to return integer vector types. If this 186 // changes then just add an extra "type" parameter. 
187 static inline EVT getPackedSVEVectorVT(ElementCount EC) { 188 switch (EC.getKnownMinValue()) { 189 default: 190 llvm_unreachable("unexpected element count for vector"); 191 case 16: 192 return MVT::nxv16i8; 193 case 8: 194 return MVT::nxv8i16; 195 case 4: 196 return MVT::nxv4i32; 197 case 2: 198 return MVT::nxv2i64; 199 } 200 } 201 202 static inline EVT getPromotedVTForPredicate(EVT VT) { 203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && 204 "Expected scalable predicate vector type!"); 205 switch (VT.getVectorMinNumElements()) { 206 default: 207 llvm_unreachable("unexpected element count for vector"); 208 case 2: 209 return MVT::nxv2i64; 210 case 4: 211 return MVT::nxv4i32; 212 case 8: 213 return MVT::nxv8i16; 214 case 16: 215 return MVT::nxv16i8; 216 } 217 } 218 219 /// Returns true if VT's elements occupy the lowest bit positions of its 220 /// associated register class without any intervening space. 221 /// 222 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the 223 /// same register class, but only nxv8f16 can be treated as a packed vector. 224 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { 225 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && 226 "Expected legal vector type!"); 227 return VT.isFixedLengthVector() || 228 VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock; 229 } 230 231 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading 232 // predicate and end with a passthru value matching the result type. 
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  // Bit/byte permutation and reversal ops.
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  // Integer unary ops.
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  // FP rounding ops.
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  // FP/int conversions.
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  // FP unary ops.
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    // We guarantee i1 splat_vectors to zero the other lanes by
    // implementing it with ptrue and possibly a punpklo for nxv1i1.
    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
      return true;
    return false;
  // PTRUE and zeroing-merge compares produce zeroed inactive lanes directly.
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    // SVE predicate-producing intrinsics whose results have their inactive
    // lanes zeroed (compares, while-loops, match, ptrue/pnext).
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}

/// Constructor: registers the legal register classes for the subtarget's
/// features and configures per-operation lowering actions.
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
366 addDRTypeForNEON(MVT::v2f32); 367 addDRTypeForNEON(MVT::v8i8); 368 addDRTypeForNEON(MVT::v4i16); 369 addDRTypeForNEON(MVT::v2i32); 370 addDRTypeForNEON(MVT::v1i64); 371 addDRTypeForNEON(MVT::v1f64); 372 addDRTypeForNEON(MVT::v4f16); 373 if (Subtarget->hasBF16()) 374 addDRTypeForNEON(MVT::v4bf16); 375 376 addQRTypeForNEON(MVT::v4f32); 377 addQRTypeForNEON(MVT::v2f64); 378 addQRTypeForNEON(MVT::v16i8); 379 addQRTypeForNEON(MVT::v8i16); 380 addQRTypeForNEON(MVT::v4i32); 381 addQRTypeForNEON(MVT::v2i64); 382 addQRTypeForNEON(MVT::v8f16); 383 if (Subtarget->hasBF16()) 384 addQRTypeForNEON(MVT::v8bf16); 385 } 386 387 if (Subtarget->hasSVEorSME()) { 388 // Add legal sve predicate types 389 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass); 390 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); 391 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); 392 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); 393 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); 394 395 // Add legal sve data types 396 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); 397 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); 398 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); 399 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); 400 401 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); 402 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); 403 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); 404 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); 405 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); 406 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); 407 408 if (Subtarget->hasBF16()) { 409 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass); 410 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass); 411 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); 412 } 413 414 if (Subtarget->useSVEForFixedLengthVectors()) { 415 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 416 if (useSVEForFixedLengthVectorVT(VT)) 417 
addRegisterClass(VT, &AArch64::ZPRRegClass); 418 419 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) 420 if (useSVEForFixedLengthVectorVT(VT)) 421 addRegisterClass(VT, &AArch64::ZPRRegClass); 422 } 423 } 424 425 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { 426 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass); 427 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1); 428 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1); 429 430 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom); 431 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand); 432 } 433 434 // Compute derived properties from the register classes 435 computeRegisterProperties(Subtarget->getRegisterInfo()); 436 437 // Provide all sorts of operation actions 438 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 439 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 440 setOperationAction(ISD::SETCC, MVT::i32, Custom); 441 setOperationAction(ISD::SETCC, MVT::i64, Custom); 442 setOperationAction(ISD::SETCC, MVT::f16, Custom); 443 setOperationAction(ISD::SETCC, MVT::f32, Custom); 444 setOperationAction(ISD::SETCC, MVT::f64, Custom); 445 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 446 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 447 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 448 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 449 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 450 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 451 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 452 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 453 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 454 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 455 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 456 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 457 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 458 
setOperationAction(ISD::BR_CC, MVT::f64, Custom); 459 setOperationAction(ISD::SELECT, MVT::i32, Custom); 460 setOperationAction(ISD::SELECT, MVT::i64, Custom); 461 setOperationAction(ISD::SELECT, MVT::f16, Custom); 462 setOperationAction(ISD::SELECT, MVT::bf16, Custom); 463 setOperationAction(ISD::SELECT, MVT::f32, Custom); 464 setOperationAction(ISD::SELECT, MVT::f64, Custom); 465 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 466 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 467 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 468 setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand); 469 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 470 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 471 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 472 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 473 setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom); 474 475 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 476 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 477 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 478 479 setOperationAction(ISD::FREM, MVT::f32, Expand); 480 setOperationAction(ISD::FREM, MVT::f64, Expand); 481 setOperationAction(ISD::FREM, MVT::f80, Expand); 482 483 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); 484 485 // Custom lowering hooks are needed for XOR 486 // to fold it into CSINC/CSINV. 487 setOperationAction(ISD::XOR, MVT::i32, Custom); 488 setOperationAction(ISD::XOR, MVT::i64, Custom); 489 490 // Virtually no operation on f128 is legal, but LLVM can't expand them when 491 // there's a valid register class, so we need custom operations in most cases. 
492 setOperationAction(ISD::FABS, MVT::f128, Expand); 493 setOperationAction(ISD::FADD, MVT::f128, LibCall); 494 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 495 setOperationAction(ISD::FCOS, MVT::f128, Expand); 496 setOperationAction(ISD::FDIV, MVT::f128, LibCall); 497 setOperationAction(ISD::FMA, MVT::f128, Expand); 498 setOperationAction(ISD::FMUL, MVT::f128, LibCall); 499 setOperationAction(ISD::FNEG, MVT::f128, Expand); 500 setOperationAction(ISD::FPOW, MVT::f128, Expand); 501 setOperationAction(ISD::FREM, MVT::f128, Expand); 502 setOperationAction(ISD::FRINT, MVT::f128, Expand); 503 setOperationAction(ISD::FSIN, MVT::f128, Expand); 504 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 505 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 506 setOperationAction(ISD::FSUB, MVT::f128, LibCall); 507 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 508 setOperationAction(ISD::SETCC, MVT::f128, Custom); 509 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); 510 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom); 511 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 512 setOperationAction(ISD::SELECT, MVT::f128, Custom); 513 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 514 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 515 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently 516 // aren't handled. 517 518 // Lowering for many of the conversions is actually specified by the non-f128 519 // type. The LowerXXX function will be trivial when f128 isn't involved. 
520 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 521 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 522 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 523 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 524 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); 525 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); 526 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 527 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 528 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 529 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 530 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); 531 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); 532 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 533 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 534 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 535 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); 536 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); 537 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); 538 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 539 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 540 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 541 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); 542 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); 543 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); 544 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 545 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 546 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 547 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 548 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 549 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); 550 551 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom); 552 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, 
Custom); 553 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom); 554 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); 555 556 // Variable arguments. 557 setOperationAction(ISD::VASTART, MVT::Other, Custom); 558 setOperationAction(ISD::VAARG, MVT::Other, Custom); 559 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 560 setOperationAction(ISD::VAEND, MVT::Other, Expand); 561 562 // Variable-sized objects. 563 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 564 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 565 566 // Lowering Funnel Shifts to EXTR 567 setOperationAction(ISD::FSHR, MVT::i32, Custom); 568 setOperationAction(ISD::FSHR, MVT::i64, Custom); 569 setOperationAction(ISD::FSHL, MVT::i32, Custom); 570 setOperationAction(ISD::FSHL, MVT::i64, Custom); 571 572 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 573 574 // Constant pool entries 575 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 576 577 // BlockAddress 578 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 579 580 // AArch64 lacks both left-rotate and popcount instructions. 581 setOperationAction(ISD::ROTL, MVT::i32, Expand); 582 setOperationAction(ISD::ROTL, MVT::i64, Expand); 583 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 584 setOperationAction(ISD::ROTL, VT, Expand); 585 setOperationAction(ISD::ROTR, VT, Expand); 586 } 587 588 // AArch64 doesn't have i32 MULH{S|U}. 589 setOperationAction(ISD::MULHU, MVT::i32, Expand); 590 setOperationAction(ISD::MULHS, MVT::i32, Expand); 591 592 // AArch64 doesn't have {U|S}MUL_LOHI. 
593 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 594 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 595 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 596 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 597 598 if (Subtarget->hasCSSC()) { 599 setOperationAction(ISD::CTPOP, MVT::i32, Legal); 600 setOperationAction(ISD::CTPOP, MVT::i64, Legal); 601 setOperationAction(ISD::CTPOP, MVT::i128, Expand); 602 603 setOperationAction(ISD::PARITY, MVT::i128, Expand); 604 605 setOperationAction(ISD::CTTZ, MVT::i32, Legal); 606 setOperationAction(ISD::CTTZ, MVT::i64, Legal); 607 setOperationAction(ISD::CTTZ, MVT::i128, Expand); 608 609 setOperationAction(ISD::ABS, MVT::i32, Legal); 610 setOperationAction(ISD::ABS, MVT::i64, Legal); 611 612 setOperationAction(ISD::SMAX, MVT::i32, Legal); 613 setOperationAction(ISD::SMAX, MVT::i64, Legal); 614 setOperationAction(ISD::UMAX, MVT::i32, Legal); 615 setOperationAction(ISD::UMAX, MVT::i64, Legal); 616 617 setOperationAction(ISD::SMIN, MVT::i32, Legal); 618 setOperationAction(ISD::SMIN, MVT::i64, Legal); 619 setOperationAction(ISD::UMIN, MVT::i32, Legal); 620 setOperationAction(ISD::UMIN, MVT::i64, Legal); 621 } else { 622 setOperationAction(ISD::CTPOP, MVT::i32, Custom); 623 setOperationAction(ISD::CTPOP, MVT::i64, Custom); 624 setOperationAction(ISD::CTPOP, MVT::i128, Custom); 625 626 setOperationAction(ISD::PARITY, MVT::i64, Custom); 627 setOperationAction(ISD::PARITY, MVT::i128, Custom); 628 629 setOperationAction(ISD::ABS, MVT::i32, Custom); 630 setOperationAction(ISD::ABS, MVT::i64, Custom); 631 } 632 633 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 634 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 635 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 636 setOperationAction(ISD::SDIVREM, VT, Expand); 637 setOperationAction(ISD::UDIVREM, VT, Expand); 638 } 639 setOperationAction(ISD::SREM, MVT::i32, Expand); 640 setOperationAction(ISD::SREM, MVT::i64, Expand); 641 
setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 642 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 643 setOperationAction(ISD::UREM, MVT::i32, Expand); 644 setOperationAction(ISD::UREM, MVT::i64, Expand); 645 646 // Custom lower Add/Sub/Mul with overflow. 647 setOperationAction(ISD::SADDO, MVT::i32, Custom); 648 setOperationAction(ISD::SADDO, MVT::i64, Custom); 649 setOperationAction(ISD::UADDO, MVT::i32, Custom); 650 setOperationAction(ISD::UADDO, MVT::i64, Custom); 651 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 652 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 653 setOperationAction(ISD::USUBO, MVT::i32, Custom); 654 setOperationAction(ISD::USUBO, MVT::i64, Custom); 655 setOperationAction(ISD::SMULO, MVT::i32, Custom); 656 setOperationAction(ISD::SMULO, MVT::i64, Custom); 657 setOperationAction(ISD::UMULO, MVT::i32, Custom); 658 setOperationAction(ISD::UMULO, MVT::i64, Custom); 659 660 setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom); 661 setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom); 662 setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom); 663 setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom); 664 setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom); 665 setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom); 666 setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom); 667 setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom); 668 669 setOperationAction(ISD::FSIN, MVT::f32, Expand); 670 setOperationAction(ISD::FSIN, MVT::f64, Expand); 671 setOperationAction(ISD::FCOS, MVT::f32, Expand); 672 setOperationAction(ISD::FCOS, MVT::f64, Expand); 673 setOperationAction(ISD::FPOW, MVT::f32, Expand); 674 setOperationAction(ISD::FPOW, MVT::f64, Expand); 675 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 676 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 677 if (Subtarget->hasFullFP16()) 678 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); 679 else 680 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); 681 
682 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, 683 ISD::FCOS, ISD::FSIN, ISD::FSINCOS, 684 ISD::FEXP, ISD::FEXP2, ISD::FEXP10, 685 ISD::FLOG, ISD::FLOG2, ISD::FLOG10, 686 ISD::STRICT_FREM, 687 ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS, 688 ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2, 689 ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) { 690 setOperationAction(Op, MVT::f16, Promote); 691 setOperationAction(Op, MVT::v4f16, Expand); 692 setOperationAction(Op, MVT::v8f16, Expand); 693 } 694 695 if (!Subtarget->hasFullFP16()) { 696 for (auto Op : 697 {ISD::SETCC, ISD::SELECT_CC, 698 ISD::BR_CC, ISD::FADD, ISD::FSUB, 699 ISD::FMUL, ISD::FDIV, ISD::FMA, 700 ISD::FNEG, ISD::FABS, ISD::FCEIL, 701 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, 702 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, 703 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, 704 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, 705 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, 706 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, 707 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, 708 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, 709 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, 710 ISD::STRICT_FMAXIMUM}) 711 setOperationAction(Op, MVT::f16, Promote); 712 713 // Round-to-integer need custom lowering for fp16, as Promote doesn't work 714 // because the result type is integer. 715 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT, 716 ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, 717 ISD::STRICT_LLRINT}) 718 setOperationAction(Op, MVT::f16, Custom); 719 720 // promote v4f16 to v4f32 when that is known to be safe. 
721 setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); 722 setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); 723 setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); 724 setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); 725 726 setOperationAction(ISD::FABS, MVT::v4f16, Expand); 727 setOperationAction(ISD::FNEG, MVT::v4f16, Expand); 728 setOperationAction(ISD::FROUND, MVT::v4f16, Expand); 729 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand); 730 setOperationAction(ISD::FMA, MVT::v4f16, Expand); 731 setOperationAction(ISD::SETCC, MVT::v4f16, Custom); 732 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); 733 setOperationAction(ISD::SELECT, MVT::v4f16, Expand); 734 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); 735 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); 736 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); 737 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); 738 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); 739 setOperationAction(ISD::FRINT, MVT::v4f16, Expand); 740 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); 741 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); 742 743 setOperationAction(ISD::FABS, MVT::v8f16, Expand); 744 setOperationAction(ISD::FADD, MVT::v8f16, Expand); 745 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); 746 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); 747 setOperationAction(ISD::FDIV, MVT::v8f16, Expand); 748 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); 749 setOperationAction(ISD::FMA, MVT::v8f16, Expand); 750 setOperationAction(ISD::FMUL, MVT::v8f16, Expand); 751 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); 752 setOperationAction(ISD::FNEG, MVT::v8f16, Expand); 753 setOperationAction(ISD::FROUND, MVT::v8f16, Expand); 754 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand); 755 setOperationAction(ISD::FRINT, MVT::v8f16, Expand); 756 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); 757 
setOperationAction(ISD::FSUB, MVT::v8f16, Expand); 758 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); 759 setOperationAction(ISD::SETCC, MVT::v8f16, Expand); 760 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); 761 setOperationAction(ISD::SELECT, MVT::v8f16, Expand); 762 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); 763 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); 764 } 765 766 // AArch64 has implementations of a lot of rounding-like FP operations. 767 for (auto Op : 768 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, 769 ISD::FRINT, ISD::FTRUNC, ISD::FROUND, 770 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, 771 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, 772 ISD::LLROUND, ISD::LRINT, ISD::LLRINT, 773 ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, 774 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, 775 ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, 776 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, 777 ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { 778 for (MVT Ty : {MVT::f32, MVT::f64}) 779 setOperationAction(Op, Ty, Legal); 780 if (Subtarget->hasFullFP16()) 781 setOperationAction(Op, MVT::f16, Legal); 782 } 783 784 // Basic strict FP operations are legal 785 for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, 786 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { 787 for (MVT Ty : {MVT::f32, MVT::f64}) 788 setOperationAction(Op, Ty, Legal); 789 if (Subtarget->hasFullFP16()) 790 setOperationAction(Op, MVT::f16, Legal); 791 } 792 793 // Strict conversion to a larger type is legal 794 for (auto VT : {MVT::f32, MVT::f64}) 795 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); 796 797 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 798 799 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); 800 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); 801 802 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 803 if 
(!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) { 804 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall); 805 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall); 806 } else { 807 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 808 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); 809 } 810 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); 811 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 812 813 // Generate outline atomics library calls only if LSE was not specified for 814 // subtarget 815 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { 816 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall); 817 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall); 818 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall); 819 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall); 820 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall); 821 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall); 822 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall); 823 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall); 824 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall); 825 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall); 826 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall); 827 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall); 828 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall); 829 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall); 830 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall); 831 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall); 832 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall); 833 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall); 834 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall); 835 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall); 836 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall); 837 
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall); 838 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); 839 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); 840 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); 841 #define LCALLNAMES(A, B, N) \ 842 setLibcallName(A##N##_RELAX, #B #N "_relax"); \ 843 setLibcallName(A##N##_ACQ, #B #N "_acq"); \ 844 setLibcallName(A##N##_REL, #B #N "_rel"); \ 845 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); 846 #define LCALLNAME4(A, B) \ 847 LCALLNAMES(A, B, 1) \ 848 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) 849 #define LCALLNAME5(A, B) \ 850 LCALLNAMES(A, B, 1) \ 851 LCALLNAMES(A, B, 2) \ 852 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) 853 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) 854 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) 855 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) 856 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) 857 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) 858 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) 859 #undef LCALLNAMES 860 #undef LCALLNAME4 861 #undef LCALLNAME5 862 } 863 864 if (Subtarget->hasLSE128()) { 865 // Custom lowering because i128 is not legal. Must be replaced by 2x64 866 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP. 867 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom); 868 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom); 869 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom); 870 } 871 872 // 128-bit loads and stores can be done without expanding 873 setOperationAction(ISD::LOAD, MVT::i128, Custom); 874 setOperationAction(ISD::STORE, MVT::i128, Custom); 875 876 // Aligned 128-bit loads and stores are single-copy atomic according to the 877 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2. 
878 if (Subtarget->hasLSE2()) { 879 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); 880 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); 881 } 882 883 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the 884 // custom lowering, as there are no un-paired non-temporal stores and 885 // legalization will break up 256 bit inputs. 886 setOperationAction(ISD::STORE, MVT::v32i8, Custom); 887 setOperationAction(ISD::STORE, MVT::v16i16, Custom); 888 setOperationAction(ISD::STORE, MVT::v16f16, Custom); 889 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 890 setOperationAction(ISD::STORE, MVT::v8f32, Custom); 891 setOperationAction(ISD::STORE, MVT::v4f64, Custom); 892 setOperationAction(ISD::STORE, MVT::v4i64, Custom); 893 894 // 256 bit non-temporal loads can be lowered to LDNP. This is done using 895 // custom lowering, as there are no un-paired non-temporal loads legalization 896 // will break up 256 bit inputs. 897 setOperationAction(ISD::LOAD, MVT::v32i8, Custom); 898 setOperationAction(ISD::LOAD, MVT::v16i16, Custom); 899 setOperationAction(ISD::LOAD, MVT::v16f16, Custom); 900 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 901 setOperationAction(ISD::LOAD, MVT::v8f32, Custom); 902 setOperationAction(ISD::LOAD, MVT::v4f64, Custom); 903 setOperationAction(ISD::LOAD, MVT::v4i64, Custom); 904 905 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. 906 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 907 908 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 909 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 910 // Issue __sincos_stret if available. 
911 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 912 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 913 } else { 914 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 915 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 916 } 917 918 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 919 // MSVCRT doesn't have powi; fall back to pow 920 setLibcallName(RTLIB::POWI_F32, nullptr); 921 setLibcallName(RTLIB::POWI_F64, nullptr); 922 } 923 924 // Make floating-point constants legal for the large code model, so they don't 925 // become loads from the constant pool. 926 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { 927 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 928 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 929 } 930 931 // AArch64 does not have floating-point extending loads, i1 sign-extending 932 // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 933 for (MVT VT : MVT::fp_valuetypes()) { 934 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 935 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 936 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); 937 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); 938 } 939 for (MVT VT : MVT::integer_valuetypes()) 940 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); 941 942 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 943 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 944 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 945 setTruncStoreAction(MVT::f128, MVT::f80, Expand); 946 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 947 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 948 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 949 950 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 951 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 952 setOperationAction(ISD::BITCAST, MVT::bf16, Custom); 953 954 // Indexed loads and stores are supported. 
955 for (unsigned im = (unsigned)ISD::PRE_INC; 956 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 957 setIndexedLoadAction(im, MVT::i8, Legal); 958 setIndexedLoadAction(im, MVT::i16, Legal); 959 setIndexedLoadAction(im, MVT::i32, Legal); 960 setIndexedLoadAction(im, MVT::i64, Legal); 961 setIndexedLoadAction(im, MVT::f64, Legal); 962 setIndexedLoadAction(im, MVT::f32, Legal); 963 setIndexedLoadAction(im, MVT::f16, Legal); 964 setIndexedLoadAction(im, MVT::bf16, Legal); 965 setIndexedStoreAction(im, MVT::i8, Legal); 966 setIndexedStoreAction(im, MVT::i16, Legal); 967 setIndexedStoreAction(im, MVT::i32, Legal); 968 setIndexedStoreAction(im, MVT::i64, Legal); 969 setIndexedStoreAction(im, MVT::f64, Legal); 970 setIndexedStoreAction(im, MVT::f32, Legal); 971 setIndexedStoreAction(im, MVT::f16, Legal); 972 setIndexedStoreAction(im, MVT::bf16, Legal); 973 } 974 975 // Trap. 976 setOperationAction(ISD::TRAP, MVT::Other, Legal); 977 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 978 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); 979 980 // We combine OR nodes for bitfield operations. 981 setTargetDAGCombine(ISD::OR); 982 // Try to create BICs for vector ANDs. 983 setTargetDAGCombine(ISD::AND); 984 985 // Vector add and sub nodes may conceal a high-half opportunity. 986 // Also, try to fold ADD into CSINC/CSINV.. 
987 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP, 988 ISD::UINT_TO_FP}); 989 990 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, 991 ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV}); 992 993 // Try and combine setcc with csel 994 setTargetDAGCombine(ISD::SETCC); 995 996 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 997 998 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND, 999 ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG, 1000 ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, 1001 ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR}); 1002 setTargetDAGCombine(ISD::TRUNCATE); 1003 setTargetDAGCombine(ISD::LOAD); 1004 1005 setTargetDAGCombine(ISD::MSTORE); 1006 1007 setTargetDAGCombine(ISD::MUL); 1008 1009 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); 1010 1011 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, 1012 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, 1013 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR}); 1014 1015 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER}); 1016 1017 setTargetDAGCombine(ISD::FP_EXTEND); 1018 1019 setTargetDAGCombine(ISD::GlobalAddress); 1020 1021 setTargetDAGCombine(ISD::CTLZ); 1022 1023 setTargetDAGCombine(ISD::VECREDUCE_AND); 1024 setTargetDAGCombine(ISD::VECREDUCE_OR); 1025 setTargetDAGCombine(ISD::VECREDUCE_XOR); 1026 1027 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); 1028 1029 // In case of strict alignment, avoid an excessive number of byte wide stores. 1030 MaxStoresPerMemsetOptSize = 8; 1031 MaxStoresPerMemset = 1032 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; 1033 1034 MaxGluedStoresPerMemcpy = 4; 1035 MaxStoresPerMemcpyOptSize = 4; 1036 MaxStoresPerMemcpy = 1037 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; 1038 1039 MaxStoresPerMemmoveOptSize = 4; 1040 MaxStoresPerMemmove = 4; 1041 1042 MaxLoadsPerMemcmpOptSize = 4; 1043 MaxLoadsPerMemcmp = 1044 Subtarget->requiresStrictAlign() ? 
MaxLoadsPerMemcmpOptSize : 8; 1045 1046 setStackPointerRegisterToSaveRestore(AArch64::SP); 1047 1048 setSchedulingPreference(Sched::Hybrid); 1049 1050 EnableExtLdPromotion = true; 1051 1052 // Set required alignment. 1053 setMinFunctionAlignment(Align(4)); 1054 // Set preferred alignments. 1055 1056 // Don't align loops on Windows. The SEH unwind info generation needs to 1057 // know the exact length of functions before the alignments have been 1058 // expanded. 1059 if (!Subtarget->isTargetWindows()) 1060 setPrefLoopAlignment(STI.getPrefLoopAlignment()); 1061 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment()); 1062 setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); 1063 1064 // Only change the limit for entries in a jump table if specified by 1065 // the sub target, but not at the command line. 1066 unsigned MaxJT = STI.getMaximumJumpTableSize(); 1067 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) 1068 setMaximumJumpTableSize(MaxJT); 1069 1070 setHasExtractBitsInsn(true); 1071 1072 setMaxDivRemBitWidthSupported(128); 1073 1074 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1075 1076 if (Subtarget->hasNEON()) { 1077 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to 1078 // silliness like this: 1079 for (auto Op : 1080 {ISD::SELECT, ISD::SELECT_CC, 1081 ISD::BR_CC, ISD::FADD, ISD::FSUB, 1082 ISD::FMUL, ISD::FDIV, ISD::FMA, 1083 ISD::FNEG, ISD::FABS, ISD::FCEIL, 1084 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, 1085 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, 1086 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, 1087 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, 1088 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, 1089 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, 1090 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, 1091 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, 1092 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, 1093 ISD::STRICT_FMAXIMUM}) 1094 
setOperationAction(Op, MVT::v1f64, Expand); 1095 1096 for (auto Op : 1097 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, 1098 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, 1099 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, 1100 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) 1101 setOperationAction(Op, MVT::v1i64, Expand); 1102 1103 // AArch64 doesn't have a direct vector ->f32 conversion instructions for 1104 // elements smaller than i32, so promote the input to i32 first. 1105 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); 1106 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); 1107 1108 // Similarly, there is no direct i32 -> f64 vector conversion instruction. 1109 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the 1110 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 1111 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, 1112 ISD::STRICT_UINT_TO_FP}) 1113 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) 1114 setOperationAction(Op, VT, Custom); 1115 1116 if (Subtarget->hasFullFP16()) { 1117 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 1118 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 1119 1120 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); 1121 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); 1122 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom); 1123 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); 1124 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 1125 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 1126 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 1127 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 1128 } else { 1129 // when AArch64 doesn't have fullfp16 support, promote the input 1130 // to i32 first. 
1131 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); 1132 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); 1133 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32); 1134 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32); 1135 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); 1136 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); 1137 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); 1138 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); 1139 } 1140 1141 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 1142 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 1143 setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal); 1144 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal); 1145 setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom); 1146 setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom); 1147 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); 1148 setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom); 1149 for (auto VT : {MVT::v1i64, MVT::v2i64}) { 1150 setOperationAction(ISD::UMAX, VT, Custom); 1151 setOperationAction(ISD::SMAX, VT, Custom); 1152 setOperationAction(ISD::UMIN, VT, Custom); 1153 setOperationAction(ISD::SMIN, VT, Custom); 1154 } 1155 1156 // Custom handling for some quad-vector types to detect MULL. 
1157 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 1158 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 1159 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 1160 setOperationAction(ISD::MUL, MVT::v4i16, Custom); 1161 setOperationAction(ISD::MUL, MVT::v2i32, Custom); 1162 setOperationAction(ISD::MUL, MVT::v1i64, Custom); 1163 1164 // Saturates 1165 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, 1166 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { 1167 setOperationAction(ISD::SADDSAT, VT, Legal); 1168 setOperationAction(ISD::UADDSAT, VT, Legal); 1169 setOperationAction(ISD::SSUBSAT, VT, Legal); 1170 setOperationAction(ISD::USUBSAT, VT, Legal); 1171 } 1172 1173 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, 1174 MVT::v4i32}) { 1175 setOperationAction(ISD::AVGFLOORS, VT, Legal); 1176 setOperationAction(ISD::AVGFLOORU, VT, Legal); 1177 setOperationAction(ISD::AVGCEILS, VT, Legal); 1178 setOperationAction(ISD::AVGCEILU, VT, Legal); 1179 setOperationAction(ISD::ABDS, VT, Legal); 1180 setOperationAction(ISD::ABDU, VT, Legal); 1181 } 1182 1183 // Vector reductions 1184 for (MVT VT : { MVT::v4f16, MVT::v2f32, 1185 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { 1186 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { 1187 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal); 1188 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal); 1189 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal); 1190 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal); 1191 1192 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); 1193 } 1194 } 1195 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, 1196 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { 1197 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 1198 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 1199 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 1200 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 1201 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 1202 
setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1203 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1204 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1205 } 1206 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); 1207 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom); 1208 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom); 1209 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom); 1210 1211 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); 1212 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 1213 // Likewise, narrowing and extending vector loads/stores aren't handled 1214 // directly. 1215 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1216 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 1217 1218 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { 1219 setOperationAction(ISD::MULHS, VT, Legal); 1220 setOperationAction(ISD::MULHU, VT, Legal); 1221 } else { 1222 setOperationAction(ISD::MULHS, VT, Expand); 1223 setOperationAction(ISD::MULHU, VT, Expand); 1224 } 1225 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 1226 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 1227 1228 setOperationAction(ISD::BSWAP, VT, Expand); 1229 setOperationAction(ISD::CTTZ, VT, Expand); 1230 1231 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 1232 setTruncStoreAction(VT, InnerVT, Expand); 1233 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 1234 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 1235 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 1236 } 1237 } 1238 1239 // AArch64 has implementations of a lot of rounding-like FP operations. 
1240 for (auto Op : 1241 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, 1242 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, 1243 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, 1244 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { 1245 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) 1246 setOperationAction(Op, Ty, Legal); 1247 if (Subtarget->hasFullFP16()) 1248 for (MVT Ty : {MVT::v4f16, MVT::v8f16}) 1249 setOperationAction(Op, Ty, Legal); 1250 } 1251 1252 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); 1253 1254 setOperationAction(ISD::BITCAST, MVT::i2, Custom); 1255 setOperationAction(ISD::BITCAST, MVT::i4, Custom); 1256 setOperationAction(ISD::BITCAST, MVT::i8, Custom); 1257 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 1258 1259 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); 1260 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); 1261 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); 1262 1263 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); 1264 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); 1265 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); 1266 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); 1267 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); 1268 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); 1269 1270 // ADDP custom lowering 1271 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) 1272 setOperationAction(ISD::ADD, VT, Custom); 1273 // FADDP custom lowering 1274 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) 1275 setOperationAction(ISD::FADD, VT, Custom); 1276 } 1277 1278 if (Subtarget->hasSME()) { 1279 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 1280 } 1281 1282 // FIXME: Move lowering for more nodes here if those are common between 1283 // SVE and SME. 
1284 if (Subtarget->hasSVEorSME()) { 1285 for (auto VT : 1286 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { 1287 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 1288 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1289 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); 1290 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); 1291 } 1292 } 1293 1294 if (Subtarget->hasSVEorSME()) { 1295 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { 1296 setOperationAction(ISD::BITREVERSE, VT, Custom); 1297 setOperationAction(ISD::BSWAP, VT, Custom); 1298 setOperationAction(ISD::CTLZ, VT, Custom); 1299 setOperationAction(ISD::CTPOP, VT, Custom); 1300 setOperationAction(ISD::CTTZ, VT, Custom); 1301 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1302 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 1303 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 1304 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1305 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1306 setOperationAction(ISD::MGATHER, VT, Custom); 1307 setOperationAction(ISD::MSCATTER, VT, Custom); 1308 setOperationAction(ISD::MLOAD, VT, Custom); 1309 setOperationAction(ISD::MUL, VT, Custom); 1310 setOperationAction(ISD::MULHS, VT, Custom); 1311 setOperationAction(ISD::MULHU, VT, Custom); 1312 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); 1313 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); 1314 setOperationAction(ISD::SELECT, VT, Custom); 1315 setOperationAction(ISD::SETCC, VT, Custom); 1316 setOperationAction(ISD::SDIV, VT, Custom); 1317 setOperationAction(ISD::UDIV, VT, Custom); 1318 setOperationAction(ISD::SMIN, VT, Custom); 1319 setOperationAction(ISD::UMIN, VT, Custom); 1320 setOperationAction(ISD::SMAX, VT, Custom); 1321 setOperationAction(ISD::UMAX, VT, Custom); 1322 setOperationAction(ISD::SHL, VT, Custom); 1323 setOperationAction(ISD::SRL, VT, Custom); 1324 setOperationAction(ISD::SRA, VT, Custom); 1325 
setOperationAction(ISD::ABS, VT, Custom); 1326 setOperationAction(ISD::ABDS, VT, Custom); 1327 setOperationAction(ISD::ABDU, VT, Custom); 1328 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 1329 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1330 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1331 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1332 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 1333 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 1334 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 1335 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 1336 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); 1337 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); 1338 1339 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 1340 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 1341 setOperationAction(ISD::SELECT_CC, VT, Expand); 1342 setOperationAction(ISD::ROTL, VT, Expand); 1343 setOperationAction(ISD::ROTR, VT, Expand); 1344 1345 setOperationAction(ISD::SADDSAT, VT, Legal); 1346 setOperationAction(ISD::UADDSAT, VT, Legal); 1347 setOperationAction(ISD::SSUBSAT, VT, Legal); 1348 setOperationAction(ISD::USUBSAT, VT, Legal); 1349 setOperationAction(ISD::UREM, VT, Expand); 1350 setOperationAction(ISD::SREM, VT, Expand); 1351 setOperationAction(ISD::SDIVREM, VT, Expand); 1352 setOperationAction(ISD::UDIVREM, VT, Expand); 1353 1354 setOperationAction(ISD::AVGFLOORS, VT, Custom); 1355 setOperationAction(ISD::AVGFLOORU, VT, Custom); 1356 setOperationAction(ISD::AVGCEILS, VT, Custom); 1357 setOperationAction(ISD::AVGCEILU, VT, Custom); 1358 1359 if (!Subtarget->isLittleEndian()) 1360 setOperationAction(ISD::BITCAST, VT, Expand); 1361 } 1362 1363 // Illegal unpacked integer vector types. 
1364 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { 1365 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1366 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1367 } 1368 1369 // Legalize unpacked bitcasts to REINTERPRET_CAST. 1370 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, 1371 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) 1372 setOperationAction(ISD::BITCAST, VT, Custom); 1373 1374 for (auto VT : 1375 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, 1376 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) 1377 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); 1378 1379 for (auto VT : 1380 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { 1381 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1382 setOperationAction(ISD::SELECT, VT, Custom); 1383 setOperationAction(ISD::SETCC, VT, Custom); 1384 setOperationAction(ISD::TRUNCATE, VT, Custom); 1385 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1386 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1387 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1388 1389 setOperationAction(ISD::SELECT_CC, VT, Expand); 1390 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1391 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1392 1393 // There are no legal MVT::nxv16f## based types. 
1394 if (VT != MVT::nxv16i1) { 1395 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 1396 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 1397 } 1398 } 1399 1400 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does 1401 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, 1402 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 1403 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { 1404 setOperationAction(ISD::MLOAD, VT, Custom); 1405 setOperationAction(ISD::MSTORE, VT, Custom); 1406 setOperationAction(ISD::MGATHER, VT, Custom); 1407 setOperationAction(ISD::MSCATTER, VT, Custom); 1408 } 1409 1410 // Firstly, exclude all scalable vector extending loads/truncating stores, 1411 // include both integer and floating scalable vector. 1412 for (MVT VT : MVT::scalable_vector_valuetypes()) { 1413 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) { 1414 setTruncStoreAction(VT, InnerVT, Expand); 1415 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 1416 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 1417 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 1418 } 1419 } 1420 1421 // Then, selectively enable those which we directly support. 
1422 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal); 1423 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal); 1424 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal); 1425 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal); 1426 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal); 1427 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal); 1428 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { 1429 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal); 1430 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal); 1431 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal); 1432 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal); 1433 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal); 1434 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal); 1435 } 1436 1437 // SVE supports truncating stores of 64 and 128-bit vectors 1438 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom); 1439 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom); 1440 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom); 1441 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 1442 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 1443 1444 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, 1445 MVT::nxv4f32, MVT::nxv2f64}) { 1446 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1447 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1448 setOperationAction(ISD::MGATHER, VT, Custom); 1449 setOperationAction(ISD::MSCATTER, VT, Custom); 1450 setOperationAction(ISD::MLOAD, VT, Custom); 1451 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); 1452 setOperationAction(ISD::SELECT, VT, Custom); 1453 setOperationAction(ISD::SETCC, VT, Custom); 1454 setOperationAction(ISD::FADD, VT, Custom); 1455 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1456 setOperationAction(ISD::FDIV, VT, Custom); 1457 setOperationAction(ISD::FMA, VT, Custom); 1458 setOperationAction(ISD::FMAXIMUM, VT, Custom); 1459 setOperationAction(ISD::FMAXNUM, VT, 
Custom); 1460 setOperationAction(ISD::FMINIMUM, VT, Custom); 1461 setOperationAction(ISD::FMINNUM, VT, Custom); 1462 setOperationAction(ISD::FMUL, VT, Custom); 1463 setOperationAction(ISD::FNEG, VT, Custom); 1464 setOperationAction(ISD::FSUB, VT, Custom); 1465 setOperationAction(ISD::FCEIL, VT, Custom); 1466 setOperationAction(ISD::FFLOOR, VT, Custom); 1467 setOperationAction(ISD::FNEARBYINT, VT, Custom); 1468 setOperationAction(ISD::FRINT, VT, Custom); 1469 setOperationAction(ISD::FROUND, VT, Custom); 1470 setOperationAction(ISD::FROUNDEVEN, VT, Custom); 1471 setOperationAction(ISD::FTRUNC, VT, Custom); 1472 setOperationAction(ISD::FSQRT, VT, Custom); 1473 setOperationAction(ISD::FABS, VT, Custom); 1474 setOperationAction(ISD::FP_EXTEND, VT, Custom); 1475 setOperationAction(ISD::FP_ROUND, VT, Custom); 1476 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1477 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 1478 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 1479 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); 1480 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); 1481 if (Subtarget->isSVEAvailable()) 1482 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1483 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); 1484 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom); 1485 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom); 1486 1487 setOperationAction(ISD::SELECT_CC, VT, Expand); 1488 setOperationAction(ISD::FREM, VT, Expand); 1489 setOperationAction(ISD::FPOW, VT, Expand); 1490 setOperationAction(ISD::FPOWI, VT, Expand); 1491 setOperationAction(ISD::FCOS, VT, Expand); 1492 setOperationAction(ISD::FSIN, VT, Expand); 1493 setOperationAction(ISD::FSINCOS, VT, Expand); 1494 setOperationAction(ISD::FEXP, VT, Expand); 1495 setOperationAction(ISD::FEXP2, VT, Expand); 1496 setOperationAction(ISD::FEXP10, VT, Expand); 1497 setOperationAction(ISD::FLOG, VT, Expand); 1498 setOperationAction(ISD::FLOG2, VT, Expand); 1499 
setOperationAction(ISD::FLOG10, VT, Expand); 1500 1501 setCondCodeAction(ISD::SETO, VT, Expand); 1502 setCondCodeAction(ISD::SETOLT, VT, Expand); 1503 setCondCodeAction(ISD::SETLT, VT, Expand); 1504 setCondCodeAction(ISD::SETOLE, VT, Expand); 1505 setCondCodeAction(ISD::SETLE, VT, Expand); 1506 setCondCodeAction(ISD::SETULT, VT, Expand); 1507 setCondCodeAction(ISD::SETULE, VT, Expand); 1508 setCondCodeAction(ISD::SETUGE, VT, Expand); 1509 setCondCodeAction(ISD::SETUGT, VT, Expand); 1510 setCondCodeAction(ISD::SETUEQ, VT, Expand); 1511 setCondCodeAction(ISD::SETONE, VT, Expand); 1512 1513 if (!Subtarget->isLittleEndian()) 1514 setOperationAction(ISD::BITCAST, VT, Expand); 1515 } 1516 1517 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { 1518 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1519 setOperationAction(ISD::MGATHER, VT, Custom); 1520 setOperationAction(ISD::MSCATTER, VT, Custom); 1521 setOperationAction(ISD::MLOAD, VT, Custom); 1522 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); 1523 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); 1524 1525 if (!Subtarget->isLittleEndian()) 1526 setOperationAction(ISD::BITCAST, VT, Expand); 1527 } 1528 1529 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); 1530 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); 1531 1532 // NEON doesn't support integer divides, but SVE does 1533 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 1534 MVT::v4i32, MVT::v1i64, MVT::v2i64}) { 1535 setOperationAction(ISD::SDIV, VT, Custom); 1536 setOperationAction(ISD::UDIV, VT, Custom); 1537 } 1538 1539 // NEON doesn't support 64-bit vector integer muls, but SVE does. 1540 setOperationAction(ISD::MUL, MVT::v1i64, Custom); 1541 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 1542 1543 if (Subtarget->isSVEAvailable()) { 1544 // NEON doesn't support across-vector reductions, but SVE does. 
1545 for (auto VT : 1546 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) 1547 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); 1548 } 1549 1550 if (!Subtarget->isNeonAvailable()) { 1551 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom); 1552 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Custom); 1553 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Custom); 1554 setTruncStoreAction(MVT::v1f64, MVT::v1f16, Custom); 1555 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Custom); 1556 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Custom); 1557 setTruncStoreAction(MVT::v1f64, MVT::v1f32, Custom); 1558 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Custom); 1559 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); 1560 for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, 1561 MVT::v4i32, MVT::v1i64, MVT::v2i64}) 1562 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true); 1563 1564 for (MVT VT : 1565 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64}) 1566 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true); 1567 } 1568 1569 // NOTE: Currently this has to happen after computeRegisterProperties rather 1570 // than the preferred option of combining it with the addRegisterClass call. 1571 if (Subtarget->useSVEForFixedLengthVectors()) { 1572 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 1573 if (useSVEForFixedLengthVectorVT(VT)) 1574 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false); 1575 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) 1576 if (useSVEForFixedLengthVectorVT(VT)) 1577 addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false); 1578 1579 // 64bit results can mean a bigger than NEON input. 1580 for (auto VT : {MVT::v8i8, MVT::v4i16}) 1581 setOperationAction(ISD::TRUNCATE, VT, Custom); 1582 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); 1583 1584 // 128bit results imply a bigger than NEON input. 
1585 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) 1586 setOperationAction(ISD::TRUNCATE, VT, Custom); 1587 for (auto VT : {MVT::v8f16, MVT::v4f32}) 1588 setOperationAction(ISD::FP_ROUND, VT, Custom); 1589 1590 // These operations are not supported on NEON but SVE can do them. 1591 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); 1592 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); 1593 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); 1594 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 1595 setOperationAction(ISD::MULHS, MVT::v1i64, Custom); 1596 setOperationAction(ISD::MULHS, MVT::v2i64, Custom); 1597 setOperationAction(ISD::MULHU, MVT::v1i64, Custom); 1598 setOperationAction(ISD::MULHU, MVT::v2i64, Custom); 1599 setOperationAction(ISD::SMAX, MVT::v1i64, Custom); 1600 setOperationAction(ISD::SMAX, MVT::v2i64, Custom); 1601 setOperationAction(ISD::SMIN, MVT::v1i64, Custom); 1602 setOperationAction(ISD::SMIN, MVT::v2i64, Custom); 1603 setOperationAction(ISD::UMAX, MVT::v1i64, Custom); 1604 setOperationAction(ISD::UMAX, MVT::v2i64, Custom); 1605 setOperationAction(ISD::UMIN, MVT::v1i64, Custom); 1606 setOperationAction(ISD::UMIN, MVT::v2i64, Custom); 1607 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); 1608 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); 1609 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); 1610 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); 1611 1612 // Int operations with no NEON support. 
1613 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 1614 MVT::v2i32, MVT::v4i32, MVT::v2i64}) { 1615 setOperationAction(ISD::BITREVERSE, VT, Custom); 1616 setOperationAction(ISD::CTTZ, VT, Custom); 1617 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1618 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1619 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1620 setOperationAction(ISD::MULHS, VT, Custom); 1621 setOperationAction(ISD::MULHU, VT, Custom); 1622 } 1623 1624 1625 // Use SVE for vectors with more than 2 elements. 1626 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) 1627 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1628 } 1629 1630 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64); 1631 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32); 1632 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16); 1633 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); 1634 1635 setOperationAction(ISD::VSCALE, MVT::i32, Custom); 1636 } 1637 1638 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { 1639 // Only required for llvm.aarch64.mops.memset.tag 1640 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 1641 } 1642 1643 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1644 1645 if (Subtarget->hasSVE()) { 1646 setOperationAction(ISD::FLDEXP, MVT::f64, Custom); 1647 setOperationAction(ISD::FLDEXP, MVT::f32, Custom); 1648 setOperationAction(ISD::FLDEXP, MVT::f16, Custom); 1649 } 1650 1651 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 1652 1653 IsStrictFPEnabled = true; 1654 setMaxAtomicSizeInBitsSupported(128); 1655 } 1656 1657 void AArch64TargetLowering::addTypeForNEON(MVT VT) { 1658 assert(VT.isVector() && "VT should be a vector type"); 1659 1660 if (VT.isFloatingPoint()) { 1661 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); 1662 setOperationPromotedToType(ISD::LOAD, VT, 
PromoteTo); 1663 setOperationPromotedToType(ISD::STORE, VT, PromoteTo); 1664 } 1665 1666 // Mark vector float intrinsics as expand. 1667 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 1668 setOperationAction(ISD::FSIN, VT, Expand); 1669 setOperationAction(ISD::FCOS, VT, Expand); 1670 setOperationAction(ISD::FPOW, VT, Expand); 1671 setOperationAction(ISD::FLOG, VT, Expand); 1672 setOperationAction(ISD::FLOG2, VT, Expand); 1673 setOperationAction(ISD::FLOG10, VT, Expand); 1674 setOperationAction(ISD::FEXP, VT, Expand); 1675 setOperationAction(ISD::FEXP2, VT, Expand); 1676 setOperationAction(ISD::FEXP10, VT, Expand); 1677 } 1678 1679 // But we do support custom-lowering for FCOPYSIGN. 1680 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || 1681 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16())) 1682 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1683 1684 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1685 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1686 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1687 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); 1688 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1689 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1690 setOperationAction(ISD::SRA, VT, Custom); 1691 setOperationAction(ISD::SRL, VT, Custom); 1692 setOperationAction(ISD::SHL, VT, Custom); 1693 setOperationAction(ISD::OR, VT, Custom); 1694 setOperationAction(ISD::SETCC, VT, Custom); 1695 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 1696 1697 setOperationAction(ISD::SELECT, VT, Expand); 1698 setOperationAction(ISD::SELECT_CC, VT, Expand); 1699 setOperationAction(ISD::VSELECT, VT, Expand); 1700 for (MVT InnerVT : MVT::all_valuetypes()) 1701 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 1702 1703 // CNT supports only B element sizes, then use UADDLP to widen. 
1704 if (VT != MVT::v8i8 && VT != MVT::v16i8) 1705 setOperationAction(ISD::CTPOP, VT, Custom); 1706 1707 setOperationAction(ISD::UDIV, VT, Expand); 1708 setOperationAction(ISD::SDIV, VT, Expand); 1709 setOperationAction(ISD::UREM, VT, Expand); 1710 setOperationAction(ISD::SREM, VT, Expand); 1711 setOperationAction(ISD::FREM, VT, Expand); 1712 1713 for (unsigned Opcode : 1714 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, 1715 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) 1716 setOperationAction(Opcode, VT, Custom); 1717 1718 if (!VT.isFloatingPoint()) 1719 setOperationAction(ISD::ABS, VT, Legal); 1720 1721 // [SU][MIN|MAX] are available for all NEON types apart from i64. 1722 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 1723 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 1724 setOperationAction(Opcode, VT, Legal); 1725 1726 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP 1727 // NEON types. 
1728 if (VT.isFloatingPoint() && 1729 VT.getVectorElementType() != MVT::bf16 && 1730 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) 1731 for (unsigned Opcode : 1732 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM, 1733 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, 1734 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, 1735 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, 1736 ISD::STRICT_FSQRT}) 1737 setOperationAction(Opcode, VT, Legal); 1738 1739 // Strict fp extend and trunc are legal 1740 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16) 1741 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); 1742 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64) 1743 setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); 1744 1745 // FIXME: We could potentially make use of the vector comparison instructions 1746 // for STRICT_FSETCC and STRICT_FSETCSS, but there's a number of 1747 // complications: 1748 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons, 1749 // so we would need to expand when the condition code doesn't match the 1750 // kind of comparison. 1751 // * Some kinds of comparison require more than one FCMXY instruction so 1752 // would need to be expanded instead. 1753 // * The lowering of the non-strict versions involves target-specific ISD 1754 // nodes so we would likely need to add strict versions of all of them and 1755 // handle them appropriately. 
1756 setOperationAction(ISD::STRICT_FSETCC, VT, Expand); 1757 setOperationAction(ISD::STRICT_FSETCCS, VT, Expand); 1758 1759 if (Subtarget->isLittleEndian()) { 1760 for (unsigned im = (unsigned)ISD::PRE_INC; 1761 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1762 setIndexedLoadAction(im, VT, Legal); 1763 setIndexedStoreAction(im, VT, Legal); 1764 } 1765 } 1766 1767 if (Subtarget->hasD128()) { 1768 setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom); 1769 setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom); 1770 } 1771 } 1772 1773 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, 1774 EVT OpVT) const { 1775 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). 1776 if (!Subtarget->hasSVE()) 1777 return true; 1778 1779 // We can only support legal predicate result types. We can use the SVE 1780 // whilelo instruction for generating fixed-width predicates too. 1781 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && 1782 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 && 1783 ResVT != MVT::v8i1 && ResVT != MVT::v16i1) 1784 return true; 1785 1786 // The whilelo instruction only works with i32 or i64 scalar inputs. 1787 if (OpVT != MVT::i32 && OpVT != MVT::i64) 1788 return true; 1789 1790 return false; 1791 } 1792 1793 bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { 1794 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1; 1795 } 1796 1797 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT, 1798 bool StreamingSVE) { 1799 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); 1800 1801 // By default everything must be expanded. 
1802 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) 1803 setOperationAction(Op, VT, Expand); 1804 1805 if (VT.isFloatingPoint()) { 1806 setCondCodeAction(ISD::SETO, VT, Expand); 1807 setCondCodeAction(ISD::SETOLT, VT, Expand); 1808 setCondCodeAction(ISD::SETOLE, VT, Expand); 1809 setCondCodeAction(ISD::SETULT, VT, Expand); 1810 setCondCodeAction(ISD::SETULE, VT, Expand); 1811 setCondCodeAction(ISD::SETUGE, VT, Expand); 1812 setCondCodeAction(ISD::SETUGT, VT, Expand); 1813 setCondCodeAction(ISD::SETUEQ, VT, Expand); 1814 setCondCodeAction(ISD::SETONE, VT, Expand); 1815 } 1816 1817 // Mark integer truncating stores/extending loads as having custom lowering 1818 if (VT.isInteger()) { 1819 MVT InnerVT = VT.changeVectorElementType(MVT::i8); 1820 while (InnerVT != VT) { 1821 setTruncStoreAction(VT, InnerVT, Custom); 1822 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom); 1823 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom); 1824 InnerVT = InnerVT.changeVectorElementType( 1825 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits())); 1826 } 1827 } 1828 1829 // Mark floating-point truncating stores/extending loads as having custom 1830 // lowering 1831 if (VT.isFloatingPoint()) { 1832 MVT InnerVT = VT.changeVectorElementType(MVT::f16); 1833 while (InnerVT != VT) { 1834 setTruncStoreAction(VT, InnerVT, Custom); 1835 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom); 1836 InnerVT = InnerVT.changeVectorElementType( 1837 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits())); 1838 } 1839 } 1840 1841 // Lower fixed length vector operations to scalable equivalents. 1842 setOperationAction(ISD::ABS, VT, Custom); 1843 setOperationAction(ISD::ADD, VT, Custom); 1844 setOperationAction(ISD::AND, VT, Custom); 1845 setOperationAction(ISD::ANY_EXTEND, VT, Custom); 1846 setOperationAction(ISD::BITCAST, VT, StreamingSVE ? 
Legal : Custom); 1847 setOperationAction(ISD::BITREVERSE, VT, Custom); 1848 setOperationAction(ISD::BSWAP, VT, Custom); 1849 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 1850 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 1851 setOperationAction(ISD::CTLZ, VT, Custom); 1852 setOperationAction(ISD::CTPOP, VT, Custom); 1853 setOperationAction(ISD::CTTZ, VT, Custom); 1854 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 1855 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 1856 setOperationAction(ISD::FABS, VT, Custom); 1857 setOperationAction(ISD::FADD, VT, Custom); 1858 setOperationAction(ISD::FCEIL, VT, Custom); 1859 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 1860 setOperationAction(ISD::FDIV, VT, Custom); 1861 setOperationAction(ISD::FFLOOR, VT, Custom); 1862 setOperationAction(ISD::FMA, VT, Custom); 1863 setOperationAction(ISD::FMAXIMUM, VT, Custom); 1864 setOperationAction(ISD::FMAXNUM, VT, Custom); 1865 setOperationAction(ISD::FMINIMUM, VT, Custom); 1866 setOperationAction(ISD::FMINNUM, VT, Custom); 1867 setOperationAction(ISD::FMUL, VT, Custom); 1868 setOperationAction(ISD::FNEARBYINT, VT, Custom); 1869 setOperationAction(ISD::FNEG, VT, Custom); 1870 setOperationAction(ISD::FP_EXTEND, VT, Custom); 1871 setOperationAction(ISD::FP_ROUND, VT, Custom); 1872 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 1873 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 1874 setOperationAction(ISD::FRINT, VT, Custom); 1875 setOperationAction(ISD::FROUND, VT, Custom); 1876 setOperationAction(ISD::FROUNDEVEN, VT, Custom); 1877 setOperationAction(ISD::FSQRT, VT, Custom); 1878 setOperationAction(ISD::FSUB, VT, Custom); 1879 setOperationAction(ISD::FTRUNC, VT, Custom); 1880 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 1881 setOperationAction(ISD::LOAD, VT, StreamingSVE ? Legal : Custom); 1882 setOperationAction(ISD::MGATHER, VT, StreamingSVE ? 
Expand : Custom); 1883 setOperationAction(ISD::MLOAD, VT, Custom); 1884 setOperationAction(ISD::MSCATTER, VT, StreamingSVE ? Expand : Custom); 1885 setOperationAction(ISD::MSTORE, VT, Custom); 1886 setOperationAction(ISD::MUL, VT, Custom); 1887 setOperationAction(ISD::MULHS, VT, Custom); 1888 setOperationAction(ISD::MULHU, VT, Custom); 1889 setOperationAction(ISD::OR, VT, Custom); 1890 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, StreamingSVE ? Legal : Expand); 1891 setOperationAction(ISD::SDIV, VT, Custom); 1892 setOperationAction(ISD::SELECT, VT, Custom); 1893 setOperationAction(ISD::SETCC, VT, Custom); 1894 setOperationAction(ISD::SHL, VT, Custom); 1895 setOperationAction(ISD::SIGN_EXTEND, VT, Custom); 1896 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); 1897 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 1898 setOperationAction(ISD::SMAX, VT, Custom); 1899 setOperationAction(ISD::SMIN, VT, Custom); 1900 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); 1901 setOperationAction(ISD::SRA, VT, Custom); 1902 setOperationAction(ISD::SRL, VT, Custom); 1903 setOperationAction(ISD::STORE, VT, StreamingSVE ? Legal : Custom); 1904 setOperationAction(ISD::SUB, VT, Custom); 1905 setOperationAction(ISD::TRUNCATE, VT, Custom); 1906 setOperationAction(ISD::UDIV, VT, Custom); 1907 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 1908 setOperationAction(ISD::UMAX, VT, Custom); 1909 setOperationAction(ISD::UMIN, VT, Custom); 1910 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); 1911 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 1912 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 1913 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 1914 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 1915 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom); 1916 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom); 1917 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 1918 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, 1919 StreamingSVE ? 
Expand : Custom); 1920 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); 1921 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); 1922 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); 1923 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); 1924 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 1925 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 1926 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom); 1927 setOperationAction(ISD::VSELECT, VT, Custom); 1928 setOperationAction(ISD::XOR, VT, Custom); 1929 setOperationAction(ISD::ZERO_EXTEND, VT, Custom); 1930 } 1931 1932 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 1933 addRegisterClass(VT, &AArch64::FPR64RegClass); 1934 addTypeForNEON(VT); 1935 } 1936 1937 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 1938 addRegisterClass(VT, &AArch64::FPR128RegClass); 1939 addTypeForNEON(VT); 1940 } 1941 1942 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, 1943 LLVMContext &C, EVT VT) const { 1944 if (!VT.isVector()) 1945 return MVT::i32; 1946 if (VT.isScalableVector()) 1947 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount()); 1948 return VT.changeVectorElementTypeToInteger(); 1949 } 1950 1951 // isIntImmediate - This method tests to see if the node is a constant 1952 // operand. If so Imm will receive the value. 1953 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { 1954 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) { 1955 Imm = C->getZExtValue(); 1956 return true; 1957 } 1958 return false; 1959 } 1960 1961 // isOpcWithIntImmediate - This method tests to see if the node is a specific 1962 // opcode and that it has a immediate integer right operand. 1963 // If so Imm will receive the value. 
1964 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, 1965 uint64_t &Imm) { 1966 return N->getOpcode() == Opc && 1967 isIntImmediate(N->getOperand(1).getNode(), Imm); 1968 } 1969 1970 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, 1971 const APInt &Demanded, 1972 TargetLowering::TargetLoweringOpt &TLO, 1973 unsigned NewOpc) { 1974 uint64_t OldImm = Imm, NewImm, Enc; 1975 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; 1976 1977 // Return if the immediate is already all zeros, all ones, a bimm32 or a 1978 // bimm64. 1979 if (Imm == 0 || Imm == Mask || 1980 AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) 1981 return false; 1982 1983 unsigned EltSize = Size; 1984 uint64_t DemandedBits = Demanded.getZExtValue(); 1985 1986 // Clear bits that are not demanded. 1987 Imm &= DemandedBits; 1988 1989 while (true) { 1990 // The goal here is to set the non-demanded bits in a way that minimizes 1991 // the number of switching between 0 and 1. In order to achieve this goal, 1992 // we set the non-demanded bits to the value of the preceding demanded bits. 1993 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a 1994 // non-demanded bit), we copy bit0 (1) to the least significant 'x', 1995 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. 1996 // The final result is 0b11000011. 1997 uint64_t NonDemandedBits = ~DemandedBits; 1998 uint64_t InvertedImm = ~Imm & DemandedBits; 1999 uint64_t RotatedImm = 2000 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & 2001 NonDemandedBits; 2002 uint64_t Sum = RotatedImm + NonDemandedBits; 2003 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); 2004 uint64_t Ones = (Sum + Carry) & NonDemandedBits; 2005 NewImm = (Imm | Ones) & Mask; 2006 2007 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate 2008 // or all-ones or all-zeros, in which case we can stop searching. 
Otherwise, 2009 // we halve the element size and continue the search. 2010 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) 2011 break; 2012 2013 // We cannot shrink the element size any further if it is 2-bits. 2014 if (EltSize == 2) 2015 return false; 2016 2017 EltSize /= 2; 2018 Mask >>= EltSize; 2019 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; 2020 2021 // Return if there is mismatch in any of the demanded bits of Imm and Hi. 2022 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) 2023 return false; 2024 2025 // Merge the upper and lower halves of Imm and DemandedBits. 2026 Imm |= Hi; 2027 DemandedBits |= DemandedBitsHi; 2028 } 2029 2030 ++NumOptimizedImms; 2031 2032 // Replicate the element across the register width. 2033 while (EltSize < Size) { 2034 NewImm |= NewImm << EltSize; 2035 EltSize *= 2; 2036 } 2037 2038 (void)OldImm; 2039 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && 2040 "demanded bits should never be altered"); 2041 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); 2042 2043 // Create the new constant immediate node. 2044 EVT VT = Op.getValueType(); 2045 SDLoc DL(Op); 2046 SDValue New; 2047 2048 // If the new constant immediate is all-zeros or all-ones, let the target 2049 // independent DAG combine optimize this node. 2050 if (NewImm == 0 || NewImm == OrigMask) { 2051 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 2052 TLO.DAG.getConstant(NewImm, DL, VT)); 2053 // Otherwise, create a machine node so that target independent DAG combine 2054 // doesn't undo this optimization. 
2055 } else { 2056 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); 2057 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); 2058 New = SDValue( 2059 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); 2060 } 2061 2062 return TLO.CombineTo(Op, New); 2063 } 2064 2065 bool AArch64TargetLowering::targetShrinkDemandedConstant( 2066 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 2067 TargetLoweringOpt &TLO) const { 2068 // Delay this optimization to as late as possible. 2069 if (!TLO.LegalOps) 2070 return false; 2071 2072 if (!EnableOptimizeLogicalImm) 2073 return false; 2074 2075 EVT VT = Op.getValueType(); 2076 if (VT.isVector()) 2077 return false; 2078 2079 unsigned Size = VT.getSizeInBits(); 2080 assert((Size == 32 || Size == 64) && 2081 "i32 or i64 is expected after legalization."); 2082 2083 // Exit early if we demand all bits. 2084 if (DemandedBits.popcount() == Size) 2085 return false; 2086 2087 unsigned NewOpc; 2088 switch (Op.getOpcode()) { 2089 default: 2090 return false; 2091 case ISD::AND: 2092 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; 2093 break; 2094 case ISD::OR: 2095 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; 2096 break; 2097 case ISD::XOR: 2098 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; 2099 break; 2100 } 2101 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 2102 if (!C) 2103 return false; 2104 uint64_t Imm = C->getZExtValue(); 2105 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc); 2106 } 2107 2108 /// computeKnownBitsForTargetNode - Determine which of the bits specified in 2109 /// Mask are known to be either zero or one and return them Known. 
2110 void AArch64TargetLowering::computeKnownBitsForTargetNode( 2111 const SDValue Op, KnownBits &Known, const APInt &DemandedElts, 2112 const SelectionDAG &DAG, unsigned Depth) const { 2113 switch (Op.getOpcode()) { 2114 default: 2115 break; 2116 case AArch64ISD::DUP: { 2117 SDValue SrcOp = Op.getOperand(0); 2118 Known = DAG.computeKnownBits(SrcOp, Depth + 1); 2119 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) { 2120 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() && 2121 "Expected DUP implicit truncation"); 2122 Known = Known.trunc(Op.getScalarValueSizeInBits()); 2123 } 2124 break; 2125 } 2126 case AArch64ISD::CSEL: { 2127 KnownBits Known2; 2128 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 2129 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 2130 Known = Known.intersectWith(Known2); 2131 break; 2132 } 2133 case AArch64ISD::BICi: { 2134 // Compute the bit cleared value. 2135 uint64_t Mask = 2136 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2)); 2137 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 2138 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask)); 2139 break; 2140 } 2141 case AArch64ISD::VLSHR: { 2142 KnownBits Known2; 2143 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 2144 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 2145 Known = KnownBits::lshr(Known, Known2); 2146 break; 2147 } 2148 case AArch64ISD::VASHR: { 2149 KnownBits Known2; 2150 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 2151 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 2152 Known = KnownBits::ashr(Known, Known2); 2153 break; 2154 } 2155 case AArch64ISD::VSHL: { 2156 KnownBits Known2; 2157 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 2158 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 2159 Known = KnownBits::shl(Known, Known2); 2160 break; 2161 } 2162 case AArch64ISD::MOVI: { 2163 ConstantSDNode *CN = 
cast<ConstantSDNode>(Op->getOperand(0));
    Known =
        KnownBits::makeConstant(APInt(Known.getBitWidth(), CN->getZExtValue()));
    break;
  }
  case AArch64ISD::LOADgot:
  case AArch64ISD::ADDlow: {
    if (!Subtarget->isTargetILP32())
      break;
    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
    break;
  }
  case AArch64ISD::ASSERT_ZEXT_BOOL: {
    // Combine whatever is known about the operand, then additionally mark
    // bits [1,7] as known zero (the node asserts a zero-extended boolean).
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      // The loaded value occupies only the memory width; everything above
      // that is known zero.
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Op.getConstantOperandVal(0);
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_uaddlv: {
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = Known.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        // A sum of 8 (resp. 16) zero-extended bytes fits in 11 (resp. 12)
        // bits, so all result bits above that bound are known zero.
        unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
        assert(BitWidth >= Bound && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
        Known.Zero |= Mask;
      }
      break;
    }
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = Known.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        Known.Zero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        Known.Zero |= Mask;
      }
      break;
    } break; // NOTE(review): this second break is unreachable; the case block
             // above always exits via its own break.
    }
  }
  }
}

// All AArch64 vector compare nodes listed below produce an all-zeros or
// all-ones lane, so every bit of the result is a copy of the sign bit.
unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getScalarSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AArch64ISD::CMEQ:
  case AArch64ISD::CMGE:
  case AArch64ISD::CMGT:
  case AArch64ISD::CMHI:
  case AArch64ISD::CMHS:
  case AArch64ISD::FCMEQ:
  case AArch64ISD::FCMGE:
  case AArch64ISD::FCMGT:
  case AArch64ISD::CMEQz:
  case AArch64ISD::CMGEz:
  case AArch64ISD::CMGTz:
  case AArch64ISD::CMLEz:
  case AArch64ISD::CMLTz:
  case AArch64ISD::FCMEQz:
  case AArch64ISD::FCMGEz:
  case AArch64ISD::FCMGTz:
  case AArch64ISD::FCMLEz:
  case AArch64ISD::FCMLTz:
    // Compares return either 0 or all-ones
    return VTBits;
  }

  // Conservative answer for any other target node.
  return 1;
}

// Shift amounts are always i64, independent of the type being shifted.
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                  EVT) const {
  return MVT::i64;
}

bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  // Strict-alignment subtargets permit no misaligned accesses at all.
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Alignment <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            VT == MVT::v2i64;
  }
  return true;
}

// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() ||
            Ty.getSizeInBytes() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Alignment <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            Ty == LLT::fixed_vector(2, 64);
  }
  return true;
}

FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}

// Map a target-specific node opcode to its textual name; returns nullptr for
// opcodes that are not AArch64ISD nodes.
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch ((AArch64ISD::NodeType)Opcode) {
  case AArch64ISD::FIRST_NUMBER:
    break;
    MAKE_CASE(AArch64ISD::SMSTART)
    MAKE_CASE(AArch64ISD::SMSTOP)
    MAKE_CASE(AArch64ISD::RESTORE_ZA)
    MAKE_CASE(AArch64ISD::CALL)
    MAKE_CASE(AArch64ISD::ADRP)
    MAKE_CASE(AArch64ISD::ADR)
    MAKE_CASE(AArch64ISD::ADDlow)
    MAKE_CASE(AArch64ISD::LOADgot)
    MAKE_CASE(AArch64ISD::RET_GLUE)
    MAKE_CASE(AArch64ISD::BRCOND)
    MAKE_CASE(AArch64ISD::CSEL)
    MAKE_CASE(AArch64ISD::CSINV)
    MAKE_CASE(AArch64ISD::CSNEG)
    MAKE_CASE(AArch64ISD::CSINC)
    MAKE_CASE(AArch64ISD::THREAD_POINTER)
    MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
    MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
    MAKE_CASE(AArch64ISD::ABDS_PRED)
    MAKE_CASE(AArch64ISD::ABDU_PRED)
    MAKE_CASE(AArch64ISD::HADDS_PRED)
    MAKE_CASE(AArch64ISD::HADDU_PRED)
    MAKE_CASE(AArch64ISD::MUL_PRED)
    MAKE_CASE(AArch64ISD::MULHS_PRED)
    MAKE_CASE(AArch64ISD::MULHU_PRED)
    MAKE_CASE(AArch64ISD::RHADDS_PRED)
    MAKE_CASE(AArch64ISD::RHADDU_PRED)
    MAKE_CASE(AArch64ISD::SDIV_PRED)
    MAKE_CASE(AArch64ISD::SHL_PRED)
    MAKE_CASE(AArch64ISD::SMAX_PRED)
    MAKE_CASE(AArch64ISD::SMIN_PRED)
    MAKE_CASE(AArch64ISD::SRA_PRED)
    MAKE_CASE(AArch64ISD::SRL_PRED)
    MAKE_CASE(AArch64ISD::UDIV_PRED)
    MAKE_CASE(AArch64ISD::UMAX_PRED)
    MAKE_CASE(AArch64ISD::UMIN_PRED)
    MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
    MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::ADC)
    MAKE_CASE(AArch64ISD::SBC)
    MAKE_CASE(AArch64ISD::ADDS)
    MAKE_CASE(AArch64ISD::SUBS)
    MAKE_CASE(AArch64ISD::ADCS)
    MAKE_CASE(AArch64ISD::SBCS)
    MAKE_CASE(AArch64ISD::ANDS)
    MAKE_CASE(AArch64ISD::CCMP)
    MAKE_CASE(AArch64ISD::CCMN)
    MAKE_CASE(AArch64ISD::FCCMP)
    MAKE_CASE(AArch64ISD::FCMP)
    MAKE_CASE(AArch64ISD::STRICT_FCMP)
    MAKE_CASE(AArch64ISD::STRICT_FCMPE)
    MAKE_CASE(AArch64ISD::SME_ZA_LDR)
    MAKE_CASE(AArch64ISD::SME_ZA_STR)
    MAKE_CASE(AArch64ISD::DUP)
    MAKE_CASE(AArch64ISD::DUPLANE8)
    MAKE_CASE(AArch64ISD::DUPLANE16)
    MAKE_CASE(AArch64ISD::DUPLANE32)
    MAKE_CASE(AArch64ISD::DUPLANE64)
    MAKE_CASE(AArch64ISD::DUPLANE128)
    MAKE_CASE(AArch64ISD::MOVI)
    MAKE_CASE(AArch64ISD::MOVIshift)
    MAKE_CASE(AArch64ISD::MOVIedit)
    MAKE_CASE(AArch64ISD::MOVImsl)
    MAKE_CASE(AArch64ISD::FMOV)
    MAKE_CASE(AArch64ISD::MVNIshift)
    MAKE_CASE(AArch64ISD::MVNImsl)
    MAKE_CASE(AArch64ISD::BICi)
    MAKE_CASE(AArch64ISD::ORRi)
    MAKE_CASE(AArch64ISD::BSP)
    MAKE_CASE(AArch64ISD::ZIP1)
    MAKE_CASE(AArch64ISD::ZIP2)
    MAKE_CASE(AArch64ISD::UZP1)
    MAKE_CASE(AArch64ISD::UZP2)
    MAKE_CASE(AArch64ISD::TRN1)
    MAKE_CASE(AArch64ISD::TRN2)
    MAKE_CASE(AArch64ISD::REV16)
    MAKE_CASE(AArch64ISD::REV32)
    MAKE_CASE(AArch64ISD::REV64)
    MAKE_CASE(AArch64ISD::EXT)
    MAKE_CASE(AArch64ISD::SPLICE)
    MAKE_CASE(AArch64ISD::VSHL)
    MAKE_CASE(AArch64ISD::VLSHR)
    MAKE_CASE(AArch64ISD::VASHR)
    MAKE_CASE(AArch64ISD::VSLI)
    MAKE_CASE(AArch64ISD::VSRI)
    MAKE_CASE(AArch64ISD::CMEQ)
    MAKE_CASE(AArch64ISD::CMGE)
    MAKE_CASE(AArch64ISD::CMGT)
    MAKE_CASE(AArch64ISD::CMHI)
    MAKE_CASE(AArch64ISD::CMHS)
    MAKE_CASE(AArch64ISD::FCMEQ)
    MAKE_CASE(AArch64ISD::FCMGE)
    MAKE_CASE(AArch64ISD::FCMGT)
    MAKE_CASE(AArch64ISD::CMEQz)
    MAKE_CASE(AArch64ISD::CMGEz)
    MAKE_CASE(AArch64ISD::CMGTz)
    MAKE_CASE(AArch64ISD::CMLEz)
    MAKE_CASE(AArch64ISD::CMLTz)
    MAKE_CASE(AArch64ISD::FCMEQz)
    MAKE_CASE(AArch64ISD::FCMGEz)
    MAKE_CASE(AArch64ISD::FCMGTz)
    MAKE_CASE(AArch64ISD::FCMLEz)
    MAKE_CASE(AArch64ISD::FCMLTz)
    MAKE_CASE(AArch64ISD::SADDV)
    MAKE_CASE(AArch64ISD::UADDV)
    MAKE_CASE(AArch64ISD::UADDLV)
    MAKE_CASE(AArch64ISD::SDOT)
    MAKE_CASE(AArch64ISD::UDOT)
    MAKE_CASE(AArch64ISD::SMINV)
    MAKE_CASE(AArch64ISD::UMINV)
    MAKE_CASE(AArch64ISD::SMAXV)
    MAKE_CASE(AArch64ISD::UMAXV)
    MAKE_CASE(AArch64ISD::SADDV_PRED)
    MAKE_CASE(AArch64ISD::UADDV_PRED)
    MAKE_CASE(AArch64ISD::SMAXV_PRED)
    MAKE_CASE(AArch64ISD::UMAXV_PRED)
    MAKE_CASE(AArch64ISD::SMINV_PRED)
    MAKE_CASE(AArch64ISD::UMINV_PRED)
    MAKE_CASE(AArch64ISD::ORV_PRED)
    MAKE_CASE(AArch64ISD::EORV_PRED)
    MAKE_CASE(AArch64ISD::ANDV_PRED)
    MAKE_CASE(AArch64ISD::CLASTA_N)
    MAKE_CASE(AArch64ISD::CLASTB_N)
    MAKE_CASE(AArch64ISD::LASTA)
    MAKE_CASE(AArch64ISD::LASTB)
    MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
    MAKE_CASE(AArch64ISD::LS64_BUILD)
    MAKE_CASE(AArch64ISD::LS64_EXTRACT)
    MAKE_CASE(AArch64ISD::TBL)
    MAKE_CASE(AArch64ISD::FADD_PRED)
    MAKE_CASE(AArch64ISD::FADDA_PRED)
    MAKE_CASE(AArch64ISD::FADDV_PRED)
    MAKE_CASE(AArch64ISD::FDIV_PRED)
    MAKE_CASE(AArch64ISD::FMA_PRED)
    MAKE_CASE(AArch64ISD::FMAX_PRED)
    MAKE_CASE(AArch64ISD::FMAXV_PRED)
    MAKE_CASE(AArch64ISD::FMAXNM_PRED)
    MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
    MAKE_CASE(AArch64ISD::FMIN_PRED)
    MAKE_CASE(AArch64ISD::FMINV_PRED)
    MAKE_CASE(AArch64ISD::FMINNM_PRED)
    MAKE_CASE(AArch64ISD::FMINNMV_PRED)
    MAKE_CASE(AArch64ISD::FMUL_PRED)
    MAKE_CASE(AArch64ISD::FSUB_PRED)
    MAKE_CASE(AArch64ISD::RDSVL)
    MAKE_CASE(AArch64ISD::BIC)
    MAKE_CASE(AArch64ISD::BIT)
    MAKE_CASE(AArch64ISD::CBZ)
    MAKE_CASE(AArch64ISD::CBNZ)
    MAKE_CASE(AArch64ISD::TBZ)
    MAKE_CASE(AArch64ISD::TBNZ)
    MAKE_CASE(AArch64ISD::TC_RETURN)
    MAKE_CASE(AArch64ISD::PREFETCH)
    MAKE_CASE(AArch64ISD::SITOF)
    MAKE_CASE(AArch64ISD::UITOF)
    MAKE_CASE(AArch64ISD::NVCAST)
    MAKE_CASE(AArch64ISD::MRS)
    MAKE_CASE(AArch64ISD::SQSHL_I)
    MAKE_CASE(AArch64ISD::UQSHL_I)
    MAKE_CASE(AArch64ISD::SRSHR_I)
    MAKE_CASE(AArch64ISD::URSHR_I)
    MAKE_CASE(AArch64ISD::SQSHLU_I)
    MAKE_CASE(AArch64ISD::WrapperLarge)
    MAKE_CASE(AArch64ISD::LD2post)
    MAKE_CASE(AArch64ISD::LD3post)
    MAKE_CASE(AArch64ISD::LD4post)
    MAKE_CASE(AArch64ISD::ST2post)
    MAKE_CASE(AArch64ISD::ST3post)
    MAKE_CASE(AArch64ISD::ST4post)
    MAKE_CASE(AArch64ISD::LD1x2post)
    MAKE_CASE(AArch64ISD::LD1x3post)
    MAKE_CASE(AArch64ISD::LD1x4post)
    MAKE_CASE(AArch64ISD::ST1x2post)
    MAKE_CASE(AArch64ISD::ST1x3post)
    MAKE_CASE(AArch64ISD::ST1x4post)
    MAKE_CASE(AArch64ISD::LD1DUPpost)
    MAKE_CASE(AArch64ISD::LD2DUPpost)
    MAKE_CASE(AArch64ISD::LD3DUPpost)
    MAKE_CASE(AArch64ISD::LD4DUPpost)
    MAKE_CASE(AArch64ISD::LD1LANEpost)
    MAKE_CASE(AArch64ISD::LD2LANEpost)
    MAKE_CASE(AArch64ISD::LD3LANEpost)
    MAKE_CASE(AArch64ISD::LD4LANEpost)
    MAKE_CASE(AArch64ISD::ST2LANEpost)
    MAKE_CASE(AArch64ISD::ST3LANEpost)
    MAKE_CASE(AArch64ISD::ST4LANEpost)
    MAKE_CASE(AArch64ISD::SMULL)
    MAKE_CASE(AArch64ISD::UMULL)
    MAKE_CASE(AArch64ISD::PMULL)
    MAKE_CASE(AArch64ISD::FRECPE)
    MAKE_CASE(AArch64ISD::FRECPS)
    MAKE_CASE(AArch64ISD::FRSQRTE)
    MAKE_CASE(AArch64ISD::FRSQRTS)
    MAKE_CASE(AArch64ISD::STG)
    MAKE_CASE(AArch64ISD::STZG)
    MAKE_CASE(AArch64ISD::ST2G)
    MAKE_CASE(AArch64ISD::STZ2G)
    MAKE_CASE(AArch64ISD::SUNPKHI)
    MAKE_CASE(AArch64ISD::SUNPKLO)
    MAKE_CASE(AArch64ISD::UUNPKHI)
    MAKE_CASE(AArch64ISD::UUNPKLO)
    MAKE_CASE(AArch64ISD::INSR)
    MAKE_CASE(AArch64ISD::PTEST)
    MAKE_CASE(AArch64ISD::PTEST_ANY)
    MAKE_CASE(AArch64ISD::PTRUE)
    MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SST1Q_PRED)
    MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
    MAKE_CASE(AArch64ISD::ST1_PRED)
    MAKE_CASE(AArch64ISD::SST1_PRED)
    MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
    MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
    MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
    MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
    MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
    MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
    MAKE_CASE(AArch64ISD::SSTNT1_PRED)
    MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
    MAKE_CASE(AArch64ISD::LDP)
    MAKE_CASE(AArch64ISD::LDIAPP)
    MAKE_CASE(AArch64ISD::LDNP)
    MAKE_CASE(AArch64ISD::STP)
    MAKE_CASE(AArch64ISD::STILP)
    MAKE_CASE(AArch64ISD::STNP)
    MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::INDEX_VECTOR)
    MAKE_CASE(AArch64ISD::ADDP)
    MAKE_CASE(AArch64ISD::SADDLP)
    MAKE_CASE(AArch64ISD::UADDLP)
    MAKE_CASE(AArch64ISD::CALL_RVMARKER)
    MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
    MAKE_CASE(AArch64ISD::MOPS_MEMSET)
    MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
    MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
    MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
    MAKE_CASE(AArch64ISD::CALL_BTI)
    MAKE_CASE(AArch64ISD::MRRS)
    MAKE_CASE(AArch64ISD::MSRR)
    MAKE_CASE(AArch64ISD::RSHRNB_I)
    MAKE_CASE(AArch64ISD::CTTZ_ELTS)
  }
#undef MAKE_CASE
  return nullptr;
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:

  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI.getDebugLoc();
  // Insertion point for the new blocks: immediately after the current one.
  MachineFunction::iterator It = ++MBB->getIterator();

  Register DestReg = MI.getOperand(0).getReg();
  Register IfTrueReg = MI.getOperand(1).getReg();
  Register IfFalseReg = MI.getOperand(2).getReg();
  unsigned CondCode = MI.getOperand(3).getImm();
  bool NZCVKilled = MI.getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  // If the pseudo did not kill NZCV, the flags stay live across the new
  // blocks and must be recorded as live-in.
  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI.eraseFromParent();
  return EndBB;
}

// CATCHRET needs no expansion here; only assert that we are not using an
// asynchronous (SEH) personality, which does not use catchret.
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
             BB->getParent()->getFunction().getPersonalityFn())) &&
         "SEH does not use catchret!");
  return BB;
}

// Expand the PROBED_STACKALLOC_DYN pseudo into the target's stack-probing
// sequence; returns the block that ends up holding the following code.
MachineBasicBlock *
AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  MachineFunction &MF = *MBB->getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MBB->findDebugLoc(MBBI);
  const AArch64InstrInfo &TII =
      *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  Register TargetReg = MI.getOperand(0).getReg();
  MachineBasicBlock::iterator NextInst =
      TII.probedStackAlloc(MBBI, TargetReg, false);

  MI.eraseFromParent();
  return NextInst->getParent();
}

// Expand an SME tile-load pseudo: the concrete tile register is BaseReg plus
// the tile index carried in operand 0; remaining operands are forwarded.
MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                    MachineInstr &MI,
                                    MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));

  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  MIB.add(MI.getOperand(1)); // slice index register
  MIB.add(MI.getOperand(2)); // slice index offset
  MIB.add(MI.getOperand(3)); // pg
  MIB.add(MI.getOperand(4)); // base
  MIB.add(MI.getOperand(5)); // offset

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

// Expand the LDR_ZA_PSEUDO fill: loads into the whole ZA array.
MachineBasicBlock *
AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));

  MIB.addReg(AArch64::ZA, RegState::Define);
  MIB.add(MI.getOperand(0)); // Vector select register
  MIB.add(MI.getOperand(1)); // Vector select offset
  MIB.add(MI.getOperand(2)); // Base
  MIB.add(MI.getOperand(1)); // Offset, same as vector select offset

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

// Expand a ZT0 pseudo into Opcode, re-adding operand 0 as a def when
// Op0IsDef is set and forwarding all remaining operands unchanged.
MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
                                                      MachineBasicBlock *BB,
                                                      unsigned Opcode,
                                                      bool Op0IsDef) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB;

  MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
            .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
  for (unsigned I = 1; I < MI.getNumOperands(); ++I)
    MIB.add(MI.getOperand(I));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

// Expand a ZA pseudo. With HasTile, operand 0 selects the tile relative to
// BaseReg and is added as both def and use (accumulator semantics);
// otherwise BaseReg itself is the tied def/use.
MachineBasicBlock *
AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB, bool HasTile) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
  unsigned StartIdx = 0;

  if (HasTile) {
    MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
    MIB.addReg(BaseReg + MI.getOperand(0).getImm());
    StartIdx = 1;
  } else
    MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);

  for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
    MIB.add(MI.getOperand(I));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

// Expand ZERO_M_PSEUDO: emit ZERO_M and mark every 64-bit tile selected by
// the mask immediate as implicitly defined.
MachineBasicBlock *
AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
  MIB.add(MI.getOperand(0)); // Mask

  unsigned Mask = MI.getOperand(0).getImm();
  for (unsigned I = 0; I < 8; I++) {
    if (Mask & (1 << I))
      MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
  }

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {

  // SME pseudos are dispatched via the pseudo map; the matrix-type TSFlags
  // select which tile base register family the expansion uses.
  int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
  if (SMEOrigInstr != -1) {
    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    uint64_t SMEMatrixType =
        TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
    switch (SMEMatrixType) {
    case (AArch64::SMEMatrixArray):
      return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
    case (AArch64::SMEMatrixTileB):
      return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
    case (AArch64::SMEMatrixTileH):
      return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
    case (AArch64::SMEMatrixTileS):
      return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
    case (AArch64::SMEMatrixTileD):
      return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
    case (AArch64::SMEMatrixTileQ):
      return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
    }
  }

  switch (MI.getOpcode()) {
  default:
#ifndef NDEBUG
    MI.dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);
  case
TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses
    // while bl call instruction (where statepoint will be lowered at the end)
    // has implicit def. This def is early-clobber as it will be set at
    // the moment of the call and earlier than any use is read.
    // Add this implicit dead def here as a workaround.
    MI.addOperand(*MI.getMF(),
                  MachineOperand::CreateReg(
                      AArch64::LR, /*isDef*/ true,
                      /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
                      /*isUndef*/ false, /*isEarlyClobber*/ true));
    [[fallthrough]];
  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);

  case TargetOpcode::PATCHABLE_EVENT_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    return BB;

  case AArch64::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);

  case AArch64::PROBED_STACKALLOC_DYN:
    return EmitDynamicProbedAlloc(MI, BB);

  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
  case AArch64::LDR_ZA_PSEUDO:
    return EmitFill(MI, BB);
  case AArch64::LDR_TX_PSEUDO:
    return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
  case AArch64::STR_TX_PSEUDO:
    return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
  case AArch64::ZERO_M_PSEUDO:
    return EmitZero(MI, BB);
  case AArch64::ZERO_T_PSEUDO:
    return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
  }
}

//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

// Forward declarations of SVE fixed length lowering helpers
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFixedMaskToScalableVector(SDValue Mask,
                                                SelectionDAG &DAG);
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
                                             EVT VT);

/// isZerosVector - Check whether SDNode N is a zero-filled vector.
static bool isZerosVector(const SDNode *N) {
  // Look through a bit convert.
  while (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  if (ISD::isConstantSplatVectorAllZeros(N))
    return true;

  // A DUP of a zero constant (int or FP) is also an all-zeros vector.
  if (N->getOpcode() != AArch64ISD::DUP)
    return false;

  auto Opnd0 = N->getOperand(0);
  return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
}

/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2981 static void changeFPCCToAArch64CC(ISD::CondCode CC, 2982 AArch64CC::CondCode &CondCode, 2983 AArch64CC::CondCode &CondCode2) { 2984 CondCode2 = AArch64CC::AL; 2985 switch (CC) { 2986 default: 2987 llvm_unreachable("Unknown FP condition!"); 2988 case ISD::SETEQ: 2989 case ISD::SETOEQ: 2990 CondCode = AArch64CC::EQ; 2991 break; 2992 case ISD::SETGT: 2993 case ISD::SETOGT: 2994 CondCode = AArch64CC::GT; 2995 break; 2996 case ISD::SETGE: 2997 case ISD::SETOGE: 2998 CondCode = AArch64CC::GE; 2999 break; 3000 case ISD::SETOLT: 3001 CondCode = AArch64CC::MI; 3002 break; 3003 case ISD::SETOLE: 3004 CondCode = AArch64CC::LS; 3005 break; 3006 case ISD::SETONE: 3007 CondCode = AArch64CC::MI; 3008 CondCode2 = AArch64CC::GT; 3009 break; 3010 case ISD::SETO: 3011 CondCode = AArch64CC::VC; 3012 break; 3013 case ISD::SETUO: 3014 CondCode = AArch64CC::VS; 3015 break; 3016 case ISD::SETUEQ: 3017 CondCode = AArch64CC::EQ; 3018 CondCode2 = AArch64CC::VS; 3019 break; 3020 case ISD::SETUGT: 3021 CondCode = AArch64CC::HI; 3022 break; 3023 case ISD::SETUGE: 3024 CondCode = AArch64CC::PL; 3025 break; 3026 case ISD::SETLT: 3027 case ISD::SETULT: 3028 CondCode = AArch64CC::LT; 3029 break; 3030 case ISD::SETLE: 3031 case ISD::SETULE: 3032 CondCode = AArch64CC::LE; 3033 break; 3034 case ISD::SETNE: 3035 case ISD::SETUNE: 3036 CondCode = AArch64CC::NE; 3037 break; 3038 } 3039 } 3040 3041 /// Convert a DAG fp condition code to an AArch64 CC. 3042 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 3043 /// should be AND'ed instead of OR'ed. 
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    // Everything except ONE/UEQ already maps to a single condition, so the
    // OR'ed mapping is also a valid AND'ed mapping.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case ISD::SETONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case ISD::SETUEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true;
    [[fallthrough]];
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
                          CondCode, CondCode2);
    break;
  }
}

// Returns true if C fits an arithmetic immediate: a 12-bit value, optionally
// shifted left by 12.
static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
  LLVM_DEBUG(dbgs() << "Is imm " << C
                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
  return IsLegal;
}

// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
// everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
static bool isCMN(SDValue Op, ISD::CondCode CC) {
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
         (CC == ISD::SETEQ || CC == ISD::SETNE);
}

/// Emit a strict FP comparison node (STRICT_FCMPE when IsSignaling, else
/// STRICT_FCMP), promoting f16 operands to f32 first when the subtarget
/// lacks full FP16 support. The chain is threaded through the extensions.
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
                                      SelectionDAG &DAG, SDValue Chain,
                                      bool IsSignaling) {
  EVT VT = LHS.getValueType();
  assert(VT != MVT::f128);

  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (VT == MVT::f16 && !FullFP16) {
    LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                      {Chain, LHS});
    RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                      {LHS.getValue(1), RHS});
    Chain = RHS.getValue(1);
    VT = MVT::f32;
  }
  unsigned Opcode =
      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
}

/// Emit the flag-setting node for a comparison (FCMP for FP; SUBS, or ADDS /
/// ANDS when the operands allow the CMN / TST forms) and return its flags
/// result (the node's second value).
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    if (VT == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
      VT = MVT::f32;
    }
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  }

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted ; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
    if (LHS.getOpcode() == ISD::AND) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
      // of the signed comparisons.
      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
                                           DAG.getVTList(VT, MVT_CC),
                                           LHS.getOperand(0),
                                           LHS.getOperand(1));
      // Replace all users of (and X, Y) with newly generated (ands X, Y)
      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
      return ANDSNode.getValue(1);
    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
      // Use result of ANDS
      return LHS.getValue(1);
    }
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}

/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
///   - We can implement (NEG SETCC) i.e. negating a single comparison by
///     negating the flags used in a CCMP/FCCMP operations.
///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
///     by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCCMP ...) can be implemented.
///   - Note that we can only ever negate all previously processed results.
///     What we can not implement by flipping the flags to test is a negation
///     of two sub-trees (because the negation affects all sub-trees emitted so
///     far, so the 2nd sub-tree we emit would also affect the first).
3225 /// With those tools we can implement some OR operations: 3226 /// - (OR (SETCC A) (SETCC B)) can be implemented via: 3227 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) 3228 /// - After transforming OR to NEG/AND combinations we may be able to use NEG 3229 /// elimination rules from earlier to implement the whole thing as a 3230 /// CCMP/FCCMP chain. 3231 /// 3232 /// As complete example: 3233 /// or (or (setCA (cmp A)) (setCB (cmp B))) 3234 /// (and (setCC (cmp C)) (setCD (cmp D)))" 3235 /// can be reassociated to: 3236 /// or (and (setCC (cmp C)) setCD (cmp D)) 3237 // (or (setCA (cmp A)) (setCB (cmp B))) 3238 /// can be transformed to: 3239 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) 3240 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" 3241 /// which can be implemented as: 3242 /// cmp C 3243 /// ccmp D, inv(CD), CC 3244 /// ccmp A, CA, inv(CD) 3245 /// ccmp B, CB, inv(CA) 3246 /// check for CB flags 3247 /// 3248 /// A counterexample is "or (and A B) (and C D)" which translates to 3249 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we 3250 /// can only implement 1 of the inner (not) operations, but not both! 3251 /// @{ 3252 3253 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. 
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         AArch64CC::CondCode Predicate,
                                         AArch64CC::CondCode OutCC,
                                         const SDLoc &DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (LHS.getValueType().isFloatingPoint()) {
    assert(LHS.getValueType() != MVT::f128);
    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
      // Promote to f32 when the subtarget lacks full FP16 support.
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    Opcode = AArch64ISD::FCCMP;
  } else if (RHS.getOpcode() == ISD::SUB) {
    SDValue SubOp0 = RHS.getOperand(0);
    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // See emitComparison() on why we can only do this for SETEQ and SETNE.
      Opcode = AArch64ISD::CCMN;
      RHS = RHS.getOperand(1);
    }
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
  // If Predicate is false, the CCMP sets NZCV to the value that satisfies the
  // *inverse* of OutCC, so the overall result reads as "false".
  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}

/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
///                     changing the conditions on the SETCC tests.
///                     (this means we can call emitConjunctionRec() with
///                      Negate==true on this sub-tree)
/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
///                     cannot do the negation naturally. We are required to
///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
///                     subexpression must be negated. This happens when the
///                     outer expression is an OR. We can use this fact to know
///                     that we have a double negation (or (or ...) ...) that
///                     can be implemented for free.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
                               bool &MustBeFirst, bool WillNegate,
                               unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    if (Val->getOperand(0).getValueType() == MVT::f128)
      return false;
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    bool IsOR = Opcode == ISD::OR;
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
      return false;

    // Only one side may demand to be emitted first.
    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leafs, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == ISD::AND && "Must be OR or AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  return false;
}

/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
                                  AArch64CC::CondCode &OutCC, bool Negate,
                                  SDValue CCOp, AArch64CC::CondCode Predicate) {
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    if (Negate)
      CC = getSetCCInverse(CC, LHS.getValueType());
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
    if (isInteger) {
      OutCC = changeIntCCToAArch64CC(CC);
    } else {
      assert(LHS.getValueType().isFloatingPoint());
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        SDValue ExtraCmp;
        if (!CCOp.getNode())
          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
        else
          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                               ExtraCC, DL, DAG);
        CCOp = ExtraCmp;
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp)
      return emitComparison(LHS, RHS, CC, DL, DAG);
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                     DAG);
  }
  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == ISD::OR;

  SDValue LHS = Val->getOperand(0);
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  SDValue RHS = Val->getOperand(1);
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side.
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == ISD::OR) {
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the left sub-tree if possible, otherwise negate the result.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees: the right sub-tree first (it becomes the CCOp of the
  // left one), then the left sub-tree predicated on the right's result.
  AArch64CC::CondCode RHSCC;
  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}

/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
                               AArch64CC::CondCode &OutCC) {
  bool DummyCanNegate;
  bool DummyMustBeFirst;
  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
    return SDValue();

  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
}

/// @}

/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
3489 static unsigned getCmpOperandFoldingProfit(SDValue Op) { 3490 auto isSupportedExtend = [&](SDValue V) { 3491 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) 3492 return true; 3493 3494 if (V.getOpcode() == ISD::AND) 3495 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { 3496 uint64_t Mask = MaskCst->getZExtValue(); 3497 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); 3498 } 3499 3500 return false; 3501 }; 3502 3503 if (!Op.hasOneUse()) 3504 return 0; 3505 3506 if (isSupportedExtend(Op)) 3507 return 1; 3508 3509 unsigned Opc = Op.getOpcode(); 3510 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 3511 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 3512 uint64_t Shift = ShiftCst->getZExtValue(); 3513 if (isSupportedExtend(Op.getOperand(0))) 3514 return (Shift <= 4) ? 2 : 1; 3515 EVT VT = Op.getValueType(); 3516 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) 3517 return 1; 3518 } 3519 3520 return 0; 3521 } 3522 3523 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 3524 SDValue &AArch64cc, SelectionDAG &DAG, 3525 const SDLoc &dl) { 3526 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 3527 EVT VT = RHS.getValueType(); 3528 uint64_t C = RHSC->getZExtValue(); 3529 if (!isLegalArithImmed(C)) { 3530 // Constant does not fit, try adjusting it by one? 3531 switch (CC) { 3532 default: 3533 break; 3534 case ISD::SETLT: 3535 case ISD::SETGE: 3536 if ((VT == MVT::i32 && C != 0x80000000 && 3537 isLegalArithImmed((uint32_t)(C - 1))) || 3538 (VT == MVT::i64 && C != 0x80000000ULL && 3539 isLegalArithImmed(C - 1ULL))) { 3540 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 3541 C = (VT == MVT::i32) ? 
(uint32_t)(C - 1) : C - 1; 3542 RHS = DAG.getConstant(C, dl, VT); 3543 } 3544 break; 3545 case ISD::SETULT: 3546 case ISD::SETUGE: 3547 if ((VT == MVT::i32 && C != 0 && 3548 isLegalArithImmed((uint32_t)(C - 1))) || 3549 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 3550 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 3551 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 3552 RHS = DAG.getConstant(C, dl, VT); 3553 } 3554 break; 3555 case ISD::SETLE: 3556 case ISD::SETGT: 3557 if ((VT == MVT::i32 && C != INT32_MAX && 3558 isLegalArithImmed((uint32_t)(C + 1))) || 3559 (VT == MVT::i64 && C != INT64_MAX && 3560 isLegalArithImmed(C + 1ULL))) { 3561 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 3562 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 3563 RHS = DAG.getConstant(C, dl, VT); 3564 } 3565 break; 3566 case ISD::SETULE: 3567 case ISD::SETUGT: 3568 if ((VT == MVT::i32 && C != UINT32_MAX && 3569 isLegalArithImmed((uint32_t)(C + 1))) || 3570 (VT == MVT::i64 && C != UINT64_MAX && 3571 isLegalArithImmed(C + 1ULL))) { 3572 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 3573 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 3574 RHS = DAG.getConstant(C, dl, VT); 3575 } 3576 break; 3577 } 3578 } 3579 } 3580 3581 // Comparisons are canonicalized so that the RHS operand is simpler than the 3582 // LHS one, the extreme case being when RHS is an immediate. However, AArch64 3583 // can fold some shift+extend operations on the RHS operand, so swap the 3584 // operands if that can be done. 3585 // 3586 // For example: 3587 // lsl w13, w11, #1 3588 // cmp w13, w12 3589 // can be turned into: 3590 // cmp w12, w11, lsl #1 3591 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) { 3592 SDValue TheLHS = isCMN(LHS, CC) ? 
LHS.getOperand(1) : LHS; 3593 3594 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { 3595 std::swap(LHS, RHS); 3596 CC = ISD::getSetCCSwappedOperands(CC); 3597 } 3598 } 3599 3600 SDValue Cmp; 3601 AArch64CC::CondCode AArch64CC; 3602 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 3603 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 3604 3605 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 3606 // For the i8 operand, the largest immediate is 255, so this can be easily 3607 // encoded in the compare instruction. For the i16 operand, however, the 3608 // largest immediate cannot be encoded in the compare. 3609 // Therefore, use a sign extending load and cmn to avoid materializing the 3610 // -1 constant. For example, 3611 // movz w1, #65535 3612 // ldrh w0, [x0, #0] 3613 // cmp w0, w1 3614 // > 3615 // ldrsh w0, [x0, #0] 3616 // cmn w0, #1 3617 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 3618 // if and only if (sext LHS) == (sext RHS). The checks are in place to 3619 // ensure both the LHS and RHS are truly zero extended and to make sure the 3620 // transformation is profitable. 
3621 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 3622 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 3623 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 3624 LHS.getNode()->hasNUsesOfValue(1, 0)) { 3625 int16_t ValueofRHS = RHS->getAsZExtVal(); 3626 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 3627 SDValue SExt = 3628 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 3629 DAG.getValueType(MVT::i16)); 3630 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 3631 RHS.getValueType()), 3632 CC, dl, DAG); 3633 AArch64CC = changeIntCCToAArch64CC(CC); 3634 } 3635 } 3636 3637 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) { 3638 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { 3639 if ((CC == ISD::SETNE) ^ RHSC->isZero()) 3640 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 3641 } 3642 } 3643 } 3644 3645 if (!Cmp) { 3646 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 3647 AArch64CC = changeIntCCToAArch64CC(CC); 3648 } 3649 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 3650 return Cmp; 3651 } 3652 3653 static std::pair<SDValue, SDValue> 3654 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 3655 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 3656 "Unsupported value type"); 3657 SDValue Value, Overflow; 3658 SDLoc DL(Op); 3659 SDValue LHS = Op.getOperand(0); 3660 SDValue RHS = Op.getOperand(1); 3661 unsigned Opc = 0; 3662 switch (Op.getOpcode()) { 3663 default: 3664 llvm_unreachable("Unknown overflow instruction!"); 3665 case ISD::SADDO: 3666 Opc = AArch64ISD::ADDS; 3667 CC = AArch64CC::VS; 3668 break; 3669 case ISD::UADDO: 3670 Opc = AArch64ISD::ADDS; 3671 CC = AArch64CC::HS; 3672 break; 3673 case ISD::SSUBO: 3674 Opc = AArch64ISD::SUBS; 3675 CC = AArch64CC::VS; 3676 break; 3677 case ISD::USUBO: 3678 Opc = AArch64ISD::SUBS; 3679 CC = AArch64CC::LO; 3680 break; 3681 // Multiply needs a little bit extra work. 
3682 case ISD::SMULO: 3683 case ISD::UMULO: { 3684 CC = AArch64CC::NE; 3685 bool IsSigned = Op.getOpcode() == ISD::SMULO; 3686 if (Op.getValueType() == MVT::i32) { 3687 // Extend to 64-bits, then perform a 64-bit multiply. 3688 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3689 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 3690 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 3691 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 3692 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul); 3693 3694 // Check that the result fits into a 32-bit integer. 3695 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC); 3696 if (IsSigned) { 3697 // cmp xreg, wreg, sxtw 3698 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value); 3699 Overflow = 3700 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1); 3701 } else { 3702 // tst xreg, #0xffffffff00000000 3703 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64); 3704 Overflow = 3705 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1); 3706 } 3707 break; 3708 } 3709 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 3710 // For the 64 bit multiply 3711 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 3712 if (IsSigned) { 3713 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 3714 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 3715 DAG.getConstant(63, DL, MVT::i64)); 3716 // It is important that LowerBits is last, otherwise the arithmetic 3717 // shift will not be folded into the compare (SUBS). 
3718 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 3719 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 3720 .getValue(1); 3721 } else { 3722 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 3723 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 3724 Overflow = 3725 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 3726 DAG.getConstant(0, DL, MVT::i64), 3727 UpperBits).getValue(1); 3728 } 3729 break; 3730 } 3731 } // switch (...) 3732 3733 if (Opc) { 3734 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 3735 3736 // Emit the AArch64 operation with overflow check. 3737 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 3738 Overflow = Value.getValue(1); 3739 } 3740 return std::make_pair(Value, Overflow); 3741 } 3742 3743 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { 3744 if (useSVEForFixedLengthVectorVT(Op.getValueType(), 3745 !Subtarget->isNeonAvailable())) 3746 return LowerToScalableOp(Op, DAG); 3747 3748 SDValue Sel = Op.getOperand(0); 3749 SDValue Other = Op.getOperand(1); 3750 SDLoc dl(Sel); 3751 3752 // If the operand is an overflow checking operation, invert the condition 3753 // code and kill the Not operation. I.e., transform: 3754 // (xor (overflow_op_bool, 1)) 3755 // --> 3756 // (csel 1, 0, invert(cc), overflow_op_bool) 3757 // ... which later gets transformed to just a cset instruction with an 3758 // inverted condition code, rather than a cset + eor sequence. 3759 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) { 3760 // Only lower legal XALUO ops. 
3761 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) 3762 return SDValue(); 3763 3764 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3765 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3766 AArch64CC::CondCode CC; 3767 SDValue Value, Overflow; 3768 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); 3769 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 3770 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, 3771 CCVal, Overflow); 3772 } 3773 // If neither operand is a SELECT_CC, give up. 3774 if (Sel.getOpcode() != ISD::SELECT_CC) 3775 std::swap(Sel, Other); 3776 if (Sel.getOpcode() != ISD::SELECT_CC) 3777 return Op; 3778 3779 // The folding we want to perform is: 3780 // (xor x, (select_cc a, b, cc, 0, -1) ) 3781 // --> 3782 // (csel x, (xor x, -1), cc ...) 3783 // 3784 // The latter will get matched to a CSINV instruction. 3785 3786 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 3787 SDValue LHS = Sel.getOperand(0); 3788 SDValue RHS = Sel.getOperand(1); 3789 SDValue TVal = Sel.getOperand(2); 3790 SDValue FVal = Sel.getOperand(3); 3791 3792 // FIXME: This could be generalized to non-integer comparisons. 3793 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 3794 return Op; 3795 3796 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 3797 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 3798 3799 // The values aren't constants, this isn't the pattern we're looking for. 3800 if (!CFVal || !CTVal) 3801 return Op; 3802 3803 // We can commute the SELECT_CC by inverting the condition. This 3804 // might be needed to make this fit into a CSINV pattern. 3805 if (CTVal->isAllOnes() && CFVal->isZero()) { 3806 std::swap(TVal, FVal); 3807 std::swap(CTVal, CFVal); 3808 CC = ISD::getSetCCInverse(CC, LHS.getValueType()); 3809 } 3810 3811 // If the constants line up, perform the transform! 
3812 if (CTVal->isZero() && CFVal->isAllOnes()) { 3813 SDValue CCVal; 3814 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 3815 3816 FVal = Other; 3817 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 3818 DAG.getConstant(-1ULL, dl, Other.getValueType())); 3819 3820 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 3821 CCVal, Cmp); 3822 } 3823 3824 return Op; 3825 } 3826 3827 // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C' 3828 // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else 3829 // sets 'C' bit to 0. 3830 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { 3831 SDLoc DL(Value); 3832 EVT VT = Value.getValueType(); 3833 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value; 3834 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT); 3835 SDValue Cmp = 3836 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1); 3837 return Cmp.getValue(1); 3838 } 3839 3840 // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0. 3841 // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1. 3842 static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, 3843 bool Invert) { 3844 assert(Glue.getResNo() == 1); 3845 SDLoc DL(Glue); 3846 SDValue Zero = DAG.getConstant(0, DL, VT); 3847 SDValue One = DAG.getConstant(1, DL, VT); 3848 unsigned Cond = Invert ? 
AArch64CC::LO : AArch64CC::HS; 3849 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32); 3850 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); 3851 } 3852 3853 // Value is 1 if 'V' bit of NZCV is 1, else 0 3854 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) { 3855 assert(Glue.getResNo() == 1); 3856 SDLoc DL(Glue); 3857 SDValue Zero = DAG.getConstant(0, DL, VT); 3858 SDValue One = DAG.getConstant(1, DL, VT); 3859 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32); 3860 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue); 3861 } 3862 3863 // This lowering is inefficient, but it will get cleaned up by 3864 // `foldOverflowCheck` 3865 static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, 3866 unsigned Opcode, bool IsSigned) { 3867 EVT VT0 = Op.getValue(0).getValueType(); 3868 EVT VT1 = Op.getValue(1).getValueType(); 3869 3870 if (VT0 != MVT::i32 && VT0 != MVT::i64) 3871 return SDValue(); 3872 3873 bool InvertCarry = Opcode == AArch64ISD::SBCS; 3874 SDValue OpLHS = Op.getOperand(0); 3875 SDValue OpRHS = Op.getOperand(1); 3876 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry); 3877 3878 SDLoc DL(Op); 3879 SDVTList VTs = DAG.getVTList(VT0, VT1); 3880 3881 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS, 3882 OpRHS, OpCarryIn); 3883 3884 SDValue OutFlag = 3885 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG) 3886 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry); 3887 3888 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag); 3889 } 3890 3891 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 3892 // Let legalize expand this if it isn't a legal type yet. 3893 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 3894 return SDValue(); 3895 3896 SDLoc dl(Op); 3897 AArch64CC::CondCode CC; 3898 // The actual operation that sets the overflow or carry flag. 
3899 SDValue Value, Overflow; 3900 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 3901 3902 // We use 0 and 1 as false and true values. 3903 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 3904 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 3905 3906 // We use an inverted condition, because the conditional select is inverted 3907 // too. This will allow it to be selected to a single instruction: 3908 // CSINC Wd, WZR, WZR, invert(cond). 3909 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 3910 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 3911 CCVal, Overflow); 3912 3913 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 3914 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 3915 } 3916 3917 // Prefetch operands are: 3918 // 1: Address to prefetch 3919 // 2: bool isWrite 3920 // 3: int locality (0 = no locality ... 3 = extreme locality) 3921 // 4: bool isDataCache 3922 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 3923 SDLoc DL(Op); 3924 unsigned IsWrite = Op.getConstantOperandVal(2); 3925 unsigned Locality = Op.getConstantOperandVal(3); 3926 unsigned IsData = Op.getConstantOperandVal(4); 3927 3928 bool IsStream = !Locality; 3929 // When the locality number is set 3930 if (Locality) { 3931 // The front-end should have filtered out the out-of-range values 3932 assert(Locality <= 3 && "Prefetch locality out-of-range"); 3933 // The locality degree is the opposite of the cache speed. 3934 // Put the number the other way around. 3935 // The encoding starts at 0 for level 1 3936 Locality = 3 - Locality; 3937 } 3938 3939 // built the mask value encoding the expected behavior. 
3940 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 3941 (!IsData << 3) | // IsDataCache bit 3942 (Locality << 1) | // Cache level bits 3943 (unsigned)IsStream; // Stream bit 3944 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 3945 DAG.getTargetConstant(PrfOp, DL, MVT::i32), 3946 Op.getOperand(1)); 3947 } 3948 3949 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 3950 SelectionDAG &DAG) const { 3951 EVT VT = Op.getValueType(); 3952 if (VT.isScalableVector()) 3953 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); 3954 3955 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) 3956 return LowerFixedLengthFPExtendToSVE(Op, DAG); 3957 3958 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 3959 return SDValue(); 3960 } 3961 3962 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 3963 SelectionDAG &DAG) const { 3964 if (Op.getValueType().isScalableVector()) 3965 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); 3966 3967 bool IsStrict = Op->isStrictFPOpcode(); 3968 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 3969 EVT SrcVT = SrcVal.getValueType(); 3970 3971 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable())) 3972 return LowerFixedLengthFPRoundToSVE(Op, DAG); 3973 3974 if (SrcVT != MVT::f128) { 3975 // Expand cases where the input is a vector bigger than NEON. 3976 if (useSVEForFixedLengthVectorVT(SrcVT)) 3977 return SDValue(); 3978 3979 // It's legal except when f128 is involved 3980 return Op; 3981 } 3982 3983 return SDValue(); 3984 } 3985 3986 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, 3987 SelectionDAG &DAG) const { 3988 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 3989 // Any additional optimization in this function should be recorded 3990 // in the cost tables. 3991 bool IsStrict = Op->isStrictFPOpcode(); 3992 EVT InVT = Op.getOperand(IsStrict ? 
                             1 : 0).getValueType();
  EVT VT = Op.getValueType();

  if (VT.isScalableVector()) {
    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthFPToIntToSVE(Op, DAG);

  unsigned NumElts = InVT.getVectorNumElements();

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (InVT.getVectorElementType() == MVT::f16 &&
      !Subtarget->hasFullFP16()) {
    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
    SDLoc dl(Op);
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    return DAG.getNode(
        Op.getOpcode(), dl, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
  }

  // Result narrower than the source: convert at the source's integer width
  // and truncate the result down to VT.
  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  if (VTSize < InVTSize) {
    SDLoc dl(Op);
    if (IsStrict) {
      InVT = InVT.changeVectorElementTypeToInteger();
      SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
                               {Op.getOperand(0), Op.getOperand(1)});
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
      return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
    }
    SDValue Cv =
        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
                    Op.getOperand(0));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
  }

  // Result wider than the source: extend the FP input to a vector whose
  // element width matches the result's, then convert.
  if (VTSize > InVTSize) {
    SDLoc dl(Op);
    MVT ExtVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
                         VT.getVectorNumElements());
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (NumElts == 1) {
    SDLoc dl(Op);
    SDValue Extract = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
  }

  // Type changing conversions are illegal.
  return Op;
}

SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                              SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (SrcVal.getValueType().isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    if (IsStrict) {
      SDValue Ext =
          DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                      {Op.getOperand(0), SrcVal});
      return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    return DAG.getNode(
        Op.getOpcode(), dl, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
  }

  if (SrcVal.getValueType() != MVT::f128) {
    // It's legal except when f128 is involved
    return Op;
  }

  // No custom lowering for an f128 source; fall back to generic handling.
  return SDValue();
}

// Lower vector FP_TO_[SU]INT_SAT.
SDValue
AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
                                                SelectionDAG &DAG) const {
  // AArch64 FP-to-int conversions saturate to the destination element size, so
  // we can lower common saturating conversions to simple instructions.
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();

  uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
  uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstElementWidth &&
         "Saturation width cannot exceed result width");

  // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
  // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
  // types, so this is hard to reach.
  if (DstVT.isScalableVector())
    return SDValue();

  EVT SrcElementVT = SrcVT.getVectorElementType();

  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  if (SrcElementVT == MVT::f16 &&
      (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
    MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
    SrcVT = F32VT;
    SrcElementVT = MVT::f32;
    SrcElementWidth = 32;
  } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
             SrcElementVT != MVT::f16)
    return SDValue();

  SDLoc DL(Op);
  // Cases that we can emit directly.
  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
                       DAG.getValueType(DstVT.getScalarType()));

  // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  // result. This is only valid if the legal cvt is larger than the saturate
  // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
  // (at least until sqxtn is selected).
  if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
    return SDValue();

  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
  SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
                                  DAG.getValueType(IntVT.getScalarType()));
  SDValue Sat;
  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
    // Signed: clamp into [signed-min, signed-max] of the saturation width.
    SDValue MinC = DAG.getConstant(
        APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
    SDValue MaxC = DAG.getConstant(
        APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
  } else {
    // Unsigned: clamp to the all-ones value of the saturation width.
    SDValue MinC = DAG.getConstant(
        APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
  }

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}

// Lower scalar FP_TO_[SU]INT_SAT.
SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue
                                                      Op,
                                                  SelectionDAG &DAG) const {
  // AArch64 FP-to-int conversions saturate to the destination register size, so
  // we can lower common saturating conversions to simple instructions.
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();

  if (SrcVT.isVector())
    return LowerVectorFP_TO_INT_SAT(Op, DAG);

  EVT DstVT = Op.getValueType();
  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  uint64_t DstWidth = DstVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");

  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
    SrcVT = MVT::f32;
  } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
    return SDValue();

  SDLoc DL(Op);
  // Cases that we can emit directly.
  if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
       (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
      DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
                       DAG.getValueType(DstVT));

  // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  // result. This is only valid if the legal cvt is larger than the saturate
  // width.
  if (DstWidth < SatWidth)
    return SDValue();

  SDValue NativeCvt =
      DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
  SDValue Sat;
  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
    // Signed: clamp with SMIN against signed max, then SMAX against signed min.
    SDValue MinC = DAG.getConstant(
        APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
    SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
    SDValue MaxC = DAG.getConstant(
        APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
    Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
  } else {
    // Unsigned: clamp with UMIN against the all-ones value of SatWidth.
    SDValue MinC = DAG.getConstant(
        APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
    Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
  }

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}

SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  EVT InVT = In.getValueType();
  unsigned Opc = Op.getOpcode();
  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;

  if (VT.isScalableVector()) {
    if (InVT.getVectorElementType() == MVT::i1) {
      // We can't directly extend an SVE predicate; extend it first.
      unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      EVT CastVT = getPromotedVTForPredicate(InVT);
      In = DAG.getNode(CastOpc, dl, CastVT, In);
      return DAG.getNode(Opc, dl, VT, In);
    }

    unsigned Opcode = IsSigned ?
                                 AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthIntToFPToSVE(Op, DAG);

  // Result narrower than the source: convert at the source's element width,
  // then round the FP result down to VT.
  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  if (VTSize < InVTSize) {
    MVT CastVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
                         InVT.getVectorNumElements());
    if (IsStrict) {
      In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
                       {Op.getOperand(0), In});
      return DAG.getNode(
          ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
          {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
    }
    In = DAG.getNode(Opc, dl, CastVT, In);
    return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
  }

  // Result wider than the source: widen the integer input to the result's
  // element width first (sign- or zero-extending to match the conversion).
  if (VTSize > InVTSize) {
    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    EVT CastVT = VT.changeVectorElementTypeToInteger();
    In = DAG.getNode(CastOpc, dl, CastVT, In);
    if (IsStrict)
      return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
    return DAG.getNode(Opc, dl, VT, In);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (VT.getVectorNumElements() == 1) {
    SDValue Extract = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
        In, DAG.getConstant(0, dl, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
  }

  return Op;
}

SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    if (IsStrict) {
      SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
                                {Op.getOperand(0), SrcVal});
      return DAG.getNode(
          ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
          {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
    }
    return DAG.getNode(
        ISD::FP_ROUND, dl, MVT::f16,
        DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
        DAG.getIntPtrConstant(0, dl));
  }

  // i128 conversions are libcalls.
  if (SrcVal.getValueType() == MVT::i128)
    return SDValue();

  // Other conversions are legal, unless it's to the completely software-based
  // fp128.
  if (Op.getValueType() != MVT::f128)
    return Op;
  return SDValue();
}

SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
                                            SelectionDAG &DAG) const {
  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // which returns the values in two S / D registers.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  ArgListTy Args;
  ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
                                        : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

  // The callee returns a struct of two values of the argument's FP type.
  StructType *RetTy = StructType::get(ArgTy, ArgTy);
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}

static MVT getSVEContainerType(EVT ContentTy);

// Lower ISD::BITCAST for the custom cases: fixed-length vectors that must use
// SVE, scalable vectors, and scalar f16/bf16 results.
SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
                                            SelectionDAG &DAG) const {
  EVT OpVT = Op.getValueType();
  EVT ArgVT = Op.getOperand(0).getValueType();

  if (useSVEForFixedLengthVectorVT(OpVT))
    return LowerFixedLengthBitcastToSVE(Op, DAG);

  if (OpVT.isScalableVector()) {
    // Bitcasting between unpacked vector types of different element counts is
    // not a NOP because the live elements are laid out differently.
    //                01234567
    // e.g. nxv2i32 = XX??XX??
    //      nxv4f16 = X?X?X?X?
    if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
      return SDValue();

    if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
      assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
             "Expected int->fp bitcast!");
      // Widen the illegal integer type to its SVE container type before
      // performing the safe bitcast to the FP result type.
      SDValue ExtResult =
          DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
                      Op.getOperand(0));
      return getSVESafeBitCast(OpVT, ExtResult, DAG);
    }
    return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
  }

  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
    return SDValue();

  // Bitcasts between f16 and bf16 are legal.
  if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
    return Op;

  assert(ArgVT == MVT::i16);
  SDLoc DL(Op);

  // i16 -> f16/bf16: go via f32 (extend to i32, bitcast, then take the low
  // half of the FP register via the hsub subregister).
  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
  return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
}

// Return the 64-bit (or larger) vector type that a sub-64-bit vector widens
// to when used as a VMULL operand.
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  switch (OrigSimpleTy) {
  default: llvm_unreachable("Unexpected Vector Type");
  case MVT::v2i8:
  case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
  }
}

static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
                                                 const EVT &OrigTy,
                                                 const EVT &ExtTy,
                                                 unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}

// Returns lane if Op extracts from a two-element vector and lane is constant
// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
static std::optional<uint64_t>
getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
  SDNode *OpNode = Op.getNode();
  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return std::nullopt;

  EVT VT = OpNode->getOperand(0).getValueType();
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
  if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
    return std::nullopt;

  return C->getZExtValue();
}

// Returns true if N is a BUILD_VECTOR of constants that all fit in half of
// the element width (signed or unsigned per isSigned), i.e. the vector acts
// like an extension of a half-width vector.
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
                                   bool isSigned) {
  EVT VT = N.getValueType();

  if (N.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (const SDValue &Elt : N->op_values()) {
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    // Non-constant element: cannot prove the half-width property.
    return false;
  }

  return true;
}

// Strip the extension from a (known-extended) 128-bit MULL operand, returning
// the equivalent half-element-width vector.
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
  EVT VT = N.getValueType();
  assert(VT.is128BitVector() && "Unexpected vector MULL size");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned OrigEltSize = VT.getScalarSizeInBits();
  unsigned EltSize = OrigEltSize / 2;
  MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);

  // If the high half of every element is known zero, a plain truncate is
  // enough.
  APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
  if (DAG.MaskedValueIsZero(N, HiBits))
    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);

  if (ISD::isExtOpcode(N.getOpcode()))
    return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
                                             N.getOperand(0).getValueType(), VT,
                                             N.getOpcode());

  assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  SDLoc dl(N);
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N.getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(TruncVT, dl, Ops);
}

// Returns true if N behaves as a sign-extended value (sign/any extend, or a
// constant BUILD_VECTOR fitting the half-width signed range).
static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
  return N.getOpcode() == ISD::SIGN_EXTEND ||
         N.getOpcode() == ISD::ANY_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, true);
}

// Returns true if N behaves as a zero-extended value (zero/any extend, or a
// constant BUILD_VECTOR fitting the half-width unsigned range).
static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
  return N.getOpcode() == ISD::ZERO_EXTEND ||
         N.getOpcode() == ISD::ANY_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, false);
}

// Returns true if N is an ADD/SUB whose two single-use operands are both
// sign-extended.
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
  unsigned Opcode = N.getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

// Returns true if N is an ADD/SUB whose two single-use operands are both
// zero-extended.
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
  unsigned Opcode = N.getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDValue N0 = N.getOperand(0);
    SDValue N1 =
        N.getOperand(1);
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
                                                 SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPCR + 1 << 22) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  SDLoc dl(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue FPCR_64 = DAG.getNode(
      ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
      {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
  Chain = FPCR_64.getValue(1);
  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                            DAG.getConstant(3, dl, MVT::i32));
  return DAG.getMergeValues({AND, Chain}, dl);
}

SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue RMValue = Op->getOperand(1);

  // The rounding mode is in bits 23:22 of the FPCR.
  // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
  // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // ((arg - 1) & 3) << 22).
  //
  // The argument of llvm.set.rounding must be within the segment [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is responsibility of the code
  // generated llvm.set.rounding to ensure this condition.

  // Calculate new value of FPCR[23:22].
  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
                        DAG.getConstant(1, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
                        DAG.getConstant(0x3, DL, MVT::i32));
  RMValue =
      DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
                  DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);

  // Get current value of FPCR.
  SDValue Ops[] = {
      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
  SDValue FPCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
  Chain = FPCR.getValue(1);
  FPCR = FPCR.getValue(0);

  // Put new rounding mode into FPCR[23:22].
  const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
  FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
                     DAG.getConstant(RMMask, DL, MVT::i64));
  FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
  SDValue Ops2[] = {
      Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
      FPCR};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}

// Pick SMULL/UMULL (or 0 for neither) for the multiply N0 * N1, possibly
// rewriting N0/N1 in place (e.g. replacing a zext with an equivalent sext).
// Sets IsMLA when the multiply should instead be split into two widening
// multiplies feeding the add/sub found in N0 (see LowerMUL).
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
                                 SDLoc DL, bool &IsMLA) {
  bool IsN0SExt = isSignExtended(N0, DAG);
  bool IsN1SExt = isSignExtended(N1, DAG);
  if (IsN0SExt && IsN1SExt)
    return AArch64ISD::SMULL;

  bool IsN0ZExt = isZeroExtended(N0, DAG);
  bool IsN1ZExt = isZeroExtended(N1, DAG);

  if (IsN0ZExt && IsN1ZExt)
    return AArch64ISD::UMULL;

  // Select SMULL if we can replace zext with sext.
  if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
      !isExtendedBUILD_VECTOR(N0, DAG, false) &&
      !isExtendedBUILD_VECTOR(N1, DAG, false)) {
    SDValue ZextOperand;
    if (IsN0ZExt)
      ZextOperand = N0.getOperand(0);
    else
      ZextOperand = N1.getOperand(0);
    // If the sign bit of the zero-extended operand is known clear, a sext of
    // it produces the same value, allowing SMULL.
    if (DAG.SignBitIsZero(ZextOperand)) {
      SDValue NewSext =
          DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
      if (IsN0ZExt)
        N0 = NewSext;
      else
        N1 = NewSext;
      return AArch64ISD::SMULL;
    }
  }

  // Select UMULL if we can replace the other operand with an extend.
  if (IsN0ZExt || IsN1ZExt) {
    EVT VT = N0.getValueType();
    APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
                                       VT.getScalarSizeInBits() / 2);
    if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
      return AArch64ISD::UMULL;
  }

  if (!IsN1SExt && !IsN1ZExt)
    return 0;

  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
  if (IsN1SExt && isAddSubSExt(N0, DAG)) {
    IsMLA = true;
    return AArch64ISD::SMULL;
  }
  if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
    IsMLA = true;
    return AArch64ISD::UMULL;
  }
  if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
    std::swap(N0, N1);
    IsMLA = true;
    return AArch64ISD::UMULL;
  }
  return 0;
}

SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  bool OverrideNEON = !Subtarget->isNeonAvailable();
  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);

  // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
  // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  bool isMLA = false;
  EVT OVT = VT;
  if (VT.is64BitVector()) {
    // If both operands are low-half extracts of a wider vector, look through
    // to the full-width operands and multiply those instead.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        isNullConstant(N0.getOperand(1)) &&
        N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        isNullConstant(N1.getOperand(1))) {
      N0 = N0.getOperand(0);
      N1 = N1.getOperand(0);
      VT = N0.getValueType();
    } else {
      if (VT == MVT::v1i64) {
        if (Subtarget->hasSVE())
          return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
        // Fall through to expand this.  It is not legal.
        return SDValue();
      } else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  SDLoc DL(Op);
  unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);

  if (!NewOpc) {
    if (VT.getVectorElementType() == MVT::i64) {
      // If SVE is available then i64 vector multiplications can also be made
      // legal.
      if (Subtarget->hasSVE())
        return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
      // Fall through to expand this.  It is not legal.
      return SDValue();
    } else
      // Other vector multiplications are legal.
      return Op;
  }

  // Legalize to a S/UMULL instruction
  SDValue Op0;
  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
  if (!isMLA) {
    Op0 = skipExtensionForVectorMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
                       DAG.getNode(NewOpc, DL, VT, Op0, Op1),
                       DAG.getConstant(0, DL, MVT::i64));
  }
  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
  SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
  SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(
      ISD::EXTRACT_SUBVECTOR, DL, OVT,
      DAG.getNode(N0.getOpcode(), DL, VT,
                  DAG.getNode(NewOpc, DL, VT,
                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                  DAG.getNode(NewOpc, DL, VT,
                              DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
      DAG.getConstant(0, DL, MVT::i64));
}

// Materialize an SVE PTRUE with the given predicate pattern (an all-true
// nxv1i1 is representable as a plain constant).
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
                               int Pattern) {
  if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
    return DAG.getConstant(1, DL, MVT::nxv1i1);
  return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
                     DAG.getTargetConstant(Pattern, DL, MVT::i32));
}

// Try to fold a WHILE intrinsic with two constant operands into a PTRUE with
// a fixed predicate pattern.
static SDValue optimizeWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned,
                             bool IsLess, bool IsEqual) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
      !isa<ConstantSDNode>(Op.getOperand(2)))
    return SDValue();

  SDLoc dl(Op);
  APInt X = Op.getConstantOperandAPInt(1);
  APInt Y = Op.getConstantOperandAPInt(2);
  APInt NumActiveElems;
  bool Overflow;
  if
      (IsLess)
    // "While less-than": the active element count is Y - X.
    NumActiveElems = IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
  else
    // "While greater-than": the active element count is X - Y.
    NumActiveElems = IsSigned ? X.ssub_ov(Y, Overflow) : X.usub_ov(Y, Overflow);

  if (Overflow)
    return SDValue();

  if (IsEqual) {
    // Inclusive comparison: one extra active element, again overflow-checked.
    APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
    NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
                              : NumActiveElems.uadd_ov(One, Overflow);
    if (Overflow)
      return SDValue();
  }

  // Only emit a PTRUE if the count maps to a known predicate pattern and is
  // guaranteed to fit within the minimum SVE vector length.
  std::optional<unsigned> PredPattern =
      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
  unsigned MinSVEVectorSize = std::max(
      DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
  unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
  if (PredPattern != std::nullopt &&
      NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
    return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);

  return SDValue();
}

// Returns a safe bitcast between two scalable vector predicates, where
// any newly created lanes from a widening bitcast are defined as zero.
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(InVT.getVectorElementType() == MVT::i1 &&
         VT.getVectorElementType() == MVT::i1 &&
         "Expected a predicate-to-predicate bitcast");
  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         InVT.isScalableVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable predicate types!");

  // Return the operand if the cast isn't changing type,
  // e.g. <n x 16 x i1> -> <n x 16 x i1>
  if (InVT == VT)
    return Op;

  SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  // We only have to zero the lanes if new lanes are being defined, e.g. when
  // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
  // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
  // we can return here.
  if (InVT.bitsGT(VT))
    return Reinterpret;

  // Check if the other lanes are already known to be zeroed by
  // construction.
  if (isZeroingInactiveLanes(Op))
    return Reinterpret;

  // Zero the newly introduced lanes.
  SDValue Mask = DAG.getConstant(1, DL, InVT);
  Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
  return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
}

// Return the value of PSTATE.SM as a constant when it is statically known
// from the function's SME attributes; otherwise query it at runtime through
// the __arm_sme_state support routine.
SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
                                           SMEAttrs Attrs, SDLoc DL,
                                           EVT VT) const {
  if (Attrs.hasStreamingInterfaceOrBody())
    return DAG.getConstant(1, DL, VT);

  if (Attrs.hasNonStreamingInterfaceAndBody())
    return DAG.getConstant(0, DL, VT);

  assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface");

  SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
                                         getPointerTy(DAG.getDataLayout()));
  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
  Type *RetTy = StructType::get(Int64Ty, Int64Ty);
  TargetLowering::CallLoweringInfo CLI(DAG);
  ArgListTy Args;
  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
      RetTy, Callee, std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  // Mask off bit 0 (PSTATE.SM) of the first returned value.
  SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
  return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
                     Mask);
}
// ZA intrinsic
// Case 1: If the vector number (vecnum) is an immediate in range, it gets
// folded into the instruction
//    ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
// Case 2: If the vecnum is not an immediate, then it is used to modify the base
// and tile slice registers
//    ldr(%tileslice, %ptr, %vecnum)
//    ->
//        %svl = rdsvl
//        %ptr2 = %ptr + %svl * %vecnum
//        %tileslice2 = %tileslice + %vecnum
//        ldr [%tileslice2, 0], [%ptr2, 0]
// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are modified by the greatest
// multiple of 16 not exceeding the vecnum and the remainder is folded into the
// instruction. This means that successive loads and stores that are offset from
// each other can share the same base and slice register updates.
//    ldr(%tileslice, %ptr, 22)
//    ldr(%tileslice, %ptr, 23)
//    ->
//        %svl = rdsvl
//        %ptr2 = %ptr + %svl * 16
//        %tileslice2 = %tileslice + 16
//        ldr [%tileslice2, 6], [%ptr2, 6]
//        ldr [%tileslice2, 7], [%ptr2, 7]
// Case 4: If the vecnum is an add of an immediate, then the non-immediate
// operand and the immediate can be folded into the instruction, like case 2.
//    ldr(%tileslice, %ptr, %vecnum + 7)
//    ldr(%tileslice, %ptr, %vecnum + 8)
//    ->
//        %svl = rdsvl
//        %ptr2 = %ptr + %svl * %vecnum
//        %tileslice2 = %tileslice + %vecnum
//        ldr [%tileslice2, 7], [%ptr2, 7]
//        ldr [%tileslice2, 8], [%ptr2, 8]
// Case 5: The vecnum being an add of an immediate out of range is also handled,
// in which case the same remainder logic as case 3 is used.
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
  SDLoc DL(N);

  // Intrinsic operand layout: (chain, intrinsic-id, tileslice, ptr, vnum).
  SDValue TileSlice = N->getOperand(2);
  SDValue Base = N->getOperand(3);
  SDValue VecNum = N->getOperand(4);
  int32_t ConstAddend = 0;
  SDValue VarAddend = VecNum;

  // If the vnum is an add of an immediate, we can fold it into the instruction
  if (VecNum.getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(VecNum.getOperand(1))) {
    ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
    VarAddend = VecNum.getOperand(0);
  } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
    // Fully-constant vnum: there is no variable part at all.
    ConstAddend = ImmNode->getSExtValue();
    VarAddend = SDValue();
  }

  // Split the constant part into a multiple of 16 (folded into the base and
  // tile-slice updates below) and a remainder that becomes the instruction's
  // immediate offset.
  // NOTE(review): for a negative ConstAddend, C++ '%' yields a negative
  // remainder -- confirm a negative immediate is valid for SME LDR/STR here.
  int32_t ImmAddend = ConstAddend % 16;
  if (int32_t C = (ConstAddend - ImmAddend)) {
    SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
    VarAddend = VarAddend
                    ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
                    : CVal;
  }

  if (VarAddend) {
    // Get the vector length that will be multiplied by vnum
    auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                           DAG.getConstant(1, DL, MVT::i32));

    // Multiply SVL and vnum then add it to the base
    SDValue Mul = DAG.getNode(
        ISD::MUL, DL, MVT::i64,
        {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
    Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
    // Just add vnum to the tileslice
    TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
  }

  // Emit the LDR/STR node with only the in-range remainder as its immediate.
  return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
                     DL, MVT::Other,
                     {/*Chain=*/N.getOperand(0), TileSlice, Base,
                      DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
}

/// Custom lowering for chain-only (void-result) AArch64 intrinsics:
/// prefetch, SME ZA load/store, and SME ZA enable/disable.
SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                                   SelectionDAG &DAG) const {
  unsigned IntNo = Op.getConstantOperandVal(1);
  SDLoc DL(Op);
  switch (IntNo) {
  default:
    return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::aarch64_prefetch: {
    SDValue Chain = Op.getOperand(0);
    SDValue Addr = Op.getOperand(2);

    unsigned IsWrite = Op.getConstantOperandVal(3);
    unsigned Locality = Op.getConstantOperandVal(4);
    unsigned IsStream = Op.getConstantOperandVal(5);
    unsigned IsData = Op.getConstantOperandVal(6);
    // Pack the generic prefetch operands into a single 5-bit immediate.
    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
                     (!IsData << 3) |    // IsDataCache bit
                     (Locality << 1) |   // Cache level bits
                     (unsigned)IsStream; // Stream bit

    return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
                       DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
  }
  case Intrinsic::aarch64_sme_str:
  case Intrinsic::aarch64_sme_ldr: {
    return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
  }
  case Intrinsic::aarch64_sme_za_enable:
    return DAG.getNode(
        AArch64ISD::SMSTART, DL, MVT::Other,
        Op->getOperand(0), // Chain
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  case Intrinsic::aarch64_sme_za_disable:
    return DAG.getNode(
        AArch64ISD::SMSTOP, DL, MVT::Other,
        Op->getOperand(0), // Chain
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  }
}

SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
  unsigned IntNo =
      Op.getConstantOperandVal(1);
  SDLoc DL(Op);
  switch (IntNo) {
  default:
    return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::aarch64_mops_memset_tag: {
    auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
    SDValue Chain = Node->getChain();
    SDValue Dst = Op.getOperand(2);
    SDValue Val = Op.getOperand(3);
    Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
    SDValue Size = Op.getOperand(4);
    auto Alignment = Node->getMemOperand()->getAlign();
    bool IsVol = Node->isVolatile();
    auto DstPtrInfo = Node->getPointerInfo();

    const auto &SDI =
        static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
    SDValue MS =
        SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
                     Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});

    // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
    // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
    // LowerOperationWrapper will complain that the number of results has
    // changed.
    return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
  }
  }
}

/// Custom lowering for chainless AArch64 intrinsics: maps each target
/// intrinsic onto an equivalent generic ISD node or AArch64ISD node.
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  unsigned IntNo = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::aarch64_neon_abs: {
    EVT Ty = Op.getValueType();
    if (Ty == MVT::i64) {
      // Scalar i64 abs is performed in the vector unit via v1i64.
      SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
                                   Op.getOperand(1));
      Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
    } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinic");
    }
  }
  case Intrinsic::aarch64_neon_pmull64: {
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);

    std::optional<uint64_t> LHSLane =
        getConstantLaneNumOfExtractHalfOperand(LHS);
    std::optional<uint64_t> RHSLane =
        getConstantLaneNumOfExtractHalfOperand(RHS);

    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");

    // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
    // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, generate a ldr into d*
    // registers as opposed to a GPR load followed by a fmov.
    auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
                                  std::optional<uint64_t> OtherLane,
                                  const SDLoc &dl,
                                  SelectionDAG &DAG) -> SDValue {
      // If the operand is an higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 could
      // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
      if (NLane && *NLane == 1)
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));

      // Operand N is not a higher half but the other operand is.
      if (OtherLane && *OtherLane == 1) {
        // If this operand is a lower half, rewrite it to
        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
        // align lanes of two operands. A roundtrip sequence (to move from lane
        // 1 to lane 0) is like this:
        //   mov x8, v0.d[1]
        //   fmov d0, x8
        if (NLane && *NLane == 0)
          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
                             DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
                                         N.getOperand(0),
                                         DAG.getConstant(0, dl, MVT::i64)),
                             DAG.getConstant(1, dl, MVT::i64));

        // Otherwise just dup from main to all lanes.
        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
      }

      // Neither operand is an extract of higher half, so codegen may just use
      // the non-high version of PMULL instruction. Use v1i64 to represent i64.
      assert(N.getValueType() == MVT::i64 &&
             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
    };

    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);

    return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
  }
  // NEON min/max intrinsics map directly onto the generic ISD nodes.
  case Intrinsic::aarch64_neon_smax:
    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umax:
    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_smin:
    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umin:
    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_scalar_sqxtn:
  case Intrinsic::aarch64_neon_scalar_sqxtun:
  case Intrinsic::aarch64_neon_scalar_uqxtn: {
    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
    // For an i32 result, re-express via the f32-typed form of the intrinsic
    // with bitcasts around it.
    if (Op.getValueType() == MVT::i32)
      return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
                                     Op.getOperand(0),
                                     DAG.getNode(ISD::BITCAST, dl, MVT::f64,
                                                 Op.getOperand(1))));
    return SDValue();
  }
  // SVE predicate-generating 'while' compares.
  case Intrinsic::aarch64_sve_whilelo:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilelt:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilels:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/true,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilele:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/true,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilege:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilegt:
    return optimizeWhile(Op, DAG, /*IsSigned=*/true, /*IsLess=*/false,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_whilehs:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
                         /*IsEqual=*/true);
  case Intrinsic::aarch64_sve_whilehi:
    return optimizeWhile(Op, DAG, /*IsSigned=*/false, /*IsLess=*/false,
                         /*IsEqual=*/false);
  case Intrinsic::aarch64_sve_sunpkhi:
    return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_sunpklo:
    return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpkhi:
    return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpklo:
    return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_clasta_n:
    return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_clastb_n:
    return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_lasta:
    return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_lastb:
    return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_rev:
    return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_tbl:
    return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  // SVE permute intrinsics map 1:1 onto AArch64ISD permute nodes.
  case Intrinsic::aarch64_sve_trn1:
    return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn2:
    return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp1:
    return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp2:
    return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip1:
    return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip2:
    return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_splice:
    return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_ptrue:
    return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
  case Intrinsic::aarch64_sve_clz:
    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  // SME streaming-vector-length queries: RDSVL #1 yields the length in
  // bytes; halfword/word/doubleword counts are derived by shifting right.
  case Intrinsic::aarch64_sme_cntsb:
    return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                       DAG.getConstant(1, dl, MVT::i32));
  case Intrinsic::aarch64_sme_cntsh: {
    SDValue One = DAG.getConstant(1, dl, MVT::i32);
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
  }
  case Intrinsic::aarch64_sme_cntsw: {
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                                DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
                       DAG.getConstant(2, dl, MVT::i32));
  }
  case Intrinsic::aarch64_sme_cntsd: {
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                                DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
                       DAG.getConstant(3, dl, MVT::i32));
  }
  case Intrinsic::aarch64_sve_cnt: {
    SDValue Data = Op.getOperand(3);
    // CTPOP only supports integer operands.
    if (Data.getValueType().isFloatingPoint())
      Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Data, Op.getOperand(1));
  }
  case Intrinsic::aarch64_sve_dupq_lane:
    return LowerDUPQLane(Op, DAG);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    if (Op.getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_convert_to_svbool:
    if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
      return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
  // Predicated SVE unary ops: operands are (passthru, pred, data); the
  // MERGE_PASSTHRU nodes take (pred, data, passthru).
  case Intrinsic::aarch64_sve_fneg:
    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintp:
    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintm:
    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinti:
    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintx:
    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinta:
    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintz:
    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_ucvtf:
    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_scvtf:
    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzu:
    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzs:
    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fsqrt:
    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpx:
    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpe_x:
    return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecps_x:
    return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_frsqrte_x:
    return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frsqrts_x:
    return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_fabs:
    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_abs:
    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_neg:
    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_insr: {
    SDValue Scalar = Op.getOperand(2);
    EVT ScalarTy = Scalar.getValueType();
    // Sub-i32 scalars are inserted as i32.
    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
      Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

    return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
                       Op.getOperand(1), Scalar);
  }
  case Intrinsic::aarch64_sve_rbit:
    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_revb:
    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revh:
    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revw:
    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revd:
    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  // Predicated in-register sign/zero extension from a narrower element type,
  // encoded via a VTSign operand.
  case Intrinsic::aarch64_sve_sxtb:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxth:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxtw:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtb:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxth:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtw:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::localaddress: {
    const auto &MF = DAG.getMachineFunction();
    const auto *RegInfo = Subtarget->getRegisterInfo();
    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
                              Op.getSimpleValueType());
  }

  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer D53541
    // for more details.
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.eh.recoverfp must take a function as the first argument");
    return IncomingFPOp;
  }

  case Intrinsic::aarch64_neon_vsri:
  case Intrinsic::aarch64_neon_vsli: {
    EVT Ty = Op.getValueType();

    if (!Ty.isVector())
      report_fatal_error("Unexpected type for aarch64_neon_vsli");

    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3));
  }

  // Halving adds: select among the four generic averaging nodes based on
  // signedness and whether the intrinsic rounds.
  case Intrinsic::aarch64_neon_srhadd:
  case Intrinsic::aarch64_neon_urhadd:
  case Intrinsic::aarch64_neon_shadd:
  case Intrinsic::aarch64_neon_uhadd: {
    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                        IntNo == Intrinsic::aarch64_neon_shadd);
    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                          IntNo == Intrinsic::aarch64_neon_urhadd);
    unsigned Opcode = IsSignedAdd
                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_saddlp:
  case Intrinsic::aarch64_neon_uaddlp: {
    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
                          ? AArch64ISD::UADDLP
                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
  }
  case Intrinsic::aarch64_neon_sdot:
  case Intrinsic::aarch64_neon_udot:
  case Intrinsic::aarch64_sve_sdot:
  case Intrinsic::aarch64_sve_udot: {
    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
                       IntNo == Intrinsic::aarch64_sve_udot)
                          ? AArch64ISD::UDOT
                          : AArch64ISD::SDOT;
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::get_active_lane_mask: {
    // Re-express the generic lane-mask intrinsic as SVE whilelo.
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_uaddlv: {
    EVT OpVT = Op.getOperand(1).getValueType();
    EVT ResVT = Op.getValueType();
    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
                              OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
      // Use a v4i32 (rather than v2i32) result to avoid an insert_subvector.
      SDValue UADDLV =
          DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
      SDValue EXTRACT_VEC_ELT =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
                      DAG.getConstant(0, dl, MVT::i64));
      return EXTRACT_VEC_ELT;
    }
    return SDValue();
  }
  case Intrinsic::experimental_cttz_elts: {
    SDValue NewCttzElts =
        DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));

    return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
  }
  }
}

// Request promotion of i8/i16 gather/scatter indices to i32.
bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
  if (VT.getVectorElementType() == MVT::i8 ||
      VT.getVectorElementType() == MVT::i16) {
    EltTy = MVT::i32;
    return true;
  }
  return false;
}

bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
                                                          EVT DataVT) const {
  const EVT IndexVT = Extend.getOperand(0).getValueType();
  // SVE only supports implicit extension of 32-bit indices.
  if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
    return false;

  // Indices cannot be smaller than the main data type.
  if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
    return false;

  // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
  // element container type, which would violate the previous clause.
  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
}

bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT ExtVT = ExtVal.getValueType();
  if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
    return false;

  // It may be worth creating extending masked loads if there are multiple
  // masked loads using the same predicate. That way we'll end up creating
  // extending masked loads that may then get split by the legaliser.
This 5522 // results in just one set of predicate unpacks at the start, instead of 5523 // multiple sets of vector unpacks after each load. 5524 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) { 5525 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) { 5526 // Disable extending masked loads for fixed-width for now, since the code 5527 // quality doesn't look great. 5528 if (!ExtVT.isScalableVector()) 5529 return false; 5530 5531 unsigned NumExtMaskedLoads = 0; 5532 for (auto *U : Ld->getMask()->uses()) 5533 if (isa<MaskedLoadSDNode>(U)) 5534 NumExtMaskedLoads++; 5535 5536 if (NumExtMaskedLoads <= 1) 5537 return false; 5538 } 5539 } 5540 5541 return true; 5542 } 5543 5544 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { 5545 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { 5546 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), 5547 AArch64ISD::GLD1_MERGE_ZERO}, 5548 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), 5549 AArch64ISD::GLD1_UXTW_MERGE_ZERO}, 5550 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), 5551 AArch64ISD::GLD1_MERGE_ZERO}, 5552 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), 5553 AArch64ISD::GLD1_SXTW_MERGE_ZERO}, 5554 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), 5555 AArch64ISD::GLD1_SCALED_MERGE_ZERO}, 5556 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), 5557 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, 5558 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), 5559 AArch64ISD::GLD1_SCALED_MERGE_ZERO}, 5560 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), 5561 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, 5562 }; 5563 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); 5564 return AddrModes.find(Key)->second; 5565 } 5566 5567 unsigned getSignExtendedGatherOpcode(unsigned Opcode) { 5568 switch 
(Opcode) { 5569 default: 5570 llvm_unreachable("unimplemented opcode"); 5571 return Opcode; 5572 case AArch64ISD::GLD1_MERGE_ZERO: 5573 return AArch64ISD::GLD1S_MERGE_ZERO; 5574 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 5575 return AArch64ISD::GLD1S_IMM_MERGE_ZERO; 5576 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 5577 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; 5578 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 5579 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; 5580 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 5581 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; 5582 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 5583 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; 5584 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 5585 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; 5586 } 5587 } 5588 5589 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, 5590 SelectionDAG &DAG) const { 5591 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); 5592 5593 SDLoc DL(Op); 5594 SDValue Chain = MGT->getChain(); 5595 SDValue PassThru = MGT->getPassThru(); 5596 SDValue Mask = MGT->getMask(); 5597 SDValue BasePtr = MGT->getBasePtr(); 5598 SDValue Index = MGT->getIndex(); 5599 SDValue Scale = MGT->getScale(); 5600 EVT VT = Op.getValueType(); 5601 EVT MemVT = MGT->getMemoryVT(); 5602 ISD::LoadExtType ExtType = MGT->getExtensionType(); 5603 ISD::MemIndexType IndexType = MGT->getIndexType(); 5604 5605 // SVE supports zero (and so undef) passthrough values only, everything else 5606 // must be handled manually by an explicit select on the load's output. 
5607 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) { 5608 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale}; 5609 SDValue Load = 5610 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, 5611 MGT->getMemOperand(), IndexType, ExtType); 5612 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru); 5613 return DAG.getMergeValues({Select, Load.getValue(1)}, DL); 5614 } 5615 5616 bool IsScaled = MGT->isIndexScaled(); 5617 bool IsSigned = MGT->isIndexSigned(); 5618 5619 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else 5620 // must be calculated before hand. 5621 uint64_t ScaleVal = Scale->getAsZExtVal(); 5622 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { 5623 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); 5624 EVT IndexVT = Index.getValueType(); 5625 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, 5626 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); 5627 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); 5628 5629 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; 5630 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops, 5631 MGT->getMemOperand(), IndexType, ExtType); 5632 } 5633 5634 // Lower fixed length gather to a scalable equivalent. 5635 if (VT.isFixedLengthVector()) { 5636 assert(Subtarget->useSVEForFixedLengthVectors() && 5637 "Cannot lower when not using SVE for fixed vectors!"); 5638 5639 // NOTE: Handle floating-point as if integer then bitcast the result. 5640 EVT DataVT = VT.changeVectorElementTypeToInteger(); 5641 MemVT = MemVT.changeVectorElementTypeToInteger(); 5642 5643 // Find the smallest integer fixed length vector we can use for the gather. 
5644 EVT PromotedVT = VT.changeVectorElementType(MVT::i32); 5645 if (DataVT.getVectorElementType() == MVT::i64 || 5646 Index.getValueType().getVectorElementType() == MVT::i64 || 5647 Mask.getValueType().getVectorElementType() == MVT::i64) 5648 PromotedVT = VT.changeVectorElementType(MVT::i64); 5649 5650 // Promote vector operands except for passthrough, which we know is either 5651 // undef or zero, and thus best constructed directly. 5652 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5653 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); 5654 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); 5655 5656 // A promoted result type forces the need for an extending load. 5657 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD) 5658 ExtType = ISD::EXTLOAD; 5659 5660 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); 5661 5662 // Convert fixed length vector operands to scalable. 5663 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); 5664 Index = convertToScalableVector(DAG, ContainerVT, Index); 5665 Mask = convertFixedMaskToScalableVector(Mask, DAG); 5666 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT) 5667 : DAG.getConstant(0, DL, ContainerVT); 5668 5669 // Emit equivalent scalable vector gather. 5670 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; 5671 SDValue Load = 5672 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL, 5673 Ops, MGT->getMemOperand(), IndexType, ExtType); 5674 5675 // Extract fixed length data then convert to the required result type. 5676 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load); 5677 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result); 5678 if (VT.isFloatingPoint()) 5679 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); 5680 5681 return DAG.getMergeValues({Result, Load.getValue(1)}, DL); 5682 } 5683 5684 // Everything else is legal. 
5685 return Op; 5686 } 5687 5688 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, 5689 SelectionDAG &DAG) const { 5690 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); 5691 5692 SDLoc DL(Op); 5693 SDValue Chain = MSC->getChain(); 5694 SDValue StoreVal = MSC->getValue(); 5695 SDValue Mask = MSC->getMask(); 5696 SDValue BasePtr = MSC->getBasePtr(); 5697 SDValue Index = MSC->getIndex(); 5698 SDValue Scale = MSC->getScale(); 5699 EVT VT = StoreVal.getValueType(); 5700 EVT MemVT = MSC->getMemoryVT(); 5701 ISD::MemIndexType IndexType = MSC->getIndexType(); 5702 bool Truncating = MSC->isTruncatingStore(); 5703 5704 bool IsScaled = MSC->isIndexScaled(); 5705 bool IsSigned = MSC->isIndexSigned(); 5706 5707 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else 5708 // must be calculated before hand. 5709 uint64_t ScaleVal = Scale->getAsZExtVal(); 5710 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { 5711 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); 5712 EVT IndexVT = Index.getValueType(); 5713 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, 5714 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT)); 5715 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType()); 5716 5717 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; 5718 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, 5719 MSC->getMemOperand(), IndexType, Truncating); 5720 } 5721 5722 // Lower fixed length scatter to a scalable equivalent. 5723 if (VT.isFixedLengthVector()) { 5724 assert(Subtarget->useSVEForFixedLengthVectors() && 5725 "Cannot lower when not using SVE for fixed vectors!"); 5726 5727 // Once bitcast we treat floating-point scatters as if integer. 
5728 if (VT.isFloatingPoint()) { 5729 VT = VT.changeVectorElementTypeToInteger(); 5730 MemVT = MemVT.changeVectorElementTypeToInteger(); 5731 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal); 5732 } 5733 5734 // Find the smallest integer fixed length vector we can use for the scatter. 5735 EVT PromotedVT = VT.changeVectorElementType(MVT::i32); 5736 if (VT.getVectorElementType() == MVT::i64 || 5737 Index.getValueType().getVectorElementType() == MVT::i64 || 5738 Mask.getValueType().getVectorElementType() == MVT::i64) 5739 PromotedVT = VT.changeVectorElementType(MVT::i64); 5740 5741 // Promote vector operands. 5742 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5743 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index); 5744 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask); 5745 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal); 5746 5747 // A promoted value type forces the need for a truncating store. 5748 if (PromotedVT != VT) 5749 Truncating = true; 5750 5751 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT); 5752 5753 // Convert fixed length vector operands to scalable. 5754 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType()); 5755 Index = convertToScalableVector(DAG, ContainerVT, Index); 5756 Mask = convertFixedMaskToScalableVector(Mask, DAG); 5757 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal); 5758 5759 // Emit equivalent scalable vector scatter. 5760 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; 5761 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops, 5762 MSC->getMemOperand(), IndexType, Truncating); 5763 } 5764 5765 // Everything else is legal. 
5766 return Op; 5767 } 5768 5769 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { 5770 SDLoc DL(Op); 5771 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op); 5772 assert(LoadNode && "Expected custom lowering of a masked load node"); 5773 EVT VT = Op->getValueType(0); 5774 5775 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) 5776 return LowerFixedLengthVectorMLoadToSVE(Op, DAG); 5777 5778 SDValue PassThru = LoadNode->getPassThru(); 5779 SDValue Mask = LoadNode->getMask(); 5780 5781 if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) 5782 return Op; 5783 5784 SDValue Load = DAG.getMaskedLoad( 5785 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(), 5786 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(), 5787 LoadNode->getMemOperand(), LoadNode->getAddressingMode(), 5788 LoadNode->getExtensionType()); 5789 5790 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru); 5791 5792 return DAG.getMergeValues({Result, Load.getValue(1)}, DL); 5793 } 5794 5795 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. 5796 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, 5797 EVT VT, EVT MemVT, 5798 SelectionDAG &DAG) { 5799 assert(VT.isVector() && "VT should be a vector type"); 5800 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); 5801 5802 SDValue Value = ST->getValue(); 5803 5804 // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract 5805 // the word lane which represent the v4i8 subvector. 
It optimizes the store 5806 // to: 5807 // 5808 // xtn v0.8b, v0.8h 5809 // str s0, [x0] 5810 5811 SDValue Undef = DAG.getUNDEF(MVT::i16); 5812 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, 5813 {Undef, Undef, Undef, Undef}); 5814 5815 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, 5816 Value, UndefVec); 5817 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); 5818 5819 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); 5820 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, 5821 Trunc, DAG.getConstant(0, DL, MVT::i64)); 5822 5823 return DAG.getStore(ST->getChain(), DL, ExtractTrunc, 5824 ST->getBasePtr(), ST->getMemOperand()); 5825 } 5826 5827 // Custom lowering for any store, vector or scalar and/or default or with 5828 // a truncate operations. Currently only custom lower truncate operation 5829 // from vector v4i16 to v4i8 or volatile stores of i128. 5830 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, 5831 SelectionDAG &DAG) const { 5832 SDLoc Dl(Op); 5833 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 5834 assert (StoreNode && "Can only custom lower store nodes"); 5835 5836 SDValue Value = StoreNode->getValue(); 5837 5838 EVT VT = Value.getValueType(); 5839 EVT MemVT = StoreNode->getMemoryVT(); 5840 5841 if (VT.isVector()) { 5842 if (useSVEForFixedLengthVectorVT( 5843 VT, 5844 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) 5845 return LowerFixedLengthVectorStoreToSVE(Op, DAG); 5846 5847 unsigned AS = StoreNode->getAddressSpace(); 5848 Align Alignment = StoreNode->getAlign(); 5849 if (Alignment < MemVT.getStoreSize() && 5850 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment, 5851 StoreNode->getMemOperand()->getFlags(), 5852 nullptr)) { 5853 return scalarizeVectorStore(StoreNode, DAG); 5854 } 5855 5856 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 && 5857 MemVT == MVT::v4i8) { 5858 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); 5859 } 
5860 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of 5861 // the custom lowering, as there are no un-paired non-temporal stores and 5862 // legalization will break up 256 bit inputs. 5863 ElementCount EC = MemVT.getVectorElementCount(); 5864 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && 5865 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() && 5866 (MemVT.getScalarSizeInBits() == 8u || 5867 MemVT.getScalarSizeInBits() == 16u || 5868 MemVT.getScalarSizeInBits() == 32u || 5869 MemVT.getScalarSizeInBits() == 64u)) { 5870 SDValue Lo = 5871 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 5872 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 5873 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); 5874 SDValue Hi = 5875 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, 5876 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), 5877 StoreNode->getValue(), 5878 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); 5879 SDValue Result = DAG.getMemIntrinsicNode( 5880 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), 5881 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, 5882 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 5883 return Result; 5884 } 5885 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { 5886 return LowerStore128(Op, DAG); 5887 } else if (MemVT == MVT::i64x8) { 5888 SDValue Value = StoreNode->getValue(); 5889 assert(Value->getValueType(0) == MVT::i64x8); 5890 SDValue Chain = StoreNode->getChain(); 5891 SDValue Base = StoreNode->getBasePtr(); 5892 EVT PtrVT = Base.getValueType(); 5893 for (unsigned i = 0; i < 8; i++) { 5894 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, 5895 Value, DAG.getConstant(i, Dl, MVT::i32)); 5896 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, 5897 DAG.getConstant(i * 8, Dl, PtrVT)); 5898 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), 5899 StoreNode->getOriginalAlign()); 5900 } 5901 return Chain; 5902 } 5903 5904 
return SDValue(); 5905 } 5906 5907 /// Lower atomic or volatile 128-bit stores to a single STP instruction. 5908 SDValue AArch64TargetLowering::LowerStore128(SDValue Op, 5909 SelectionDAG &DAG) const { 5910 MemSDNode *StoreNode = cast<MemSDNode>(Op); 5911 assert(StoreNode->getMemoryVT() == MVT::i128); 5912 assert(StoreNode->isVolatile() || StoreNode->isAtomic()); 5913 5914 bool IsStoreRelease = 5915 StoreNode->getMergedOrdering() == AtomicOrdering::Release; 5916 if (StoreNode->isAtomic()) 5917 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) && 5918 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) || 5919 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || 5920 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); 5921 5922 SDValue Value = (StoreNode->getOpcode() == ISD::STORE || 5923 StoreNode->getOpcode() == ISD::ATOMIC_STORE) 5924 ? StoreNode->getOperand(1) 5925 : StoreNode->getOperand(2); 5926 SDLoc DL(Op); 5927 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64); 5928 unsigned Opcode = IsStoreRelease ? 
AArch64ISD::STILP : AArch64ISD::STP; 5929 if (DAG.getDataLayout().isBigEndian()) 5930 std::swap(StoreValue.first, StoreValue.second); 5931 SDValue Result = DAG.getMemIntrinsicNode( 5932 Opcode, DL, DAG.getVTList(MVT::Other), 5933 {StoreNode->getChain(), StoreValue.first, StoreValue.second, 5934 StoreNode->getBasePtr()}, 5935 StoreNode->getMemoryVT(), StoreNode->getMemOperand()); 5936 return Result; 5937 } 5938 5939 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, 5940 SelectionDAG &DAG) const { 5941 SDLoc DL(Op); 5942 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 5943 assert(LoadNode && "Expected custom lowering of a load node"); 5944 5945 if (LoadNode->getMemoryVT() == MVT::i64x8) { 5946 SmallVector<SDValue, 8> Ops; 5947 SDValue Base = LoadNode->getBasePtr(); 5948 SDValue Chain = LoadNode->getChain(); 5949 EVT PtrVT = Base.getValueType(); 5950 for (unsigned i = 0; i < 8; i++) { 5951 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, 5952 DAG.getConstant(i * 8, DL, PtrVT)); 5953 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, 5954 LoadNode->getPointerInfo(), 5955 LoadNode->getOriginalAlign()); 5956 Ops.push_back(Part); 5957 Chain = SDValue(Part.getNode(), 1); 5958 } 5959 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); 5960 return DAG.getMergeValues({Loaded, Chain}, DL); 5961 } 5962 5963 // Custom lowering for extending v4i8 vector loads. 
5964 EVT VT = Op->getValueType(0); 5965 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); 5966 5967 if (LoadNode->getMemoryVT() != MVT::v4i8) 5968 return SDValue(); 5969 5970 unsigned ExtType; 5971 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) 5972 ExtType = ISD::SIGN_EXTEND; 5973 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || 5974 LoadNode->getExtensionType() == ISD::EXTLOAD) 5975 ExtType = ISD::ZERO_EXTEND; 5976 else 5977 return SDValue(); 5978 5979 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), 5980 LoadNode->getBasePtr(), MachinePointerInfo()); 5981 SDValue Chain = Load.getValue(1); 5982 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); 5983 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); 5984 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); 5985 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, 5986 DAG.getConstant(0, DL, MVT::i64)); 5987 if (VT == MVT::v4i32) 5988 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); 5989 return DAG.getMergeValues({Ext, Chain}, DL); 5990 } 5991 5992 // Generate SUBS and CSEL for integer abs. 5993 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { 5994 MVT VT = Op.getSimpleValueType(); 5995 5996 if (VT.isVector()) 5997 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); 5998 5999 SDLoc DL(Op); 6000 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 6001 Op.getOperand(0)); 6002 // Generate SUBS & CSEL. 
6003 SDValue Cmp = 6004 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), 6005 Op.getOperand(0), DAG.getConstant(0, DL, VT)); 6006 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, 6007 DAG.getConstant(AArch64CC::PL, DL, MVT::i32), 6008 Cmp.getValue(1)); 6009 } 6010 6011 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 6012 SDValue Chain = Op.getOperand(0); 6013 SDValue Cond = Op.getOperand(1); 6014 SDValue Dest = Op.getOperand(2); 6015 6016 AArch64CC::CondCode CC; 6017 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) { 6018 SDLoc dl(Op); 6019 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32); 6020 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 6021 Cmp); 6022 } 6023 6024 return SDValue(); 6025 } 6026 6027 // Treat FSHR with constant shifts as legal operation, otherwise it is expanded 6028 // FSHL is converted to FSHR before deciding what to do with it 6029 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) { 6030 SDValue Shifts = Op.getOperand(2); 6031 // Check if the shift amount is a constant 6032 // If opcode is FSHL, convert it to FSHR 6033 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) { 6034 SDLoc DL(Op); 6035 MVT VT = Op.getSimpleValueType(); 6036 6037 if (Op.getOpcode() == ISD::FSHL) { 6038 unsigned int NewShiftNo = 6039 VT.getFixedSizeInBits() - ShiftNo->getZExtValue(); 6040 return DAG.getNode( 6041 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1), 6042 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType())); 6043 } else if (Op.getOpcode() == ISD::FSHR) { 6044 return Op; 6045 } 6046 } 6047 6048 return SDValue(); 6049 } 6050 6051 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { 6052 SDValue X = Op.getOperand(0); 6053 EVT XScalarTy = X.getValueType(); 6054 SDValue Exp = Op.getOperand(1); 6055 6056 SDLoc DL(Op); 6057 EVT XVT, ExpVT; 6058 switch (Op.getSimpleValueType().SimpleTy) { 6059 default: 6060 return SDValue(); 6061 case MVT::f16: 6062 X = 
DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X); 6063 [[fallthrough]]; 6064 case MVT::f32: 6065 XVT = MVT::nxv4f32; 6066 ExpVT = MVT::nxv4i32; 6067 break; 6068 case MVT::f64: 6069 XVT = MVT::nxv2f64; 6070 ExpVT = MVT::nxv2i64; 6071 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp); 6072 break; 6073 } 6074 6075 SDValue Zero = DAG.getConstant(0, DL, MVT::i64); 6076 SDValue VX = 6077 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero); 6078 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT, 6079 DAG.getUNDEF(ExpVT), Exp, Zero); 6080 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1), 6081 AArch64SVEPredPattern::all); 6082 SDValue FScale = 6083 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT, 6084 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), 6085 VPg, VX, VExp); 6086 SDValue Final = 6087 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero); 6088 if (X.getValueType() != XScalarTy) 6089 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final, 6090 DAG.getIntPtrConstant(1, SDLoc(Op))); 6091 return Final; 6092 } 6093 6094 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 6095 SelectionDAG &DAG) const { 6096 LLVM_DEBUG(dbgs() << "Custom lowering: "); 6097 LLVM_DEBUG(Op.dump()); 6098 6099 switch (Op.getOpcode()) { 6100 default: 6101 llvm_unreachable("unimplemented operand"); 6102 return SDValue(); 6103 case ISD::BITCAST: 6104 return LowerBITCAST(Op, DAG); 6105 case ISD::GlobalAddress: 6106 return LowerGlobalAddress(Op, DAG); 6107 case ISD::GlobalTLSAddress: 6108 return LowerGlobalTLSAddress(Op, DAG); 6109 case ISD::SETCC: 6110 case ISD::STRICT_FSETCC: 6111 case ISD::STRICT_FSETCCS: 6112 return LowerSETCC(Op, DAG); 6113 case ISD::SETCCCARRY: 6114 return LowerSETCCCARRY(Op, DAG); 6115 case ISD::BRCOND: 6116 return LowerBRCOND(Op, DAG); 6117 case ISD::BR_CC: 6118 return LowerBR_CC(Op, DAG); 6119 case ISD::SELECT: 6120 return LowerSELECT(Op, DAG); 6121 case ISD::SELECT_CC: 6122 return 
/// Central dispatch for every operation this target marked Custom: forwards
/// the node to the matching Lower* routine. An empty SDValue from a handler
/// means "use the default expansion"; unknown opcodes are unreachable.
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Custom lowering: ");
  LLVM_DEBUG(Op.dump());

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unimplemented operand");
    return SDValue();
  case ISD::BITCAST:
    return LowerBITCAST(Op, DAG);
  // Addresses, branches and selects.
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:
    return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS:
    return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY:
    return LowerSETCCCARRY(Op, DAG);
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::JumpTable:
    return LowerJumpTable(Op, DAG);
  case ISD::BR_JT:
    return LowerBR_JT(Op, DAG);
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);
  // Varargs.
  case ISD::VASTART:
    return LowerVASTART(Op, DAG);
  case ISD::VACOPY:
    return LowerVACOPY(Op, DAG);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);
  // Carry/overflow arithmetic.
  case ISD::UADDO_CARRY:
    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
  case ISD::USUBO_CARRY:
    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
  case ISD::SADDO_CARRY:
    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
  case ISD::SSUBO_CARRY:
    return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return LowerXALUO(Op, DAG);
  // Floating-point ops lowered to SVE predicated forms.
  case ISD::FADD:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
  case ISD::FSUB:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
  case ISD::FMUL:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
  case ISD::FMA:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
  case ISD::FDIV:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
  case ISD::FNEG:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
  case ISD::FCEIL:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
  case ISD::FFLOOR:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
  case ISD::FNEARBYINT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
  case ISD::FRINT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
  case ISD::FROUND:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
  case ISD::FROUNDEVEN:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
  case ISD::FTRUNC:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
  case ISD::FSQRT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
  case ISD::FABS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:
    return LowerFP_EXTEND(Op, DAG);
  // Frame/return address queries.
  case ISD::FRAMEADDR:
    return LowerFRAMEADDR(Op, DAG);
  case ISD::SPONENTRY:
    return LowerSPONENTRY(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:
    return LowerADDROFRETURNADDR(Op, DAG);
  // Vector construction and shuffles.
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SPLAT_VECTOR:
    return LowerSPLAT_VECTOR(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return LowerINSERT_SUBVECTOR(Op, DAG);
  // Integer arithmetic and shifts.
  case ISD::SDIV:
  case ISD::UDIV:
    return LowerDIV(Op, DAG);
  case ISD::SMIN:
  case ISD::UMIN:
  case ISD::SMAX:
  case ISD::UMAX:
    return LowerMinMax(Op, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:
    return LowerVectorSRA_SRL_SHL(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:
    return LowerShiftParts(Op, DAG);
  case ISD::CTPOP:
  case ISD::PARITY:
    return LowerCTPOP_PARITY(Op, DAG);
  case ISD::FCOPYSIGN:
    return LowerFCOPYSIGN(Op, DAG);
  case ISD::OR:
    return LowerVectorOR(Op, DAG);
  case ISD::XOR:
    return LowerXOR(Op, DAG);
  case ISD::PREFETCH:
    return LowerPREFETCH(Op, DAG);
  // Int <-> FP conversions.
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP:
    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, DAG);
  case ISD::GET_ROUNDING:
    return LowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return LowerSET_ROUNDING(Op, DAG);
  case ISD::MUL:
    return LowerMUL(Op, DAG);
  case ISD::MULHS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
  case ISD::MULHU:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
  // Intrinsics.
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  // Memory operations.
  case ISD::ATOMIC_STORE:
    if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
      assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
      return LowerStore128(Op, DAG);
    }
    return SDValue();
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::MSTORE:
    return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
  case ISD::MGATHER:
    return LowerMGATHER(Op, DAG);
  case ISD::MSCATTER:
    return LowerMSCATTER(Op, DAG);
  // Reductions.
  case ISD::VECREDUCE_SEQ_FADD:
    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FADD:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:
  case ISD::VECREDUCE_FMAXIMUM:
  case ISD::VECREDUCE_FMINIMUM:
    return LowerVECREDUCE(Op, DAG);
  case ISD::ATOMIC_LOAD_AND:
    return LowerATOMIC_LOAD_AND(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::VSCALE:
    return LowerVSCALE(Op, DAG);
  // Extensions and truncations.
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
  case ISD::SIGN_EXTEND_INREG: {
    // Only custom lower when ExtraVT has a legal byte based element type.
    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    EVT ExtraEltVT = ExtraVT.getVectorElementType();
    if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
        (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
      return SDValue();

    return LowerToPredicatedOp(Op, DAG,
                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
  }
  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);
  case ISD::LOAD:
    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                     !Subtarget->isNeonAvailable()))
      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
    return LowerLOAD(Op, DAG);
  case ISD::ADD:
  case ISD::AND:
  case ISD::SUB:
    return LowerToScalableOp(Op, DAG);
  case ISD::FMAXIMUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
  case ISD::FMAXNUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
  case ISD::FMINIMUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
  case ISD::FMINNUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
  case ISD::VSELECT:
    return LowerFixedLengthVectorSelectToSVE(Op, DAG);
  case ISD::ABS:
    return LowerABS(Op, DAG);
  case ISD::ABDS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
  case ISD::ABDU:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
  case ISD::AVGFLOORS:
    return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
  case ISD::AVGFLOORU:
    return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
  case ISD::AVGCEILS:
    return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
  case ISD::AVGCEILU:
    return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
  case ISD::BITREVERSE:
    return LowerBitreverse(Op, DAG);
  case ISD::BSWAP:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
  case ISD::CTLZ:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
  case ISD::CTTZ:
    return LowerCTTZ(Op, DAG);
  case ISD::VECTOR_SPLICE:
    return LowerVECTOR_SPLICE(Op, DAG);
  case ISD::VECTOR_DEINTERLEAVE:
    return LowerVECTOR_DEINTERLEAVE(Op, DAG);
  case ISD::VECTOR_INTERLEAVE:
    return LowerVECTOR_INTERLEAVE(Op, DAG);
  // f16 rounding ops are widened to f32 first.
  case ISD::LROUND:
  case ISD::LLROUND:
  case ISD::LRINT:
  case ISD::LLRINT: {
    assert(Op.getOperand(0).getValueType() == MVT::f16 &&
           "Expected custom lowering of rounding operations only for f16");
    SDLoc DL(Op);
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
  }
  case ISD::STRICT_LROUND:
  case ISD::STRICT_LLROUND:
  case ISD::STRICT_LRINT:
  case ISD::STRICT_LLRINT: {
    assert(Op.getOperand(1).getValueType() == MVT::f16 &&
           "Expected custom lowering of rounding operations only for f16");
    SDLoc DL(Op);
    SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
                              {Op.getOperand(0), Op.getOperand(1)});
    return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
                       {Ext.getValue(1), Ext.getValue(0)});
  }
  case ISD::WRITE_REGISTER: {
    assert(Op.getOperand(2).getValueType() == MVT::i128 &&
           "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
    SDLoc DL(Op);

    SDValue Chain = Op.getOperand(0);
    SDValue SysRegName = Op.getOperand(1);
    std::pair<SDValue, SDValue> Pair =
        DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);

    // chain = MSRR(chain, sysregname, lo, hi)
    SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
                                 SysRegName, Pair.first, Pair.second);

    return Result;
  }
  case ISD::FSHL:
  case ISD::FSHR:
    return LowerFunnelShift(Op, DAG);
  case ISD::FLDEXP:
    return LowerFLDEXP(Op, DAG);
  }
}
    return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
                       {Ext.getValue(1), Ext.getValue(0)});
  }
  case ISD::WRITE_REGISTER: {
    assert(Op.getOperand(2).getValueType() == MVT::i128 &&
           "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
    SDLoc DL(Op);

    SDValue Chain = Op.getOperand(0);
    SDValue SysRegName = Op.getOperand(1);
    // Split the i128 value into two i64 halves for the two-register MSRR.
    std::pair<SDValue, SDValue> Pair =
        DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);

    // chain = MSRR(chain, sysregname, lo, hi)
    SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
                                 SysRegName, Pair.first, Pair.second);

    return Result;
  }
  case ISD::FSHL:
  case ISD::FSHR:
    return LowerFunnelShift(Op, DAG);
  case ISD::FLDEXP:
    return LowerFLDEXP(Op, DAG);
  }
}

// Only merge stores after legalization when fixed-length SVE lowering is not
// in use; with SVE enabled for fixed-length vectors, wider stores are handled
// by the SVE lowering instead.
bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
  return !Subtarget->useSVEForFixedLengthVectors();
}

// Returns true if the fixed-length vector type VT should be lowered using SVE
// instructions rather than NEON. When OverrideNEON is set, NEON-sized (64/128
// bit) vectors are also allowed to use SVE when the subtarget has SVE or SME.
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
    EVT VT, bool OverrideNEON) const {
  if (!VT.isFixedLengthVector() || !VT.isSimple())
    return false;

  // Don't use SVE for vectors we cannot scalarize if required.
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  // Fixed length predicates should be promoted to i8.
  // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
  case MVT::i1:
  default:
    return false;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
  case MVT::i64:
  case MVT::f16:
  case MVT::f32:
  case MVT::f64:
    break;
  }

  // NEON-sized vectors can be emulated using SVE instructions.
  if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
    return Subtarget->hasSVEorSME();

  // Ensure NEON MVTs only belong to a single register class.
  if (VT.getFixedSizeInBits() <= 128)
    return false;

  // Ensure wider than NEON code generation is enabled.
  if (!Subtarget->useSVEForFixedLengthVectors())
    return false;

  // Don't use SVE for types that don't fit.
  if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
    return false;

  // TODO: Perhaps an artificial restriction, but worth having whilst getting
  // the base fixed length SVE support in place.
  if (!VT.isPow2VectorType())
    return false;

  return true;
}

//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

// Returns the intrinsic ID of N when it is an INTRINSIC_WO_CHAIN node with an
// in-range ID, or Intrinsic::not_intrinsic otherwise.
static unsigned getIntrinsicID(const SDNode *N) {
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default:
    return Intrinsic::not_intrinsic;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = N->getConstantOperandVal(0);
    if (IID < Intrinsic::num_intrinsics)
      return IID;
    return Intrinsic::not_intrinsic;
  }
  }
}

// Reassociation of (N0 op N1) is unprofitable when it would break an
// add(x, mull) pattern: those combinations lower to smlal/umlal.
bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
                                               SDValue N1) const {
  if (!N0.hasOneUse())
    return false;

  unsigned IID = getIntrinsicID(N1.getNode());
  // Avoid reassociating expressions that can be lowered to smlal/umlal.
  if (IID == Intrinsic::aarch64_neon_umull ||
      N1.getOpcode() == AArch64ISD::UMULL ||
      IID == Intrinsic::aarch64_neon_smull ||
      N1.getOpcode() == AArch64ISD::SMULL)
    return N0.getOpcode() != ISD::ADD;

  return true;
}

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention.");
  case CallingConv::GHC:
    return CC_AArch64_GHC;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::PreserveMost:
  case CallingConv::PreserveAll:
  case CallingConv::CXX_FAST_TLS:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
  case CallingConv::Tail:
  case CallingConv::GRAAL:
    // For the C-like conventions the assigner is chosen by target OS:
    // Windows (incl. Arm64EC), then AAPCS, then Darwin (with ILP32 variant).
    if (Subtarget->isTargetWindows()) {
      if (IsVarArg) {
        if (Subtarget->isWindowsArm64EC())
          return CC_AArch64_Arm64EC_VarArg;
        return CC_AArch64_Win64_VarArg;
      }
      return CC_AArch64_Win64PCS;
    }
    if (!Subtarget->isTargetDarwin())
      return CC_AArch64_AAPCS;
    if (!IsVarArg)
      return CC_AArch64_DarwinPCS;
    return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
                                      : CC_AArch64_DarwinPCS_VarArg;
  case CallingConv::Win64:
    if (IsVarArg) {
      if (Subtarget->isWindowsArm64EC())
        return CC_AArch64_Arm64EC_VarArg;
      return CC_AArch64_Win64_VarArg;
    }
    return CC_AArch64_Win64PCS;
  case CallingConv::CFGuard_Check:
    return CC_AArch64_Win64_CFGuard_Check;
  case CallingConv::AArch64_VectorCall:
  case CallingConv::AArch64_SVE_VectorCall:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
    return CC_AArch64_AAPCS;
  }
}

// Return values are assigned per AAPCS for every supported convention.
CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
  return RetCC_AArch64_AAPCS;
}


// Allocates the SME lazy-save buffer (dynamically sized, SVL.B * SVL.B) plus
// a 16-byte TPIDR2 stack object, stores the buffer pointer into the TPIDR2
// object and zeroes its reserved bytes. Returns the TPIDR2 frame index and
// updates Chain with the emitted allocation/stores.
unsigned
AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
  SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                          DAG.getConstant(1, DL, MVT::i32));
  SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
  SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
  SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
  Chain = Buffer.getValue(1);
  MFI.CreateVariableSizedObject(Align(1), nullptr);

  // Allocate an additional TPIDR2 object on the stack (16 bytes)
  unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);

  // Store the buffer pointer to the TPIDR2 stack object.
  MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
  SDValue Ptr = DAG.getFrameIndex(
      TPIDR2Obj,
      DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);

  // Set the reserved bytes (10-15) to zero
  EVT PtrTy = Ptr.getValueType();
  SDValue ReservedPtr =
      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
                       MPI);
  ReservedPtr =
      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
                       MPI);

  return TPIDR2Obj;
}

// Lowers the incoming formal arguments described by Ins into InVals,
// assigning locations per the calling convention and materialising register
// copies / stack loads. Also handles byval, indirect (scalable) arguments,
// varargs save areas, locally-streaming SMSTART insertion and the SME
// lazy-save buffer.
SDValue AArch64TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
  AArch64FunctionInfo *FuncInfo =
      MF.getInfo<AArch64FunctionInfo>();

  // A scalable-vector return value also marks the function as using the SVE
  // calling convention.
  SmallVector<ISD::OutputArg, 4> Outs;
  GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
                DAG.getTargetLoweringInfo(), MF.getDataLayout());
  if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
    FuncInfo->setIsSVECC(true);

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  DenseMap<unsigned, SDValue> CopiedRegs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // At this point, Ins[].VT may already be promoted to i32. To correctly
  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
  // LocVT.
  unsigned NumArgs = Ins.size();
  Function::const_arg_iterator CurOrigArg = F.arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ValVT = Ins[i].VT;
    if (Ins[i].isOrigArg()) {
      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[i].getOrigArgIndex();

      // Get type of the original argument.
      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
                                  /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ValVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ValVT = MVT::i16;
    }
    // Win64 varargs pass everything through the vararg convention.
    bool UseVarArgCC = false;
    if (IsWin64)
      UseVarArgCC = isVarArg;
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
    bool Res =
        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
    (void)Res;
  }

  SMEAttrs Attrs(MF.getFunction());
  bool IsLocallyStreaming =
      !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
  assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
  SDValue Glue = Chain.getValue(1);

  SmallVector<SDValue, 16> ArgValues;
  // ExtraArgLocs counts additional ArgLocs consumed by multi-part indirect
  // arguments, so VA is indexed as ArgLocs[i - ExtraArgLocs].
  unsigned ExtraArgLocs = 0;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];

    if (Ins[i].Flags.isByVal()) {
      // Byval is used for HFAs in the PCS, but the system should work in a
      // non-compliant manner for larger structs.
      EVT PtrVT = getPointerTy(DAG.getDataLayout());
      int Size = Ins[i].Flags.getByValSize();
      unsigned NumRegs = (Size + 7) / 8;

      // FIXME: This works on big-endian for composite byvals, which are the common
      // case. It should also work for fundamental types too.
      unsigned FrameIdx =
          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
      InVals.push_back(FrameIdxN);

      continue;
    }

    if (Ins[i].Flags.isSwiftAsync())
      MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);

    SDValue ArgValue;
    if (VA.isRegLoc()) {
      // Arguments stored in registers.
      EVT RegVT = VA.getLocVT();
      const TargetRegisterClass *RC;

      if (RegVT == MVT::i32)
        RC = &AArch64::GPR32RegClass;
      else if (RegVT == MVT::i64)
        RC = &AArch64::GPR64RegClass;
      else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
        RC = &AArch64::FPR16RegClass;
      else if (RegVT == MVT::f32)
        RC = &AArch64::FPR32RegClass;
      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
        RC = &AArch64::FPR64RegClass;
      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
        RC = &AArch64::FPR128RegClass;
      else if (RegVT.isScalableVector() &&
               RegVT.getVectorElementType() == MVT::i1) {
        FuncInfo->setIsSVECC(true);
        RC = &AArch64::PPRRegClass;
      } else if (RegVT == MVT::aarch64svcount) {
        FuncInfo->setIsSVECC(true);
        RC = &AArch64::PPRRegClass;
      } else if (RegVT.isScalableVector()) {
        FuncInfo->setIsSVECC(true);
        RC = &AArch64::ZPRRegClass;
      } else
        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

      // Transform the arguments in physical registers into virtual ones.
      Register Reg = MF.addLiveIn(VA.getLocReg(), RC);

      if (IsLocallyStreaming) {
        // LocallyStreamingFunctions must insert the SMSTART in the correct
        // position, so we use Glue to ensure no instructions can be scheduled
        // between the chain of:
        // t0: ch,glue = EntryNode
        // t1: res,ch,glue = CopyFromReg
        // ...
        // tn: res,ch,glue = CopyFromReg t(n-1), ..
        // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
        // ^^^^^^
        // This will be the new Chain/Root node.
        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
        Glue = ArgValue.getValue(2);
      } else
        ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

      // If this is an 8, 16 or 32-bit value, it is really passed promoted
      // to 64 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default:
        llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full:
        break;
      case CCValAssign::Indirect:
        assert(
            (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
            "Indirect arguments should be scalable on most subtargets");
        break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::AExt:
      case CCValAssign::SExt:
      case CCValAssign::ZExt:
        break;
      case CCValAssign::AExtUpper:
        // Value lives in the upper 32 bits of the location register.
        ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
                               DAG.getConstant(32, DL, RegVT));
        ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
        break;
      }
    } else { // VA.isRegLoc()
      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
      unsigned ArgOffset = VA.getLocMemOffset();
      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
                              ? VA.getLocVT().getSizeInBits()
                              : VA.getValVT().getSizeInBits()) / 8;

      uint32_t BEAlign = 0;
      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
          !Ins[i].Flags.isInConsecutiveRegs())
        BEAlign = 8 - ArgSize;

      SDValue FIN;
      MachinePointerInfo PtrInfo;
      if (isVarArg && Subtarget->isWindowsArm64EC()) {
        // In the ARM64EC varargs convention, fixed arguments on the stack are
        // accessed relative to x4, not sp.
        unsigned ObjOffset = ArgOffset + BEAlign;
        Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
        FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
                          DAG.getConstant(ObjOffset, DL, MVT::i64));
        PtrInfo = MachinePointerInfo::getUnknownStack(MF);
      } else {
        int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

        // Create load nodes to retrieve arguments from the stack.
        FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
        PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
      }

      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
      MVT MemVT = VA.getValVT();

      switch (VA.getLocInfo()) {
      default:
        break;
      case CCValAssign::Trunc:
      case CCValAssign::BCvt:
        MemVT = VA.getLocVT();
        break;
      case CCValAssign::Indirect:
        assert((VA.getValVT().isScalableVector() ||
                Subtarget->isWindowsArm64EC()) &&
               "Indirect arguments should be scalable on most subtargets");
        MemVT = VA.getLocVT();
        break;
      case CCValAssign::SExt:
        ExtType = ISD::SEXTLOAD;
        break;
      case CCValAssign::ZExt:
        ExtType = ISD::ZEXTLOAD;
        break;
      case CCValAssign::AExt:
        ExtType = ISD::EXTLOAD;
        break;
      }

      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
                                MemVT);
    }

    if (VA.getLocInfo() == CCValAssign::Indirect) {
      assert((VA.getValVT().isScalableVT() ||
              Subtarget->isWindowsArm64EC()) &&
             "Indirect arguments should be scalable on most subtargets");

      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
      unsigned NumParts = 1;
      if (Ins[i].Flags.isInConsecutiveRegs()) {
        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
        while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
          ++NumParts;
      }

      MVT PartLoad = VA.getValVT();
      SDValue Ptr = ArgValue;

      // Ensure we generate all loads for each tuple part, whilst updating the
      // pointer after each load correctly using vscale.
      while (NumParts > 0) {
        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
        InVals.push_back(ArgValue);
        NumParts--;
        if (NumParts > 0) {
          SDValue BytesIncrement;
          if (PartLoad.isScalableVector()) {
            BytesIncrement = DAG.getVScale(
                DL, Ptr.getValueType(),
                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
          } else {
            BytesIncrement = DAG.getConstant(
                APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
                Ptr.getValueType());
          }
          SDNodeFlags Flags;
          Flags.setNoUnsignedWrap(true);
          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                            BytesIncrement, Flags);
          ExtraArgLocs++;
          i++;
        }
      }
    } else {
      if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
                               ArgValue, DAG.getValueType(MVT::i32));

      // i1 arguments are zero-extended to i8 by the caller. Emit a
      // hint to reflect this.
      if (Ins[i].isOrigArg()) {
        Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
        if (OrigArg->getType()->isIntegerTy(1)) {
          if (!Ins[i].Flags.isZExt()) {
            ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
                                   ArgValue.getValueType(), ArgValue);
          }
        }
      }

      InVals.push_back(ArgValue);
    }
  }
  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());

  // Insert the SMSTART if this is a locally streaming function and
  // make sure it is Glued to the last CopyFromReg value.
  if (IsLocallyStreaming) {
    Chain =
        changeStreamingMode(DAG, DL, /*Enable*/ true, DAG.getRoot(), Glue,
                            DAG.getConstant(0, DL, MVT::i64), /*Entry*/ true);

    // Ensure that the SMSTART happens after the CopyWithChain such that its
    // chain result is used.
    for (unsigned I=0; I<InVals.size(); ++I) {
      Register Reg = MF.getRegInfo().createVirtualRegister(
          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
      Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
      InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
                                     InVals[I].getValueType());
    }
  }

  // varargs
  if (isVarArg) {
    if (!Subtarget->isTargetDarwin() || IsWin64) {
      // The AAPCS variadic function ABI is identical to the non-variadic
      // one. As a result there may be more arguments in registers and we should
      // save them for future reference.
      // Win64 variadic functions also pass arguments in registers, but all float
      // arguments are passed in integer registers.
      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
    }

    // This will point to the next argument passed via stack.
    unsigned VarArgsOffset = CCInfo.getStackSize();
    // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
    VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
    FuncInfo->setVarArgsStackOffset(VarArgsOffset);
    FuncInfo->setVarArgsStackIndex(
        MFI.CreateFixedObject(4, VarArgsOffset, true));

    if (MFI.hasMustTailInVarArgFunc()) {
      SmallVector<MVT, 2> RegParmTypes;
      RegParmTypes.push_back(MVT::i64);
      RegParmTypes.push_back(MVT::f128);
      // Compute the set of forwarded registers. The rest are scratch.
      SmallVectorImpl<ForwardedRegister> &Forwards =
          FuncInfo->getForwardedMustTailRegParms();
      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
                                               CC_AArch64_AAPCS);

      // Conservatively forward X8, since it might be used for aggregate return.
      if (!CCInfo.isAllocated(AArch64::X8)) {
        Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
        Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
      }
    }
  }

  // On Windows, InReg pointers must be returned, so record the pointer in a
  // virtual register at the start of the function so it can be returned in the
  // epilogue.
  if (IsWin64) {
    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
      if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) {
        assert(!FuncInfo->getSRetReturnReg());

        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Register Reg =
            MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);

        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
        break;
      }
    }
  }

  unsigned StackArgSize = CCInfo.getStackSize();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
    // This is a non-standard ABI so by fiat I say we're allowed to make full
    // use of the stack area to be popped, which must be aligned to 16 bytes in
    // any case:
    StackArgSize = alignTo(StackArgSize, 16);

    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
    // a multiple of 16.
    FuncInfo->setArgumentStackToRestore(StackArgSize);

    // This realignment carries over to the available bytes below. Our own
    // callers will guarantee the space is free by giving an aligned value to
    // CALLSEQ_START.
  }
  // Even if we're not expected to free up the space, it's useful to know how
  // much is there while considering tail calls (because we can reuse it).
  FuncInfo->setBytesInStackArgArea(StackArgSize);

  if (Subtarget->hasCustomCallingConv())
    Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

  // Conservatively assume the function requires the lazy-save mechanism.
  if (SMEAttrs(MF.getFunction()).hasZAState()) {
    unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
    FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
  }

  return Chain;
}

// Spills the argument registers not consumed by named parameters into stack
// save areas so that va_arg can find them, recording the save-area frame
// indices/sizes in AArch64FunctionInfo. FPR registers are only saved when the
// subtarget has FP and the convention is not Win64 (Win64 passes floats in
// integer registers).
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

  SmallVector<SDValue, 8> MemOps;

  auto GPRArgRegs = AArch64::getGPRArgRegs();
  unsigned NumGPRArgRegs = GPRArgRegs.size();
  if (Subtarget->isWindowsArm64EC()) {
    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
    // functions.
    NumGPRArgRegs = 4;
  }
  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    if (IsWin64) {
      // Win64 reserves the save area at a fixed offset below the incoming SP.
      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
      if (GPRSaveSize & 15)
        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
    } else
      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);

    SDValue FIN;
    if (Subtarget->isWindowsArm64EC()) {
      // With the Arm64EC ABI, we reserve the save area as usual, but we
      // compute its address relative to x4. For a normal AArch64->AArch64
      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
      // different address.
      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
    } else {
      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
    }

    // Store each unallocated GPR argument register into consecutive 8-byte
    // slots of the save area.
    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), DL, Val, FIN,
                       IsWin64 ? MachinePointerInfo::getFixedStack(
                                     MF, GPRIdx, (i - FirstVariadicGPR) * 8)
                               : MachinePointerInfo::getStack(MF, i * 8));
      MemOps.push_back(Store);
      FIN =
          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
    }
  }
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

  if (Subtarget->hasFPARMv8() && !IsWin64) {
    auto FPRArgRegs = AArch64::getFPRArgRegs();
    const unsigned NumFPRArgRegs = FPRArgRegs.size();
    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
    int FPRIdx = 0;
    if (FPRSaveSize != 0) {
      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);

      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);

      // Store each unallocated FPR argument register as a full 16-byte q-reg.
      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                     MachinePointerInfo::getStack(MF, i * 16));
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                          DAG.getConstant(16, DL, PtrVT));
      }
    }
    FuncInfo->setVarArgsFPRIndex(FPRIdx);
    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  }

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  DenseMap<unsigned, SDValue> CopiedRegs;
  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
    // allows one use of a physreg per block.
    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
    if (!Val) {
      Val =
          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
      CopiedRegs[VA.getLocReg()] = Val;
    }

    // Convert the located value to the value type expected by the caller.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExtUpper:
      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      [[fallthrough]];
    case CCValAssign::AExt:
      [[fallthrough]];
    case CCValAssign::ZExt:
      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::AArch64_SVE_VectorCall:
  case CallingConv::PreserveMost:
  case CallingConv::PreserveAll:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
  case CallingConv::Tail:
  case CallingConv::Fast:
    return true;
  default:
    return false;
  }
}

// Runs the outgoing call operands in CLI through the appropriate CCAssignFn,
// populating CCInfo. Mirrors the ValVT adjustment done for formal arguments:
// i1/i8 original types are assigned as i8 and i16 as i16.
static void analyzeCallOperands(const AArch64TargetLowering &TLI,
                                const AArch64Subtarget *Subtarget,
                                const TargetLowering::CallLoweringInfo &CLI,
                                CCState &CCInfo) {
  const SelectionDAG &DAG = CLI.DAG;
  CallingConv::ID CalleeCC = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);

  unsigned NumArgs = Outs.size();
  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ArgVT = Outs[i].VT;
    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;

    bool UseVarArgCC = false;
    if (IsVarArg) {
      // On Windows, the fixed arguments in a vararg call are passed in GPRs
      // too, so use the vararg CC to force them to integer registers.
      if (IsCalleeWin64) {
        UseVarArgCC = true;
      } else {
        UseVarArgCC = !Outs[i].IsFixed;
      }
    }

    if (!UseVarArgCC) {
      // Get type of the original argument.
      EVT ActualVT =
          TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
                       /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      // (analyzeCallOperands, continued) The AAPCS passes i1/i8/i16 stack
      // arguments at their natural width, so narrow the VT used for location
      // assignment back down from the promoted register type.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ArgVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ArgVT = MVT::i16;
    }

    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
    (void)Res;
  }
}

/// Return true if the call described by \p CLI may be lowered as a tail call
/// without changing the observable behaviour of caller or callee. Checks
/// calling-convention compatibility, SME streaming/ZA state, byval/inreg
/// arguments, weak-linkage callees, callee-saved register masks, and whether
/// the outgoing stack arguments fit in the caller's own argument area.
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    const CallLoweringInfo &CLI) const {
  CallingConv::ID CalleeCC = CLI.CallConv;
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  SDValue Callee = CLI.Callee;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  const SelectionDAG &DAG = CLI.DAG;
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // SME Streaming functions are not eligible for TCO as they may require
  // the streaming mode or ZA to be restored after returning from the call.
  SMEAttrs CallerAttrs(MF.getFunction());
  auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
  if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.hasStreamingBody())
    return false;

  // Functions using the C or Fast calling convention that have an SVE signature
  // preserve more registers and should assume the SVE_VectorCall CC.
  // The check for matching callee-saved regs will determine whether it
  // is eligible for TCO.
  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
    CallerCC = CallingConv::AArch64_SVE_VectorCall;

  bool CCMatch = CallerCC == CalleeCC;

  // When using the Windows calling convention on a non-windows OS, we want
  // to back up and restore X18 in such functions; we can't do a tail call
  // from those functions.
  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
      CalleeCC != CallingConv::Win64)
    return false;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible (see
  // X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF.arg_begin(),
                                    e = CallerF.arg_end();
       i != e; ++i) {
    if (i->hasByValAttr())
      return false;

    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
    // In this case, it is necessary to save/restore X0 in the callee. Tail
    // call opt interferes with this. So we disable tail call opt when the
    // caller has an argument with "inreg" attribute.

    // FIXME: Check whether the callee also has an "inreg" argument.
    if (i->hasInRegAttr())
      return false;
  }

  // Under GuaranteedTailCallOpt (or tailcc/swifttailcc), the only remaining
  // requirement is that caller and callee use the same convention.
  if (canGuaranteeTCO(CalleeCC,
                      getTargetMachine().Options.GuaranteedTailCallOpt))
    return CCMatch;

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  LLVMContext &C = *DAG.getContext();
  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (Subtarget->hasCustomCallingConv()) {
      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
    }
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);

  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);

  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When we are musttail, additional checks have been done and we can safely
    // ignore this check.
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    for (const CCValAssign &ArgLoc : ArgLocs)
      if (!ArgLoc.isRegLoc())
        return false;
  }

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we determine this explicitly
  // here: if an argument is passed indirectly, the call cannot be a tail call.
  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
        assert((A.getLocInfo() != CCValAssign::Indirect ||
                A.getValVT().isScalableVector() ||
                Subtarget->isWindowsArm64EC()) &&
               "Expected value to be scalable");
        return A.getLocInfo() == CCValAssign::Indirect;
      }))
    return false;

  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
    return false;

  return true;
}

/// Build a TokenFactor that orders any in-flight loads from the caller's
/// (negative) fixed stack objects overlapping bytes [FirstByte, LastByte] of
/// \p ClobberedFI before \p Chain, so a tail-call argument store cannot
/// clobber an incoming stack argument before it is read.
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                   SelectionDAG &DAG,
                                                   MachineFrameInfo &MFI,
                                                   int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding.
  // Only negative frame indices (fixed incoming-argument objects) are
  // candidates; their byte ranges are tested for overlap with ClobberedFI.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses())
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

/// Return true if the *callee* pops its stack arguments for convention
/// \p CallCC (fastcc under GuaranteedTailCallOpt, tailcc, swifttailcc);
/// otherwise the caller restores the stack after the call.
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                   bool TailCallOpt) const {
  return (CallCC == CallingConv::Fast && TailCallOpt) ||
         CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
}

// Check if the value is zero-extended from i1 to i8.
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
  unsigned SizeInBits = Arg.getValueType().getSizeInBits();
  if (SizeInBits < 8)
    return false;

  // Mask 0xFE: every bit except bit 0 must be known zero for Arg to already
  // be an i1 zero-extended into (at least) 8 bits.
  APInt RequredZero(SizeInBits, 0xFE);
  // Depth of 4 bounds the known-bits recursion for compile time.
  KnownBits Bits = DAG.computeKnownBits(Arg, 4);
  bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero;
  return ZExtBool;
}

/// Post-isel hook: strip compiler-invented implicit GPR defs from the
/// MSRpstate pseudos used for SMSTART/SMSTOP.
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                          SDNode *Node) const {
  // Live-in physreg copies that are glued to SMSTART are applied as
  // implicit-def's in the InstrEmitter. Here we remove them, allowing the
  // register allocator to pass call args in callee saved regs, without extra
  // copies to avoid these fake clobbers of actually-preserved GPRs.
  // Iterate backwards so removeOperand does not shift indices yet to visit;
  // operand 0 is intentionally left alone.
  if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
      MI.getOpcode() == AArch64::MSRpstatePseudo)
    for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
      if (MachineOperand &MO = MI.getOperand(I);
          MO.isReg() && MO.isImplicit() && MO.isDef() &&
          (AArch64::GPR32RegClass.contains(MO.getReg()) ||
           AArch64::GPR64RegClass.contains(MO.getReg())))
        MI.removeOperand(I);
}

/// Emit an AArch64ISD::SMSTART (\p Enable == true) or SMSTOP node that
/// toggles PSTATE.SM, threading \p Chain/\p InGlue through it. \p PStateSM is
/// the caller's current streaming-mode value; \p Entry selects whether the
/// expected mode operand describes the state before (Entry) or after the
/// call. Returns a node producing (chain, glue).
SDValue AArch64TargetLowering::changeStreamingMode(
    SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
    SDValue PStateSM, bool Entry) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  // Record that this function changes streaming mode; presumably consumed by
  // later frame/epilogue code — confirm against AArch64FunctionInfo users.
  FuncInfo->setHasStreamingModeChanges(true);

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
  SDValue MSROp =
      DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);

  // On entry we expect the opposite of the target mode only when leaving the
  // current state (Entry ? Enable : !Enable).
  SDValue ExpectedSMVal =
      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};

  if (InGlue)
    Ops.push_back(InGlue);

  unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
}

/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
7459 SDValue 7460 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 7461 SmallVectorImpl<SDValue> &InVals) const { 7462 SelectionDAG &DAG = CLI.DAG; 7463 SDLoc &DL = CLI.DL; 7464 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 7465 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 7466 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 7467 SDValue Chain = CLI.Chain; 7468 SDValue Callee = CLI.Callee; 7469 bool &IsTailCall = CLI.IsTailCall; 7470 CallingConv::ID &CallConv = CLI.CallConv; 7471 bool IsVarArg = CLI.IsVarArg; 7472 7473 MachineFunction &MF = DAG.getMachineFunction(); 7474 MachineFunction::CallSiteInfo CSInfo; 7475 bool IsThisReturn = false; 7476 7477 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 7478 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 7479 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType; 7480 bool IsSibCall = false; 7481 bool GuardWithBTI = false; 7482 7483 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) && 7484 !Subtarget->noBTIAtReturnTwice()) { 7485 GuardWithBTI = FuncInfo->branchTargetEnforcement(); 7486 } 7487 7488 // Analyze operands of the call, assigning locations to each operand. 7489 SmallVector<CCValAssign, 16> ArgLocs; 7490 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 7491 7492 if (IsVarArg) { 7493 unsigned NumArgs = Outs.size(); 7494 7495 for (unsigned i = 0; i != NumArgs; ++i) { 7496 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) 7497 report_fatal_error("Passing SVE types to variadic functions is " 7498 "currently not supported"); 7499 } 7500 } 7501 7502 analyzeCallOperands(*this, Subtarget, CLI, CCInfo); 7503 7504 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); 7505 // Assign locations to each value returned by this call. 
7506 SmallVector<CCValAssign, 16> RVLocs; 7507 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, 7508 *DAG.getContext()); 7509 RetCCInfo.AnalyzeCallResult(Ins, RetCC); 7510 7511 // Check callee args/returns for SVE registers and set calling convention 7512 // accordingly. 7513 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { 7514 auto HasSVERegLoc = [](CCValAssign &Loc) { 7515 if (!Loc.isRegLoc()) 7516 return false; 7517 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) || 7518 AArch64::PPRRegClass.contains(Loc.getLocReg()); 7519 }; 7520 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc)) 7521 CallConv = CallingConv::AArch64_SVE_VectorCall; 7522 } 7523 7524 if (IsTailCall) { 7525 // Check if it's really possible to do a tail call. 7526 IsTailCall = isEligibleForTailCallOptimization(CLI); 7527 7528 // A sibling call is one where we're under the usual C ABI and not planning 7529 // to change that but can still do a tail call: 7530 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && 7531 CallConv != CallingConv::SwiftTail) 7532 IsSibCall = true; 7533 7534 if (IsTailCall) 7535 ++NumTailCalls; 7536 } 7537 7538 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) 7539 report_fatal_error("failed to perform tail call elimination on a call " 7540 "site marked musttail"); 7541 7542 // Get a count of how many bytes are to be pushed on the stack. 7543 unsigned NumBytes = CCInfo.getStackSize(); 7544 7545 if (IsSibCall) { 7546 // Since we're not changing the ABI to make this a tail call, the memory 7547 // operands are already available in the caller's incoming argument space. 7548 NumBytes = 0; 7549 } 7550 7551 // FPDiff is the byte offset of the call's argument area from the callee's. 7552 // Stores to callee stack arguments will be placed in FixedStackSlots offset 7553 // by this amount for a tail call. 
In a sibling call it must be 0 because the 7554 // caller will deallocate the entire stack and the callee still expects its 7555 // arguments to begin at SP+0. Completely unused for non-tail calls. 7556 int FPDiff = 0; 7557 7558 if (IsTailCall && !IsSibCall) { 7559 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 7560 7561 // Since callee will pop argument stack as a tail call, we must keep the 7562 // popped size 16-byte aligned. 7563 NumBytes = alignTo(NumBytes, 16); 7564 7565 // FPDiff will be negative if this tail call requires more space than we 7566 // would automatically have in our incoming argument space. Positive if we 7567 // can actually shrink the stack. 7568 FPDiff = NumReusableBytes - NumBytes; 7569 7570 // Update the required reserved area if this is the tail call requiring the 7571 // most argument stack space. 7572 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) 7573 FuncInfo->setTailCallReservedStack(-FPDiff); 7574 7575 // The stack pointer must be 16-byte aligned at all times it's used for a 7576 // memory operation, which in practice means at *all* times and in 7577 // particular across call boundaries. Therefore our own arguments started at 7578 // a 16-byte aligned SP and the delta applied for the tail call should 7579 // satisfy the same constraint. 7580 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 7581 } 7582 7583 // Determine whether we need any streaming mode changes. 
7584 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); 7585 if (CLI.CB) 7586 CalleeAttrs = SMEAttrs(*CLI.CB); 7587 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) 7588 CalleeAttrs = SMEAttrs(ES->getSymbol()); 7589 7590 auto DescribeCallsite = 7591 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { 7592 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '"; 7593 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) 7594 R << ore::NV("Callee", ES->getSymbol()); 7595 else if (CLI.CB && CLI.CB->getCalledFunction()) 7596 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName()); 7597 else 7598 R << "unknown callee"; 7599 R << "'"; 7600 return R; 7601 }; 7602 7603 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); 7604 if (RequiresLazySave) { 7605 SDValue NumZaSaveSlices; 7606 if (!CalleeAttrs.preservesZA()) { 7607 // Set up a lazy save mechanism by storing the runtime live slices 7608 // (worst-case SVL) to the TPIDR2 stack object. 
7609 NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, 7610 DAG.getConstant(1, DL, MVT::i32)); 7611 } else if (CalleeAttrs.preservesZA()) { 7612 NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64); 7613 } 7614 7615 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); 7616 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); 7617 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, 7618 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 7619 SDValue NumZaSaveSlicesAddr = 7620 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, 7621 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); 7622 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, 7623 MPI, MVT::i16); 7624 Chain = DAG.getNode( 7625 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, 7626 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), 7627 TPIDR2ObjAddr); 7628 OptimizationRemarkEmitter ORE(&MF.getFunction()); 7629 ORE.emit([&]() { 7630 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA", 7631 CLI.CB) 7632 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA", 7633 &MF.getFunction()); 7634 DescribeCallsite(R) << " sets up a lazy save for ZA"; 7635 if (CalleeAttrs.preservesZA()) 7636 R << ", but callee preserves ZA, so we request 0 slices to be saved"; 7637 else 7638 R << ", and we request that all slices be saved"; 7639 R << ore::setExtraArgs() 7640 << ore::NV("CalleePreservesZA", CalleeAttrs.preservesZA()); 7641 return R; 7642 }); 7643 } 7644 7645 SDValue PStateSM; 7646 std::optional<bool> RequiresSMChange = 7647 CallerAttrs.requiresSMChange(CalleeAttrs); 7648 if (RequiresSMChange) { 7649 PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); 7650 OptimizationRemarkEmitter ORE(&MF.getFunction()); 7651 ORE.emit([&]() { 7652 auto R = CLI.CB ? 
OptimizationRemarkAnalysis("sme", "SMETransition", 7653 CLI.CB) 7654 : OptimizationRemarkAnalysis("sme", "SMETransition", 7655 &MF.getFunction()); 7656 DescribeCallsite(R) << " requires a streaming mode transition"; 7657 return R; 7658 }); 7659 } 7660 7661 // Adjust the stack pointer for the new arguments... 7662 // These operations are automatically eliminated by the prolog/epilog pass 7663 if (!IsSibCall) 7664 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL); 7665 7666 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 7667 getPointerTy(DAG.getDataLayout())); 7668 7669 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 7670 SmallSet<unsigned, 8> RegsUsed; 7671 SmallVector<SDValue, 8> MemOpChains; 7672 auto PtrVT = getPointerTy(DAG.getDataLayout()); 7673 7674 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { 7675 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); 7676 for (const auto &F : Forwards) { 7677 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); 7678 RegsToPass.emplace_back(F.PReg, Val); 7679 } 7680 } 7681 7682 // Walk the register/memloc assignments, inserting copies/loads. 7683 unsigned ExtraArgLocs = 0; 7684 for (unsigned i = 0, e = Outs.size(); i != e; ++i) { 7685 CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; 7686 SDValue Arg = OutVals[i]; 7687 ISD::ArgFlagsTy Flags = Outs[i].Flags; 7688 7689 // Promote the value if needed. 7690 switch (VA.getLocInfo()) { 7691 default: 7692 llvm_unreachable("Unknown loc info!"); 7693 case CCValAssign::Full: 7694 break; 7695 case CCValAssign::SExt: 7696 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 7697 break; 7698 case CCValAssign::ZExt: 7699 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 7700 break; 7701 case CCValAssign::AExt: 7702 if (Outs[i].ArgVT == MVT::i1) { 7703 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 
7704 // 7705 // Check if we actually have to do this, because the value may 7706 // already be zero-extended. 7707 // 7708 // We cannot just emit a (zext i8 (trunc (assert-zext i8))) 7709 // and rely on DAGCombiner to fold this, because the following 7710 // (anyext i32) is combined with (zext i8) in DAG.getNode: 7711 // 7712 // (ext (zext x)) -> (zext x) 7713 // 7714 // This will give us (zext i32), which we cannot remove, so 7715 // try to check this beforehand. 7716 if (!checkZExtBool(Arg, DAG)) { 7717 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 7718 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 7719 } 7720 } 7721 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 7722 break; 7723 case CCValAssign::AExtUpper: 7724 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); 7725 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 7726 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, 7727 DAG.getConstant(32, DL, VA.getLocVT())); 7728 break; 7729 case CCValAssign::BCvt: 7730 Arg = DAG.getBitcast(VA.getLocVT(), Arg); 7731 break; 7732 case CCValAssign::Trunc: 7733 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); 7734 break; 7735 case CCValAssign::FPExt: 7736 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 7737 break; 7738 case CCValAssign::Indirect: 7739 bool isScalable = VA.getValVT().isScalableVT(); 7740 assert((isScalable || Subtarget->isWindowsArm64EC()) && 7741 "Indirect arguments should be scalable on most subtargets"); 7742 7743 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); 7744 uint64_t PartSize = StoreSize; 7745 unsigned NumParts = 1; 7746 if (Outs[i].Flags.isInConsecutiveRegs()) { 7747 assert(!Outs[i].Flags.isInConsecutiveRegsLast()); 7748 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) 7749 ++NumParts; 7750 StoreSize *= NumParts; 7751 } 7752 7753 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); 7754 Align Alignment = 
DAG.getDataLayout().getPrefTypeAlign(Ty); 7755 MachineFrameInfo &MFI = MF.getFrameInfo(); 7756 int FI = MFI.CreateStackObject(StoreSize, Alignment, false); 7757 if (isScalable) 7758 MFI.setStackID(FI, TargetStackID::ScalableVector); 7759 7760 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); 7761 SDValue Ptr = DAG.getFrameIndex( 7762 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 7763 SDValue SpillSlot = Ptr; 7764 7765 // Ensure we generate all stores for each tuple part, whilst updating the 7766 // pointer after each store correctly using vscale. 7767 while (NumParts) { 7768 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); 7769 MemOpChains.push_back(Store); 7770 7771 NumParts--; 7772 if (NumParts > 0) { 7773 SDValue BytesIncrement; 7774 if (isScalable) { 7775 BytesIncrement = DAG.getVScale( 7776 DL, Ptr.getValueType(), 7777 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); 7778 } else { 7779 BytesIncrement = DAG.getConstant( 7780 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, 7781 Ptr.getValueType()); 7782 } 7783 SDNodeFlags Flags; 7784 Flags.setNoUnsignedWrap(true); 7785 7786 MPI = MachinePointerInfo(MPI.getAddrSpace()); 7787 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, 7788 BytesIncrement, Flags); 7789 ExtraArgLocs++; 7790 i++; 7791 } 7792 } 7793 7794 Arg = SpillSlot; 7795 break; 7796 } 7797 7798 if (VA.isRegLoc()) { 7799 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 7800 Outs[0].VT == MVT::i64) { 7801 assert(VA.getLocVT() == MVT::i64 && 7802 "unexpected calling convention register assignment"); 7803 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 7804 "unexpected use of 'returned'"); 7805 IsThisReturn = true; 7806 } 7807 if (RegsUsed.count(VA.getLocReg())) { 7808 // If this register has already been used then we're trying to pack 7809 // parts of an [N x i32] into an X-register. 
The extension type will 7810 // take care of putting the two halves in the right place but we have to 7811 // combine them. 7812 SDValue &Bits = 7813 llvm::find_if(RegsToPass, 7814 [=](const std::pair<unsigned, SDValue> &Elt) { 7815 return Elt.first == VA.getLocReg(); 7816 }) 7817 ->second; 7818 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); 7819 // Call site info is used for function's parameter entry value 7820 // tracking. For now we track only simple cases when parameter 7821 // is transferred through whole register. 7822 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { 7823 return ArgReg.Reg == VA.getLocReg(); 7824 }); 7825 } else { 7826 RegsToPass.emplace_back(VA.getLocReg(), Arg); 7827 RegsUsed.insert(VA.getLocReg()); 7828 const TargetOptions &Options = DAG.getTarget().Options; 7829 if (Options.EmitCallSiteInfo) 7830 CSInfo.emplace_back(VA.getLocReg(), i); 7831 } 7832 } else { 7833 assert(VA.isMemLoc()); 7834 7835 SDValue DstAddr; 7836 MachinePointerInfo DstInfo; 7837 7838 // FIXME: This works on big-endian for composite byvals, which are the 7839 // common case. It should also work for fundamental types too. 7840 uint32_t BEAlign = 0; 7841 unsigned OpSize; 7842 if (VA.getLocInfo() == CCValAssign::Indirect || 7843 VA.getLocInfo() == CCValAssign::Trunc) 7844 OpSize = VA.getLocVT().getFixedSizeInBits(); 7845 else 7846 OpSize = Flags.isByVal() ? 
Flags.getByValSize() * 8 7847 : VA.getValVT().getSizeInBits(); 7848 OpSize = (OpSize + 7) / 8; 7849 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 7850 !Flags.isInConsecutiveRegs()) { 7851 if (OpSize < 8) 7852 BEAlign = 8 - OpSize; 7853 } 7854 unsigned LocMemOffset = VA.getLocMemOffset(); 7855 int32_t Offset = LocMemOffset + BEAlign; 7856 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 7857 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 7858 7859 if (IsTailCall) { 7860 Offset = Offset + FPDiff; 7861 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 7862 7863 DstAddr = DAG.getFrameIndex(FI, PtrVT); 7864 DstInfo = MachinePointerInfo::getFixedStack(MF, FI); 7865 7866 // Make sure any stack arguments overlapping with where we're storing 7867 // are loaded before this eventual operation. Otherwise they'll be 7868 // clobbered. 7869 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 7870 } else { 7871 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 7872 7873 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 7874 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); 7875 } 7876 7877 if (Outs[i].Flags.isByVal()) { 7878 SDValue SizeNode = 7879 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 7880 SDValue Cpy = DAG.getMemcpy( 7881 Chain, DL, DstAddr, Arg, SizeNode, 7882 Outs[i].Flags.getNonZeroByValAlign(), 7883 /*isVol = */ false, /*AlwaysInline = */ false, 7884 /*isTailCall = */ false, DstInfo, MachinePointerInfo()); 7885 7886 MemOpChains.push_back(Cpy); 7887 } else { 7888 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 7889 // promoted to a legal register type i32, we should truncate Arg back to 7890 // i1/i8/i16. 
7891 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 7892 VA.getValVT() == MVT::i16) 7893 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 7894 7895 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); 7896 MemOpChains.push_back(Store); 7897 } 7898 } 7899 } 7900 7901 if (IsVarArg && Subtarget->isWindowsArm64EC()) { 7902 // For vararg calls, the Arm64EC ABI requires values in x4 and x5 7903 // describing the argument list. x4 contains the address of the 7904 // first stack parameter. x5 contains the size in bytes of all parameters 7905 // passed on the stack. 7906 RegsToPass.emplace_back(AArch64::X4, StackPtr); 7907 RegsToPass.emplace_back(AArch64::X5, 7908 DAG.getConstant(NumBytes, DL, MVT::i64)); 7909 } 7910 7911 if (!MemOpChains.empty()) 7912 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 7913 7914 SDValue InGlue; 7915 if (RequiresSMChange) { 7916 SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, 7917 InGlue, PStateSM, true); 7918 Chain = NewChain.getValue(0); 7919 InGlue = NewChain.getValue(1); 7920 } 7921 7922 // Build a sequence of copy-to-reg nodes chained together with token chain 7923 // and flag operands which copy the outgoing args into the appropriate regs. 7924 for (auto &RegToPass : RegsToPass) { 7925 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 7926 RegToPass.second, InGlue); 7927 InGlue = Chain.getValue(1); 7928 } 7929 7930 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 7931 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 7932 // node so that legalize doesn't hack it. 
7933 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 7934 auto GV = G->getGlobal(); 7935 unsigned OpFlags = 7936 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); 7937 if (OpFlags & AArch64II::MO_GOT) { 7938 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); 7939 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 7940 } else { 7941 const GlobalValue *GV = G->getGlobal(); 7942 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 7943 } 7944 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 7945 if (getTargetMachine().getCodeModel() == CodeModel::Large && 7946 Subtarget->isTargetMachO()) { 7947 const char *Sym = S->getSymbol(); 7948 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); 7949 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 7950 } else { 7951 const char *Sym = S->getSymbol(); 7952 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); 7953 } 7954 } 7955 7956 // We don't usually want to end the call-sequence here because we would tidy 7957 // the frame up *after* the call, however in the ABI-changing tail-call case 7958 // we've carefully laid out the parameters so that when sp is reset they'll be 7959 // in the correct location. 7960 if (IsTailCall && !IsSibCall) { 7961 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL); 7962 InGlue = Chain.getValue(1); 7963 } 7964 7965 std::vector<SDValue> Ops; 7966 Ops.push_back(Chain); 7967 Ops.push_back(Callee); 7968 7969 if (IsTailCall) { 7970 // Each tail call may have to adjust the stack by a different amount, so 7971 // this information must travel along with the operation for eventual 7972 // consumption by emitEpilogue. 7973 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); 7974 } 7975 7976 // Add argument registers to the end of the list so that they are known live 7977 // into the call. 
7978 for (auto &RegToPass : RegsToPass) 7979 Ops.push_back(DAG.getRegister(RegToPass.first, 7980 RegToPass.second.getValueType())); 7981 7982 // Add a register mask operand representing the call-preserved registers. 7983 const uint32_t *Mask; 7984 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 7985 if (IsThisReturn) { 7986 // For 'this' returns, use the X0-preserving mask if applicable 7987 Mask = TRI->getThisReturnPreservedMask(MF, CallConv); 7988 if (!Mask) { 7989 IsThisReturn = false; 7990 Mask = TRI->getCallPreservedMask(MF, CallConv); 7991 } 7992 } else 7993 Mask = TRI->getCallPreservedMask(MF, CallConv); 7994 7995 if (Subtarget->hasCustomCallingConv()) 7996 TRI->UpdateCustomCallPreservedMask(MF, &Mask); 7997 7998 if (TRI->isAnyArgRegReserved(MF)) 7999 TRI->emitReservedArgRegCallError(MF); 8000 8001 assert(Mask && "Missing call preserved mask for calling convention"); 8002 Ops.push_back(DAG.getRegisterMask(Mask)); 8003 8004 if (InGlue.getNode()) 8005 Ops.push_back(InGlue); 8006 8007 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8008 8009 // If we're doing a tall call, use a TC_RETURN here rather than an 8010 // actual call instruction. 8011 if (IsTailCall) { 8012 MF.getFrameInfo().setHasTailCall(); 8013 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); 8014 8015 if (IsCFICall) 8016 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); 8017 8018 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); 8019 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 8020 return Ret; 8021 } 8022 8023 unsigned CallOpc = AArch64ISD::CALL; 8024 // Calls with operand bundle "clang.arc.attachedcall" are special. They should 8025 // be expanded to the call, directly followed by a special marker sequence and 8026 // a call to an ObjC library function. Use CALL_RVMARKER to do that. 
8027 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { 8028 assert(!IsTailCall && 8029 "tail calls cannot be marked with clang.arc.attachedcall"); 8030 CallOpc = AArch64ISD::CALL_RVMARKER; 8031 8032 // Add a target global address for the retainRV/claimRV runtime function 8033 // just before the call target. 8034 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); 8035 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); 8036 Ops.insert(Ops.begin() + 1, GA); 8037 } else if (GuardWithBTI) 8038 CallOpc = AArch64ISD::CALL_BTI; 8039 8040 // Returns a chain and a flag for retval copy to use. 8041 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); 8042 8043 if (IsCFICall) 8044 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); 8045 8046 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 8047 InGlue = Chain.getValue(1); 8048 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 8049 8050 uint64_t CalleePopBytes = 8051 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; 8052 8053 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL); 8054 InGlue = Chain.getValue(1); 8055 8056 // Handle result values, copying them out of physregs into vregs that we 8057 // return. 8058 SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, 8059 DL, DAG, InVals, IsThisReturn, 8060 IsThisReturn ? OutVals[0] : SDValue()); 8061 8062 if (!Ins.empty()) 8063 InGlue = Result.getValue(Result->getNumValues() - 1); 8064 8065 if (RequiresSMChange) { 8066 assert(PStateSM && "Expected a PStateSM to be set"); 8067 Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InGlue, 8068 PStateSM, false); 8069 } 8070 8071 if (RequiresLazySave) { 8072 if (!CalleeAttrs.preservesZA()) { 8073 // Unconditionally resume ZA. 
8074 Result = DAG.getNode( 8075 AArch64ISD::SMSTART, DL, MVT::Other, Result, 8076 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), 8077 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); 8078 8079 // Conditionally restore the lazy save using a pseudo node. 8080 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); 8081 SDValue RegMask = DAG.getRegisterMask( 8082 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); 8083 SDValue RestoreRoutine = DAG.getTargetExternalSymbol( 8084 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); 8085 SDValue TPIDR2_EL0 = DAG.getNode( 8086 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, 8087 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); 8088 8089 // Copy the address of the TPIDR2 block into X0 before 'calling' the 8090 // RESTORE_ZA pseudo. 8091 SDValue Glue; 8092 SDValue TPIDR2Block = DAG.getFrameIndex( 8093 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); 8094 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); 8095 Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, 8096 {Result, TPIDR2_EL0, 8097 DAG.getRegister(AArch64::X0, MVT::i64), 8098 RestoreRoutine, RegMask, Result.getValue(1)}); 8099 } 8100 // Finally reset the TPIDR2_EL0 register to 0. 8101 Result = DAG.getNode( 8102 ISD::INTRINSIC_VOID, DL, MVT::Other, Result, 8103 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), 8104 DAG.getConstant(0, DL, MVT::i64)); 8105 } 8106 8107 if (RequiresSMChange || RequiresLazySave) { 8108 for (unsigned I = 0; I < InVals.size(); ++I) { 8109 // The smstart/smstop is chained as part of the call, but when the 8110 // resulting chain is discarded (which happens when the call is not part 8111 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the 8112 // smstart/smstop is chained to the result value. We can do that by doing 8113 // a vreg -> vreg copy. 
      // The smstart/smstop is chained into the call itself, but the chain may
      // be discarded by the caller; route each result through a fresh vreg so
      // the copy (and thus the mode switch) stays reachable from the value.
      Register Reg = MF.getRegInfo().createVirtualRegister(
          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
      SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
      InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
                                     InVals[I].getValueType());
    }
  }

  return Result;
}

/// Return true if the values described by \p Outs can all be assigned to
/// return registers by the calling convention's return-assignment function
/// (CCAssignFnForReturn); used to decide between in-register return and sret.
bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC);
}

/// Lower an outgoing return: assign each return value to its physical
/// register per the calling convention, copy the (possibly extended or
/// bitcast) values into those registers on the chain, and collect the
/// register operands for the final return node.
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  auto &MF = DAG.getMachineFunction();
  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Glue;
  SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
  SmallSet<unsigned, 4> RegsUsed;
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    // Adjust the value to the width/representation its location expects.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
    case CCValAssign::ZExt:
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      break;
    case CCValAssign::AExtUpper:
      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      break;
    }

    // If another value was already assigned to this physical register (e.g.
    // two 32-bit halves packed into one 64-bit register), OR the new bits
    // into the existing value instead of emitting a second copy.
    if (RegsUsed.count(VA.getLocReg())) {
      SDValue &Bits =
          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
            return Elt.first == VA.getLocReg();
          })->second;
      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
    } else {
      RetVals.emplace_back(VA.getLocReg(), Arg);
      RegsUsed.insert(VA.getLocReg());
    }
  }

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Emit SMSTOP before returning from a locally streaming function
  SMEAttrs FuncAttrs(MF.getFunction());
  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
    Chain = changeStreamingMode(
        DAG, DL, /*Enable*/ false, Chain, /*Glue*/ SDValue(),
        DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true);
    Glue = Chain.getValue(1);
  }

  // Thread every register copy onto the chain, glued so they stay adjacent
  // to the return.
  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (auto &RetVal : RetVals) {
    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Windows AArch64 ABIs require that for returning structs by value we copy
  // the sret argument into X0 for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into X0.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg = AArch64::X0;
    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
    Glue = Chain.getValue(1);

    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  }

  // Callee-saved registers that are instead preserved via copies (e.g. for
  // special calling conventions) must appear as return operands too.
  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
  if (I) {
    for (; *I; ++I) {
      if (AArch64::GPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
}

//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//

// The getTargetNode overloads below wrap each flavor of symbolic address in
// the matching Target* node, forwarding \p Flag (AArch64II::MO_* relocation
// modifiers) so later address-formation helpers can pick the relocation kind.

SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
                                    N->getOffset(), Flag);
}

SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
}

SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
                                   N->getOffset(), Flag);
}

SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  // Block addresses carry no extra offset here; always emit offset 0.
  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
}

// (loadGOT sym)
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
                                      unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes instead of using a wrapper node.
  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
}

// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
// Large-code-model address: materialize all four 16-bit granules of the
// 64-bit address (MOVZ/MOVK expansion happens after ISel).
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
                                            unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  const unsigned char MO_NC = AArch64II::MO_NC;
  return DAG.getNode(
      AArch64ISD::WrapperLarge, DL, Ty,
      getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
}

// (addlow (adrp %hi(sym)) %lo(sym))
// Small-code-model address: ADRP to the 4KiB page plus the low 12 bits.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  SDValue Lo = getTargetNode(N, Ty, DAG,
                             AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}

// (adr sym)
// Tiny-code-model address: a single PC-relative ADR reaches the symbol.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
                                           unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
  return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
}

/// Lower a GlobalAddress node, choosing the address-formation sequence
/// (GOT load / large / tiny / small) from the classified reference flags and
/// the code model.
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const
      GlobalValue *GV = GN->getGlobal();
  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  if (OpFlags != AArch64II::MO_NO_FLAG)
    assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
           "unexpected offset in global node");

  // This also catches the large code model case for Darwin, and tiny code
  // model with got relocations.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    return getGOT(GN, DAG, OpFlags);
  }

  SDValue Result;
  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !getTargetMachine().isPositionIndependent()) {
    Result = getAddrLarge(GN, DAG, OpFlags);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    Result = getAddrTiny(GN, DAG, OpFlags);
  } else {
    Result = getAddr(GN, DAG, OpFlags);
  }
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(GN);
  // DLL imports and COFF stubs yield the address of a pointer slot; add one
  // more load to get the variable's real address.
  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
                 AArch64II::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///   + "extern __thread" declaration.
///   + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///                                      ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      PtrMemVT, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      Align(PtrMemVT.getSizeInBits() / 8),
      MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
  Chain = FuncTLVGet.getValue(1);

  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}

/// Convert a thread-local variable reference into a sequence of instructions to
/// compute the variable's address for the local exec TLS model of ELF targets.
/// The sequence depends on the maximum TLS area size.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
                                                    SDValue ThreadBase,
                                                    const SDLoc &DL,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TPOff, Addr;

  // TLSSize (the configured maximum width of a thread-pointer offset) picks
  // which :tprel_*: relocation sequence is needed; each case's target assembly
  // is shown in its comment.
  switch (DAG.getTarget().Options.TLSSize) {
  default:
    llvm_unreachable("Unexpected TLS size");

  case 12: {
    // mrs   x0, TPIDR_EL0
    // add   x0, x0, :tprel_lo12:a
    SDValue Var = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      Var,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 24: {
    // mrs   x0, TPIDR_EL0
    // add   x0, x0, :tprel_hi12:a
    // add   x0, x0, :tprel_lo12_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      HiVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
                                      LoVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 32: {
    // mrs   x1, TPIDR_EL0
    // movz  x0, #:tprel_g1:a
    // movk  x0, #:tprel_g0_nc:a
    // add   x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }

  case 48: {
    // mrs   x1, TPIDR_EL0
    // movz  x0, #:tprel_g2:a
    // movk  x0, #:tprel_g1_nc:a
    // movk  x0, #:tprel_g0_nc:a
    // add   x0, x1, x0
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
    SDValue MiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(32, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }
  }
}

/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
///    adrp  x0, :tlsdesc:var
///    ldr   x1, [x0, #:tlsdesc_lo12:var]
///    add   x0, x0, #:tlsdesc_lo12:var
///    .tlsdesccall var
///    blr   x1
///    (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
                                                      const SDLoc &DL,
                                                      SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  Chain =
      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
  SDValue Glue = Chain.getValue(1);

  // The resolver leaves the thread-pointer offset in X0.
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}

/// Lower a TLS global address on ELF targets, dispatching on the TLS model
/// (local-exec, initial-exec, local-dynamic, general-dynamic) and combining
/// the resulting thread-pointer offset with THREAD_POINTER.
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());

  // Unless local-dynamic generation is explicitly enabled, fall back to the
  // general-dynamic scheme for local-dynamic accesses.
  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
    if (Model == TLSModel::LocalDynamic)
      Model = TLSModel::GeneralDynamic;
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      Model != TLSModel::LocalExec)
    report_fatal_error("ELF TLS only supported in small memory model or "
                       "in local exec TLS model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add tiny and large code model support for TLS access models other
  // than local exec. We currently generate the same code as small for tiny,
  // which may be larger than needed.

  SDValue TPOff;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
  } else if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                  AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}

/// Lower a TLS global address on Windows: walk from the TEB (X18) through the
/// TLS array and _tls_index to this module's TLS block, then add the
/// variable's section-relative offset.
SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x58 from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
  Chain = TLSArray.getValue(1);

  // Load the TLS index from the C runtime;
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // This also does the same as LOADgot, but using a generic i32 load,
  // while LOADgot only loads i64.
  SDValue TLSIndexHi =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
  SDValue TLSIndex =
      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
  Chain = TLSIndex.getValue(1);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
  // offset into the TLSArray.
  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(3, DL, PtrVT));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());
  Chain = TLS.getValue(1);

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();
  SDValue TGAHi = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  SDValue TGALo = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0,
      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

  // Add the offset from the start of the .tls section (section base).
  SDValue Addr =
      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
                                 DAG.getTargetConstant(0, DL, MVT::i32)),
              0);
  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
  return Addr;
}

/// Top-level TLS lowering: dispatch to the emulated-TLS path or the
/// platform-specific scheme (Darwin, ELF, Windows).
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  if (Subtarget->isTargetELF())
    return LowerELFGlobalTLSAddress(Op, DAG);
  if (Subtarget->isTargetWindows())
    return LowerWindowsGlobalTLSAddress(Op, DAG);

  llvm_unreachable("Unexpected platform trying to use TLS");
}

// Looks through \param Val to determine the bit that can be used to
// check the sign of the value. It returns the unextended value and
// the sign bit position.
std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
  // For sign_extend_inreg the meaningful sign bit is that of the narrower
  // in-register type, not the container type.
  if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
    return {Val.getOperand(0),
            cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
                1};

  if (Val.getOpcode() == ISD::SIGN_EXTEND)
    return {Val.getOperand(0),
            Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};

  return {Val, Val.getValueSizeInBits() - 1};
}

/// Lower BR_CC (conditional branch on compare). Handles, in order: f128 via
/// libcall softening, overflow-intrinsic results as flag branches, integer
/// compares against 0/-1 as CBZ/CBNZ/TBZ/TBNZ where allowed, generic integer
/// compare+BRCOND, and FP compares (which may need two branches).
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  MachineFunction &MF = DAG.getMachineFunction();
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  bool ProduceNonFlagSettingCondBr =
      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // (x < 0) branches on the sign bit via TBNZ.
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
        uint64_t SignBitPos;
        std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
      }
    }
    // (x > -1) branches on the sign bit being clear via TBZ.
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
      uint64_t SignBitPos;
      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}

/// Lower FCOPYSIGN by building a bitwise select (BSP) between the magnitude
/// operand and the sign operand under an all-bits-but-sign mask; scalars are
/// first moved into vector registers.
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (!Subtarget->hasNEON())
    return SDValue();

  EVT VT = Op.getValueType();
  EVT IntVT = VT.changeTypeToInteger();
  SDLoc DL(Op);

  SDValue In1 = Op.getOperand(0);
  SDValue In2 = Op.getOperand(1);
  EVT SrcVT = In2.getValueType();

  // Bring the sign operand to the result's FP type if it differs.
  if (!SrcVT.bitsEq(VT))
    In2 = DAG.getFPExtendOrRound(In2, DL, VT);

  if (VT.isScalableVector())
    IntVT =
        getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());

  if (VT.isFixedLengthVector() &&
      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    In1 = convertToScalableVector(DAG, ContainerVT, In1);
    In2 = convertToScalableVector(DAG, ContainerVT, In2);

    SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
    return convertFromScalableVector(DAG, VT, Res);
  }

  auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
    if (VT.isScalableVector())
      return getSVESafeBitCast(VT, Op, DAG);

    return DAG.getBitcast(VT, Op);
  };

  SDValue VecVal1, VecVal2;
  EVT VecVT;
  // Scalars are inserted into the Idx subregister of an undef vector;
  // vectors are bitcast to the integer vector type.
  auto SetVecVal = [&](int Idx = -1) {
    if (!VT.isVector()) {
      VecVal1 =
          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
      VecVal2
= 8933 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); 8934 } else { 8935 VecVal1 = BitCast(VecVT, In1, DAG); 8936 VecVal2 = BitCast(VecVT, In2, DAG); 8937 } 8938 }; 8939 if (VT.isVector()) { 8940 VecVT = IntVT; 8941 SetVecVal(); 8942 } else if (VT == MVT::f64) { 8943 VecVT = MVT::v2i64; 8944 SetVecVal(AArch64::dsub); 8945 } else if (VT == MVT::f32) { 8946 VecVT = MVT::v4i32; 8947 SetVecVal(AArch64::ssub); 8948 } else if (VT == MVT::f16) { 8949 VecVT = MVT::v8i16; 8950 SetVecVal(AArch64::hsub); 8951 } else { 8952 llvm_unreachable("Invalid type for copysign!"); 8953 } 8954 8955 unsigned BitWidth = In1.getScalarValueSizeInBits(); 8956 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT); 8957 8958 // We want to materialize a mask with every bit but the high bit set, but the 8959 // AdvSIMD immediate moves cannot materialize that in a single instruction for 8960 // 64-bit elements. Instead, materialize all bits set and then negate that. 8961 if (VT == MVT::f64 || VT == MVT::v2f64) { 8962 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT); 8963 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV); 8964 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV); 8965 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV); 8966 } 8967 8968 SDValue BSP = 8969 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2); 8970 if (VT == MVT::f16) 8971 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP); 8972 if (VT == MVT::f32) 8973 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP); 8974 if (VT == MVT::f64) 8975 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP); 8976 8977 return BitCast(VT, BSP, DAG); 8978 } 8979 8980 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, 8981 SelectionDAG &DAG) const { 8982 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 8983 Attribute::NoImplicitFloat)) 8984 return SDValue(); 8985 8986 if 
(!Subtarget->hasNEON()) 8987 return SDValue(); 8988 8989 bool IsParity = Op.getOpcode() == ISD::PARITY; 8990 SDValue Val = Op.getOperand(0); 8991 SDLoc DL(Op); 8992 EVT VT = Op.getValueType(); 8993 8994 // for i32, general parity function using EORs is more efficient compared to 8995 // using floating point 8996 if (VT == MVT::i32 && IsParity) 8997 return SDValue(); 8998 8999 // If there is no CNT instruction available, GPR popcount can 9000 // be more efficiently lowered to the following sequence that uses 9001 // AdvSIMD registers/instructions as long as the copies to/from 9002 // the AdvSIMD registers are cheap. 9003 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 9004 // CNT V0.8B, V0.8B // 8xbyte pop-counts 9005 // ADDV B0, V0.8B // sum 8xbyte pop-counts 9006 // UMOV X0, V0.B[0] // copy byte result back to integer reg 9007 if (VT == MVT::i32 || VT == MVT::i64) { 9008 if (VT == MVT::i32) 9009 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 9010 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 9011 9012 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 9013 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop); 9014 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV, 9015 DAG.getConstant(0, DL, MVT::i64)); 9016 9017 if (IsParity) 9018 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, 9019 DAG.getConstant(1, DL, MVT::i32)); 9020 9021 if (VT == MVT::i64) 9022 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 9023 return UaddLV; 9024 } else if (VT == MVT::i128) { 9025 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val); 9026 9027 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val); 9028 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop); 9029 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV, 9030 DAG.getConstant(0, DL, MVT::i64)); 9031 9032 if (IsParity) 9033 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV, 9034 
DAG.getConstant(1, DL, MVT::i32)); 9035 9036 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); 9037 } 9038 9039 assert(!IsParity && "ISD::PARITY of vector types not supported"); 9040 9041 if (VT.isScalableVector() || 9042 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) 9043 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); 9044 9045 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 9046 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 9047 "Unexpected type for custom ctpop lowering"); 9048 9049 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 9050 Val = DAG.getBitcast(VT8Bit, Val); 9051 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); 9052 9053 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 9054 unsigned EltSize = 8; 9055 unsigned NumElts = VT.is64BitVector() ? 8 : 16; 9056 while (EltSize != VT.getScalarSizeInBits()) { 9057 EltSize *= 2; 9058 NumElts /= 2; 9059 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 9060 Val = DAG.getNode( 9061 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, 9062 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); 9063 } 9064 9065 return Val; 9066 } 9067 9068 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 9069 EVT VT = Op.getValueType(); 9070 assert(VT.isScalableVector() || 9071 useSVEForFixedLengthVectorVT( 9072 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); 9073 9074 SDLoc DL(Op); 9075 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); 9076 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); 9077 } 9078 9079 SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, 9080 SelectionDAG &DAG) const { 9081 9082 EVT VT = Op.getValueType(); 9083 SDLoc DL(Op); 9084 unsigned Opcode = Op.getOpcode(); 9085 ISD::CondCode CC; 9086 switch (Opcode) { 9087 default: 9088 llvm_unreachable("Wrong instruction"); 9089 case ISD::SMAX: 9090 CC = ISD::SETGT; 
    break;
  case ISD::SMIN:
    CC = ISD::SETLT;
    break;
  case ISD::UMAX:
    CC = ISD::SETUGT;
    break;
  case ISD::UMIN:
    CC = ISD::SETULT;
    break;
  }

  // SVE (and SVE-for-fixed-length) min/max have dedicated predicated nodes.
  if (VT.isScalableVector() ||
      useSVEForFixedLengthVectorVT(
          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    switch (Opcode) {
    default:
      llvm_unreachable("Wrong instruction");
    case ISD::SMAX:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
    case ISD::SMIN:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
    case ISD::UMAX:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
    case ISD::UMIN:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
    }
  }

  // Otherwise expand to compare + select using the condition picked above.
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
  return DAG.getSelect(DL, VT, Cond, Op0, Op1);
}

/// Lower ISD::BITREVERSE. SVE(-sized) vectors use the predicated RBIT node;
/// NEON vectors are handled as a byte-level BITREVERSE combined with a
/// REV32/REV64 byte swap to restore element order.
SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT.isScalableVector() ||
      useSVEForFixedLengthVectorVT(
          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);

  SDLoc DL(Op);
  SDValue REVB;
  MVT VST;

  // Pick the i8 vector type of matching width (VST) and the element-sized
  // byte reversal (REV32 for 32-bit elements, REV64 for 64-bit elements).
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("Invalid type for bitreverse!");

  case MVT::v2i32: {
    VST = MVT::v8i8;
    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));

    break;
  }

  case MVT::v4i32: {
    VST = MVT::v16i8;
    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));

    break;
  }

  case MVT::v1i64: {
    VST = MVT::v8i8;
    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));

    break;
  }

  case MVT::v2i64: {
    VST = MVT::v16i8;
    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));

    break;
  }
  }

  // Bit-reverse within each byte, then NVCAST the i8 vector back to VT.
  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
                     DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
}

// Check whether N is a continuous comparison sequence: a tree whose interior
// nodes are one-use ORs and whose leaves are XORs (optionally behind a
// one-use zext). Each XOR's operand pair is collected into WorkList; Num
// counts the XOR leaves and the walk fails once it reaches MaxXors.
static bool
isOrXorChain(SDValue N, unsigned &Num,
             SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
  if (Num == MaxXors)
    return false;

  // Skip the one-use zext
  if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
    N = N->getOperand(0);

  // The leaf node must be XOR
  if (N->getOpcode() == ISD::XOR) {
    WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
    Num++;
    return true;
  }

  // All the non-leaf nodes must be OR.
  if (N->getOpcode() != ISD::OR || !N->hasOneUse())
    return false;

  if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
      isOrXorChain(N->getOperand(1), Num, WorkList))
    return true;
  return false;
}

// Transform chains of ORs and XORs, which are usually outlined by
// memcmp/bcmp expansion, into a conjunction/disjunction of per-pair setccs.
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;

  // Only handle integer compares.
  if (N->getOpcode() != ISD::SETCC)
    return SDValue();

  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
  // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
  unsigned NumXors = 0;
  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
      LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
      isOrXorChain(LHS, NumXors, WorkList)) {
    SDValue XOR0, XOR1;
    std::tie(XOR0, XOR1) = WorkList[0];
    // EQ of an OR-of-XORs means every pair is equal (AND of setccs);
    // NE means some pair differs (OR of setccs).
    unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
    SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
    for (unsigned I = 1; I < WorkList.size(); I++) {
      std::tie(XOR0, XOR1) = WorkList[I];
      SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
      Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
    }

    // Exit early by inverting the condition, which helps reduce indentation.
    return Cmp;
  }

  return SDValue();
}

/// Lower ISD::SETCC / STRICT_FSETCC / STRICT_FSETCCS to CSEL/CSINC-friendly
/// node sequences. Vector compares go to LowerVSETCC; f128 is softened to a
/// libcall first; integer and FP scalar compares are emitted as a compare
/// followed by a conditional select of 0/1.
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  if (Op.getValueType().isVector())
    return LowerVSETCC(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Chain;
  if (IsStrict)
    Chain = Op.getOperand(0);
  SDValue LHS = Op.getOperand(OpNo + 0);
  SDValue RHS = Op.getOperand(OpNo + 1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
  SDLoc dl(Op);

  // We chose ZeroOrOneBooleanContents, so use zero and one.
  EVT VT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, dl, VT);
  SDValue FVal = DAG.getConstant(0, dl, VT);

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets picked up by the next if statement.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
                        IsSignaling);

    // If softenSetCCOperands returned a scalar, use it.
    if (!RHS.getNode()) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(
        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp;
  if (IsStrict)
    Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
  else
    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue Res;
  if (CC2 == AArch64CC::AL) {
    // Single-condition case: invert so the select reads as CSINC.
    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
                          CC2);
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
  } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean. Some of them require two CSELs to implement. As is in
    // this case, we emit the first CSEL and then emit a second using the output
    // of the first as the RHS. We're effectively OR'ing the two CC's together.

    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
    SDValue CS1 =
        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }
  return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
}

/// Lower ISD::SETCCCARRY (compare with borrow-in) for i32/i64 using SBCS
/// followed by a conditional select of 0/1.
SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
                                               SelectionDAG &DAG) const {

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = LHS.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Carry = Op.getOperand(2);
  // SBCS uses a carry not a borrow so the carry flag should be inverted first.
  SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
  SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
                            LHS, RHS, InvCarry);

  EVT OpVT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, DL, OpVT);
  SDValue FVal = DAG.getConstant(0, DL, OpVT);

  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
  SDValue CCVal =
      DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
  // Inputs are swapped because the condition is inverted. This will allow
  // matching with a single CSINC instruction.
  return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
                     Cmp.getValue(1));
}

/// Core SELECT_CC lowering: select TVal/FVal based on "LHS CC RHS".
/// Integer selects are pattern-matched into the most compact conditional
/// select form (CSEL/CSINC/CSINV/CSNEG, or pure shift/mask sequences for
/// sign/abs-like patterns); FP selects become FCMP + one or two CSELs.
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                              SDValue RHS, SDValue TVal,
                                              SDValue FVal, const SDLoc &dl,
                                              SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
    // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
    // supported types.
    if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
        CTVal->isOne() && CFVal->isAllOnes() &&
        LHS.getValueType() == TVal.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
      return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
    }

    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require less instructions than compare and conditional select.
    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
        RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
        LHS.getValueType() == RHS.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(dl, Shift, VT);

      return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
    }

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to for a CSINV or CSINC out of them.
    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
                 TrueVal == -FalseVal) {
        // Guard against INT64_MIN, whose negation overflows.
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
      } else {
        // 64-bit check whether we can use CSINC.
        const uint64_t TrueVal64 = TrueVal;
        const uint64_t FalseVal64 = FalseVal;

        if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal > FalseVal) {
            Swap = true;
          }
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    // Avoid materializing a constant when possible by reusing a known value in
    // a register. However, don't perform this optimization if the known value
    // is one, zero or negative one in the case of a CSEL. We can always
    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
    // FVal, respectively.
    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
        !RHSVal->isZero() && !RHSVal->isAllOnes()) {
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
      // "a != C ? x : a" to avoid materializing C.
      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
        TVal = LHS;
      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
        FVal = LHS;
    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
      // avoid materializing C.
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
        Opcode = AArch64ISD::CSINV;
        TVal = LHS;
        FVal = DAG.getConstant(0, dl, FVal.getValueType());
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    EVT VT = TVal.getValueType();
    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = TVal.getValueType();
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (DAG.getTarget().Options.UnsafeFPMath) {
    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
    if (RHSVal && RHSVal->isZero()) {
      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
        TVal = LHS;
      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
               CFVal && CFVal->isZero() &&
               FVal.getValueType() == LHS.getValueType())
        FVal = LHS;
    }
  }

  // Emit first, and possibly only, CSEL.
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}

/// Lower ISD::VECTOR_SPLICE for scalable vectors. Negative indices that fit
/// an SVE predicate pattern use the SPLICE instruction; small non-negative
/// indices are left for EXT selection; anything else falls back to the
/// default expansion.
SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  EVT Ty = Op.getValueType();
  auto Idx = Op.getConstantOperandAPInt(2);
  int64_t IdxVal = Idx.getSExtValue();
  assert(Ty.isScalableVector() &&
         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");

  // We can use the splice instruction for certain index values where we are
  // able to efficiently generate the correct predicate. The index will be
  // inverted and used directly as the input to the ptrue instruction, i.e.
  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
  // splice predicate. However, we can only do this if we can guarantee that
  // there are enough elements in the vector, hence we check the index <= min
  // number of elements.
  std::optional<unsigned> PredPattern;
  if (Ty.isScalableVector() && IdxVal < 0 &&
      (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
          std::nullopt) {
    SDLoc DL(Op);

    // Create a predicate where all but the last -IdxVal elements are false.
    EVT PredVT = Ty.changeVectorElementType(MVT::i1);
    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);

    // Now splice the two inputs together using the predicate.
    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
                       Op.getOperand(1));
  }

  // This will select to an EXT instruction, which has a maximum immediate
  // value of 255, hence 2048-bits is the maximum value we can lower.
  if (IdxVal >= 0 &&
      IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
    return Op;

  return SDValue();
}

/// Lower an ISD::SELECT_CC node by unpacking its operands and forwarding to
/// the main seven-argument LowerSELECT_CC overload.
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  SDLoc DL(Op);
  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}

/// Lower ISD::SELECT. Scalable and SVE-sized fixed vectors become VSELECTs
/// on a splatted predicate; overflow-intrinsic conditions go straight to
/// CSEL on the overflow flag; everything else is routed through the
/// SELECT_CC path (treating a non-setcc condition as "cond != 0").
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CCVal = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  EVT Ty = Op.getValueType();
  if (Ty == MVT::aarch64svcount) {
    // svcount is selected via its nxv16i1 representation.
    TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
    FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
    SDValue Sel =
        DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
    return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
  }

  if (Ty.isScalableVector()) {
    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
    // FIXME: Ideally this would be the same as above using i1 types, however
    // for the moment we can't deal with fixed i1 vector types properly, so
    // instead extend the predicate to a result type sized integer vector.
    MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
    MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
    SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (ISD::isOverflowIntrOpRes(CCVal)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
      return SDValue();

    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  // Lower it the same way as we would lower a SELECT_CC node.
  ISD::CondCode CC;
  SDValue LHS, RHS;
  if (CCVal.getOpcode() == ISD::SETCC) {
    LHS = CCVal.getOperand(0);
    RHS = CCVal.getOperand(1);
    CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
  } else {
    LHS = CCVal;
    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
    CC = ISD::SETNE;
  }

  // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
  // order to use FCSELSrrr
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
                                     DAG.getUNDEF(MVT::f32), TVal);
    FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
                                     DAG.getUNDEF(MVT::f32), FVal);
  }

  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);

  // Extract the half-precision result back out of the f32 FCSEL.
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
  }

  return Res;
}

/// Lower ISD::JumpTable to the materialized address of the table, picking the
/// addressing strategy for the active code model.
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  CodeModel::Model CM = getTargetMachine().getCodeModel();
  if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
      !Subtarget->isTargetMachO())
    return getAddrLarge(JT, DAG);
  if (CM == CodeModel::Tiny)
    return getAddrTiny(JT, DAG);
  return getAddr(JT, DAG);
}

/// Lower ISD::BR_JT (indirect branch through a jump table) via the
/// JumpTableDest32 pseudo, which computes the destination from the table
/// base and the entry index.
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
                                          SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  SDLoc DL(Op);
  SDValue JT = Op.getOperand(1);
  SDValue Entry = Op.getOperand(2);
  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();

  // Record that this table uses 4-byte (32-bit offset) entries.
  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  AFI->setJumpTableEntryInfo(JTI, 4, nullptr);

  SDNode *Dest =
      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
  SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
  return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
}

/// Lower ISD::ConstantPool to the address of the pool entry, picking the
/// addressing strategy for the active code model.
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  CodeModel::Model CM = getTargetMachine().getCodeModel();
  if (CM == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
    if (Subtarget->isTargetMachO()) {
      return getGOT(CP, DAG);
    }
    if (!getTargetMachine().isPositionIndependent())
      return getAddrLarge(CP, DAG);
  } else if (CM == CodeModel::Tiny) {
    return getAddrTiny(CP, DAG);
  }
  return getAddr(CP, DAG);
}

/// Lower ISD::BlockAddress to the materialized address of the basic block,
/// picking the addressing strategy for the active code model.
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
  CodeModel::Model CM = getTargetMachine().getCodeModel();
  if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
    if (!getTargetMachine().isPositionIndependent())
      return getAddrLarge(BA, DAG);
  } else if (CM == CodeModel::Tiny) {
    return getAddrTiny(BA, DAG);
  }
  return getAddr(BA, DAG);
}

/// Lower va_start for Darwin: the va_list is a single pointer, so just store
/// the address of the varargs stack area into it.
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
                                 getPointerTy(DAG.getDataLayout()));
  // Truncate/extend for ILP32, where the in-memory pointer type differs.
  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

/// Lower va_start for Win64: the va_list is a single pointer to the varargs
/// save area (GPR home area when present, otherwise the stack area). On
/// Arm64EC the address is computed relative to the incoming x4 value rather
/// than a frame index.
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR;
  if (Subtarget->isWindowsArm64EC()) {
    // With the Arm64EC ABI, we compute the address of the varargs save area
    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
    // but calls from an entry thunk can pass in a different address.
    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
    uint64_t StackOffset;
    if (FuncInfo->getVarArgsGPRSize() > 0)
      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
    else
      StackOffset = FuncInfo->getVarArgsStackOffset();
    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
                     DAG.getConstant(StackOffset, DL, MVT::i64));
  } else {
    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
                               ? FuncInfo->getVarArgsGPRIndex()
                               : FuncInfo->getVarArgsStackIndex(),
                           getPointerTy(DAG.getDataLayout()));
  }
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}

SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // The layout of the va_list struct is specified in the AArch64 Procedure Call
  // Standard, section B.3.
9827 MachineFunction &MF = DAG.getMachineFunction(); 9828 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 9829 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; 9830 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 9831 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9832 SDLoc DL(Op); 9833 9834 SDValue Chain = Op.getOperand(0); 9835 SDValue VAList = Op.getOperand(1); 9836 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9837 SmallVector<SDValue, 4> MemOps; 9838 9839 // void *__stack at offset 0 9840 unsigned Offset = 0; 9841 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 9842 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); 9843 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 9844 MachinePointerInfo(SV), Align(PtrSize))); 9845 9846 // void *__gr_top at offset 8 (4 on ILP32) 9847 Offset += PtrSize; 9848 int GPRSize = FuncInfo->getVarArgsGPRSize(); 9849 if (GPRSize > 0) { 9850 SDValue GRTop, GRTopAddr; 9851 9852 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9853 DAG.getConstant(Offset, DL, PtrVT)); 9854 9855 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 9856 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 9857 DAG.getConstant(GPRSize, DL, PtrVT)); 9858 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); 9859 9860 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 9861 MachinePointerInfo(SV, Offset), 9862 Align(PtrSize))); 9863 } 9864 9865 // void *__vr_top at offset 16 (8 on ILP32) 9866 Offset += PtrSize; 9867 int FPRSize = FuncInfo->getVarArgsFPRSize(); 9868 if (FPRSize > 0) { 9869 SDValue VRTop, VRTopAddr; 9870 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9871 DAG.getConstant(Offset, DL, PtrVT)); 9872 9873 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 9874 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 9875 DAG.getConstant(FPRSize, DL, PtrVT)); 9876 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); 9877 9878 
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 9879 MachinePointerInfo(SV, Offset), 9880 Align(PtrSize))); 9881 } 9882 9883 // int __gr_offs at offset 24 (12 on ILP32) 9884 Offset += PtrSize; 9885 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9886 DAG.getConstant(Offset, DL, PtrVT)); 9887 MemOps.push_back( 9888 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), 9889 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); 9890 9891 // int __vr_offs at offset 28 (16 on ILP32) 9892 Offset += 4; 9893 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9894 DAG.getConstant(Offset, DL, PtrVT)); 9895 MemOps.push_back( 9896 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), 9897 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); 9898 9899 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 9900 } 9901 9902 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 9903 SelectionDAG &DAG) const { 9904 MachineFunction &MF = DAG.getMachineFunction(); 9905 9906 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) 9907 return LowerWin64_VASTART(Op, DAG); 9908 else if (Subtarget->isTargetDarwin()) 9909 return LowerDarwin_VASTART(Op, DAG); 9910 else 9911 return LowerAAPCS_VASTART(Op, DAG); 9912 } 9913 9914 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 9915 SelectionDAG &DAG) const { 9916 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 9917 // pointer. 9918 SDLoc DL(Op); 9919 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; 9920 unsigned VaListSize = 9921 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) 9922 ? PtrSize 9923 : Subtarget->isTargetILP32() ? 
20 : 32; 9924 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 9925 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 9926 9927 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), 9928 DAG.getConstant(VaListSize, DL, MVT::i32), 9929 Align(PtrSize), false, false, false, 9930 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 9931 } 9932 9933 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 9934 assert(Subtarget->isTargetDarwin() && 9935 "automatic va_arg instruction only works on Darwin"); 9936 9937 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 9938 EVT VT = Op.getValueType(); 9939 SDLoc DL(Op); 9940 SDValue Chain = Op.getOperand(0); 9941 SDValue Addr = Op.getOperand(1); 9942 MaybeAlign Align(Op.getConstantOperandVal(3)); 9943 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; 9944 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9945 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); 9946 SDValue VAList = 9947 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); 9948 Chain = VAList.getValue(1); 9949 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); 9950 9951 if (VT.isScalableVector()) 9952 report_fatal_error("Passing SVE types to variadic functions is " 9953 "currently not supported"); 9954 9955 if (Align && *Align > MinSlotSize) { 9956 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9957 DAG.getConstant(Align->value() - 1, DL, PtrVT)); 9958 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 9959 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); 9960 } 9961 9962 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 9963 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 9964 9965 // Scalar integer and FP values smaller than 64 bits are implicitly extended 9966 // up to 64 bits. 
At the very least, we have to increase the striding of the 9967 // vaargs list to match this, and for FP values we need to introduce 9968 // FP_ROUND nodes as well. 9969 if (VT.isInteger() && !VT.isVector()) 9970 ArgSize = std::max(ArgSize, MinSlotSize); 9971 bool NeedFPTrunc = false; 9972 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 9973 ArgSize = 8; 9974 NeedFPTrunc = true; 9975 } 9976 9977 // Increment the pointer, VAList, to the next vaarg 9978 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 9979 DAG.getConstant(ArgSize, DL, PtrVT)); 9980 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); 9981 9982 // Store the incremented VAList to the legalized pointer 9983 SDValue APStore = 9984 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); 9985 9986 // Load the actual argument out of the pointer VAList 9987 if (NeedFPTrunc) { 9988 // Load the value as an f64. 9989 SDValue WideFP = 9990 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); 9991 // Round the value down to an f32. 9992 SDValue NarrowFP = 9993 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 9994 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true)); 9995 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 9996 // Merge the rounded value with the chain output of the load. 
9997 return DAG.getMergeValues(Ops, DL); 9998 } 9999 10000 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); 10001 } 10002 10003 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 10004 SelectionDAG &DAG) const { 10005 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10006 MFI.setFrameAddressIsTaken(true); 10007 10008 EVT VT = Op.getValueType(); 10009 SDLoc DL(Op); 10010 unsigned Depth = Op.getConstantOperandVal(0); 10011 SDValue FrameAddr = 10012 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); 10013 while (Depth--) 10014 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 10015 MachinePointerInfo()); 10016 10017 if (Subtarget->isTargetILP32()) 10018 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, 10019 DAG.getValueType(VT)); 10020 10021 return FrameAddr; 10022 } 10023 10024 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, 10025 SelectionDAG &DAG) const { 10026 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10027 10028 EVT VT = getPointerTy(DAG.getDataLayout()); 10029 SDLoc DL(Op); 10030 int FI = MFI.CreateFixedObject(4, 0, false); 10031 return DAG.getFrameIndex(FI, VT); 10032 } 10033 10034 #define GET_REGISTER_MATCHER 10035 #include "AArch64GenAsmMatcher.inc" 10036 10037 // FIXME? Maybe this could be a TableGen attribute on some registers and 10038 // this table could be generated automatically from RegInfo. 
10039 Register AArch64TargetLowering:: 10040 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { 10041 Register Reg = MatchRegisterName(RegName); 10042 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { 10043 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo(); 10044 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); 10045 if (!Subtarget->isXRegisterReserved(DwarfRegNum) && 10046 !MRI->isReservedReg(MF, Reg)) 10047 Reg = 0; 10048 } 10049 if (Reg) 10050 return Reg; 10051 report_fatal_error(Twine("Invalid register name \"" 10052 + StringRef(RegName) + "\".")); 10053 } 10054 10055 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, 10056 SelectionDAG &DAG) const { 10057 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); 10058 10059 EVT VT = Op.getValueType(); 10060 SDLoc DL(Op); 10061 10062 SDValue FrameAddr = 10063 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 10064 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 10065 10066 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); 10067 } 10068 10069 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 10070 SelectionDAG &DAG) const { 10071 MachineFunction &MF = DAG.getMachineFunction(); 10072 MachineFrameInfo &MFI = MF.getFrameInfo(); 10073 MFI.setReturnAddressIsTaken(true); 10074 10075 EVT VT = Op.getValueType(); 10076 SDLoc DL(Op); 10077 unsigned Depth = Op.getConstantOperandVal(0); 10078 SDValue ReturnAddress; 10079 if (Depth) { 10080 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 10081 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 10082 ReturnAddress = DAG.getLoad( 10083 VT, DL, DAG.getEntryNode(), 10084 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); 10085 } else { 10086 // Return LR, which contains the return address. Mark it an implicit 10087 // live-in. 
10088 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 10089 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 10090 } 10091 10092 // The XPACLRI instruction assembles to a hint-space instruction before 10093 // Armv8.3-A therefore this instruction can be safely used for any pre 10094 // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use 10095 // that instead. 10096 SDNode *St; 10097 if (Subtarget->hasPAuth()) { 10098 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress); 10099 } else { 10100 // XPACLRI operates on LR therefore we must move the operand accordingly. 10101 SDValue Chain = 10102 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress); 10103 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain); 10104 } 10105 return SDValue(St, 0); 10106 } 10107 10108 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two 10109 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 10110 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op, 10111 SelectionDAG &DAG) const { 10112 SDValue Lo, Hi; 10113 expandShiftParts(Op.getNode(), Lo, Hi, DAG); 10114 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); 10115 } 10116 10117 bool AArch64TargetLowering::isOffsetFoldingLegal( 10118 const GlobalAddressSDNode *GA) const { 10119 // Offsets are folded in the DAG combine rather than here so that we can 10120 // intelligently choose an offset based on the uses. 10121 return false; 10122 } 10123 10124 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 10125 bool OptForSize) const { 10126 bool IsLegal = false; 10127 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and 10128 // 16-bit case when target has full fp16 support. 10129 // FIXME: We should be able to handle f128 as well with a clever lowering. 
10130 const APInt ImmInt = Imm.bitcastToAPInt(); 10131 if (VT == MVT::f64) 10132 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero(); 10133 else if (VT == MVT::f32) 10134 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero(); 10135 else if (VT == MVT::f16 || VT == MVT::bf16) 10136 IsLegal = 10137 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) || 10138 Imm.isPosZero(); 10139 10140 // If we can not materialize in immediate field for fmov, check if the 10141 // value can be encoded as the immediate operand of a logical instruction. 10142 // The immediate value will be created with either MOVZ, MOVN, or ORR. 10143 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to 10144 // generate that fmov. 10145 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { 10146 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; 10147 // however the mov+fmov sequence is always better because of the reduced 10148 // cache pressure. The timings are still the same if you consider 10149 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the 10150 // movw+movk is fused). So we limit up to 2 instrdduction at most. 10151 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 10152 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn); 10153 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); 10154 IsLegal = Insn.size() <= Limit; 10155 } 10156 10157 LLVM_DEBUG(dbgs() << (IsLegal ? 
"Legal " : "Illegal ") << VT 10158 << " imm value: "; Imm.dump();); 10159 return IsLegal; 10160 } 10161 10162 //===----------------------------------------------------------------------===// 10163 // AArch64 Optimization Hooks 10164 //===----------------------------------------------------------------------===// 10165 10166 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, 10167 SDValue Operand, SelectionDAG &DAG, 10168 int &ExtraSteps) { 10169 EVT VT = Operand.getValueType(); 10170 if ((ST->hasNEON() && 10171 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || 10172 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || 10173 VT == MVT::v4f32)) || 10174 (ST->hasSVE() && 10175 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { 10176 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) 10177 // For the reciprocal estimates, convergence is quadratic, so the number 10178 // of digits is doubled after each iteration. In ARMv8, the accuracy of 10179 // the initial estimate is 2^-8. Thus the number of extra steps to refine 10180 // the result for float (23 mantissa bits) is 2 and for double (52 10181 // mantissa bits) is 3. 10182 ExtraSteps = VT.getScalarType() == MVT::f64 ? 
3 : 2; 10183 10184 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 10185 } 10186 10187 return SDValue(); 10188 } 10189 10190 SDValue 10191 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, 10192 const DenormalMode &Mode) const { 10193 SDLoc DL(Op); 10194 EVT VT = Op.getValueType(); 10195 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 10196 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 10197 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); 10198 } 10199 10200 SDValue 10201 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, 10202 SelectionDAG &DAG) const { 10203 return Op; 10204 } 10205 10206 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, 10207 SelectionDAG &DAG, int Enabled, 10208 int &ExtraSteps, 10209 bool &UseOneConst, 10210 bool Reciprocal) const { 10211 if (Enabled == ReciprocalEstimate::Enabled || 10212 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) 10213 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, 10214 DAG, ExtraSteps)) { 10215 SDLoc DL(Operand); 10216 EVT VT = Operand.getValueType(); 10217 10218 SDNodeFlags Flags; 10219 Flags.setAllowReassociation(true); 10220 10221 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) 10222 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) 10223 for (int i = ExtraSteps; i > 0; --i) { 10224 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, 10225 Flags); 10226 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); 10227 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 10228 } 10229 if (!Reciprocal) 10230 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); 10231 10232 ExtraSteps = 0; 10233 return Estimate; 10234 } 10235 10236 return SDValue(); 10237 } 10238 10239 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 10240 SelectionDAG &DAG, int Enabled, 10241 int 
&ExtraSteps) const { 10242 if (Enabled == ReciprocalEstimate::Enabled) 10243 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, 10244 DAG, ExtraSteps)) { 10245 SDLoc DL(Operand); 10246 EVT VT = Operand.getValueType(); 10247 10248 SDNodeFlags Flags; 10249 Flags.setAllowReassociation(true); 10250 10251 // Newton reciprocal iteration: E * (2 - X * E) 10252 // AArch64 reciprocal iteration instruction: (2 - M * N) 10253 for (int i = ExtraSteps; i > 0; --i) { 10254 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, 10255 Estimate, Flags); 10256 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 10257 } 10258 10259 ExtraSteps = 0; 10260 return Estimate; 10261 } 10262 10263 return SDValue(); 10264 } 10265 10266 //===----------------------------------------------------------------------===// 10267 // AArch64 Inline Assembly Support 10268 //===----------------------------------------------------------------------===// 10269 10270 // Table of Constraints 10271 // TODO: This is the current set of constraints supported by ARM for the 10272 // compiler, not all of them may make sense. 
10273 // 10274 // r - A general register 10275 // w - An FP/SIMD register of some size in the range v0-v31 10276 // x - An FP/SIMD register of some size in the range v0-v15 10277 // I - Constant that can be used with an ADD instruction 10278 // J - Constant that can be used with a SUB instruction 10279 // K - Constant that can be used with a 32-bit logical instruction 10280 // L - Constant that can be used with a 64-bit logical instruction 10281 // M - Constant that can be used as a 32-bit MOV immediate 10282 // N - Constant that can be used as a 64-bit MOV immediate 10283 // Q - A memory reference with base register and no offset 10284 // S - A symbolic address 10285 // Y - Floating point constant zero 10286 // Z - Integer constant zero 10287 // 10288 // Note that general register operands will be output using their 64-bit x 10289 // register name, whatever the size of the variable, unless the asm operand 10290 // is prefixed by the %w modifier. Floating-point and SIMD register operands 10291 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 10292 // %q modifier. 10293 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 10294 // At this point, we have to lower this constraint to something else, so we 10295 // lower it to an "r" or "w". However, by doing this we will force the result 10296 // to be in register, while the X constraint is much more permissive. 10297 // 10298 // Although we are correct (we are free to emit anything, without 10299 // constraints), we might break use cases that would expect us to be more 10300 // efficient and emit something else. 
10301 if (!Subtarget->hasFPARMv8()) 10302 return "r"; 10303 10304 if (ConstraintVT.isFloatingPoint()) 10305 return "w"; 10306 10307 if (ConstraintVT.isVector() && 10308 (ConstraintVT.getSizeInBits() == 64 || 10309 ConstraintVT.getSizeInBits() == 128)) 10310 return "w"; 10311 10312 return "r"; 10313 } 10314 10315 enum class PredicateConstraint { Uph, Upl, Upa }; 10316 10317 static std::optional<PredicateConstraint> 10318 parsePredicateConstraint(StringRef Constraint) { 10319 return StringSwitch<std::optional<PredicateConstraint>>(Constraint) 10320 .Case("Uph", PredicateConstraint::Uph) 10321 .Case("Upl", PredicateConstraint::Upl) 10322 .Case("Upa", PredicateConstraint::Upa) 10323 .Default(std::nullopt); 10324 } 10325 10326 static const TargetRegisterClass * 10327 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) { 10328 if (VT != MVT::aarch64svcount && 10329 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)) 10330 return nullptr; 10331 10332 switch (Constraint) { 10333 case PredicateConstraint::Uph: 10334 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass 10335 : &AArch64::PPR_p8to15RegClass; 10336 case PredicateConstraint::Upl: 10337 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass 10338 : &AArch64::PPR_3bRegClass; 10339 case PredicateConstraint::Upa: 10340 return VT == MVT::aarch64svcount ? 
&AArch64::PNRRegClass 10341 : &AArch64::PPRRegClass; 10342 } 10343 10344 llvm_unreachable("Missing PredicateConstraint!"); 10345 } 10346 10347 enum class ReducedGprConstraint { Uci, Ucj }; 10348 10349 static std::optional<ReducedGprConstraint> 10350 parseReducedGprConstraint(StringRef Constraint) { 10351 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint) 10352 .Case("Uci", ReducedGprConstraint::Uci) 10353 .Case("Ucj", ReducedGprConstraint::Ucj) 10354 .Default(std::nullopt); 10355 } 10356 10357 static const TargetRegisterClass * 10358 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) { 10359 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64) 10360 return nullptr; 10361 10362 switch (Constraint) { 10363 case ReducedGprConstraint::Uci: 10364 return &AArch64::MatrixIndexGPR32_8_11RegClass; 10365 case ReducedGprConstraint::Ucj: 10366 return &AArch64::MatrixIndexGPR32_12_15RegClass; 10367 } 10368 10369 llvm_unreachable("Missing ReducedGprConstraint!"); 10370 } 10371 10372 // The set of cc code supported is from 10373 // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands 10374 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) { 10375 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint) 10376 .Case("{@cchi}", AArch64CC::HI) 10377 .Case("{@cccs}", AArch64CC::HS) 10378 .Case("{@cclo}", AArch64CC::LO) 10379 .Case("{@ccls}", AArch64CC::LS) 10380 .Case("{@cccc}", AArch64CC::LO) 10381 .Case("{@cceq}", AArch64CC::EQ) 10382 .Case("{@ccgt}", AArch64CC::GT) 10383 .Case("{@ccge}", AArch64CC::GE) 10384 .Case("{@cclt}", AArch64CC::LT) 10385 .Case("{@ccle}", AArch64CC::LE) 10386 .Case("{@cchs}", AArch64CC::HS) 10387 .Case("{@ccne}", AArch64CC::NE) 10388 .Case("{@ccvc}", AArch64CC::VC) 10389 .Case("{@ccpl}", AArch64CC::PL) 10390 .Case("{@ccvs}", AArch64CC::VS) 10391 .Case("{@ccmi}", AArch64CC::MI) 10392 .Default(AArch64CC::Invalid); 10393 return Cond; 10394 } 10395 10396 /// 
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, 10397 /// WZR, invert(<cond>)'. 10398 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, 10399 SelectionDAG &DAG) { 10400 return DAG.getNode( 10401 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), 10402 DAG.getConstant(0, DL, MVT::i32), 10403 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV); 10404 } 10405 10406 // Lower @cc flag output via getSETCC. 10407 SDValue AArch64TargetLowering::LowerAsmOutputForConstraint( 10408 SDValue &Chain, SDValue &Glue, const SDLoc &DL, 10409 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { 10410 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); 10411 if (Cond == AArch64CC::Invalid) 10412 return SDValue(); 10413 // The output variable should be a scalar integer. 10414 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || 10415 OpInfo.ConstraintVT.getSizeInBits() < 8) 10416 report_fatal_error("Flag output operand is of invalid type"); 10417 10418 // Get NZCV register. Only update chain when copyfrom is glued. 10419 if (Glue.getNode()) { 10420 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue); 10421 Chain = Glue.getValue(1); 10422 } else 10423 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32); 10424 // Extract CC code. 10425 SDValue CC = getSETCC(Cond, Glue, DL, DAG); 10426 10427 SDValue Result; 10428 10429 // Truncate or ZERO_EXTEND based on value types. 10430 if (OpInfo.ConstraintVT.getSizeInBits() <= 32) 10431 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC); 10432 else 10433 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); 10434 10435 return Result; 10436 } 10437 10438 /// getConstraintType - Given a constraint letter, return the type of 10439 /// constraint it is for this target. 
10440 AArch64TargetLowering::ConstraintType 10441 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 10442 if (Constraint.size() == 1) { 10443 switch (Constraint[0]) { 10444 default: 10445 break; 10446 case 'x': 10447 case 'w': 10448 case 'y': 10449 return C_RegisterClass; 10450 // An address with a single base register. Due to the way we 10451 // currently handle addresses it is the same as 'r'. 10452 case 'Q': 10453 return C_Memory; 10454 case 'I': 10455 case 'J': 10456 case 'K': 10457 case 'L': 10458 case 'M': 10459 case 'N': 10460 case 'Y': 10461 case 'Z': 10462 return C_Immediate; 10463 case 'z': 10464 case 'S': // A symbolic address 10465 return C_Other; 10466 } 10467 } else if (parsePredicateConstraint(Constraint)) 10468 return C_RegisterClass; 10469 else if (parseReducedGprConstraint(Constraint)) 10470 return C_RegisterClass; 10471 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid) 10472 return C_Other; 10473 return TargetLowering::getConstraintType(Constraint); 10474 } 10475 10476 /// Examine constraint type and operand type and determine a weight value. 10477 /// This object must already have been set up with the operand type 10478 /// and the current alternative constraint selected. 10479 TargetLowering::ConstraintWeight 10480 AArch64TargetLowering::getSingleConstraintMatchWeight( 10481 AsmOperandInfo &info, const char *constraint) const { 10482 ConstraintWeight weight = CW_Invalid; 10483 Value *CallOperandVal = info.CallOperandVal; 10484 // If we don't have a value, we can't do a match, 10485 // but allow it at the lowest weight. 10486 if (!CallOperandVal) 10487 return CW_Default; 10488 Type *type = CallOperandVal->getType(); 10489 // Look at the constraint type. 
  // Weighting for the remaining single-letter constraints: 'w'/'x'/'y' prefer
  // FP/SIMD registers (only sensible for FP or vector values), 'z' matches the
  // zero register (an immediate 0), and 'U' covers the multi-character
  // predicate / reduced-GPR constraint families.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'x':
  case 'w':
  case 'y':
    if (type->isFloatingPointTy() || type->isVectorTy())
      weight = CW_Register;
    break;
  case 'z':
    weight = CW_Constant;
    break;
  case 'U':
    if (parsePredicateConstraint(constraint) ||
        parseReducedGprConstraint(constraint))
      weight = CW_Register;
    break;
  }
  return weight;
}

/// Map an inline-asm register constraint plus value type to a concrete
/// (register, register class) pair.
///
/// Handles the single-letter AArch64 constraints ('r' GPR, 'w' FP/SIMD/SVE,
/// 'x' low-16 FP/SIMD, 'y' low-8 SVE), the multi-character predicate and
/// reduced-GPR constraints, the "{cc}"/condition-code constraint (NZCV), and
/// the non-standard "{v<N>}" names that alias the d/q FP registers.
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'r':
      // Scalable vectors cannot live in general-purpose registers.
      if (VT.isScalableVector())
        return std::make_pair(0U, nullptr);
      // With LS64, 512-bit values occupy a consecutive-GPR x8 tuple.
      if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
        return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
      if (VT.getFixedSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
    case 'w': {
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector()) {
        // SVE data vectors go in Z registers; i1-element (predicate) vectors
        // are not valid for 'w'.
        if (VT.getVectorElementType() != MVT::i1)
          return std::make_pair(0U, &AArch64::ZPRRegClass);
        return std::make_pair(0U, nullptr);
      }
      // Choose the FP register class matching the value's bit width.
      uint64_t VTSize = VT.getFixedSizeInBits();
      if (VTSize == 16)
        return std::make_pair(0U, &AArch64::FPR16RegClass);
      if (VTSize == 32)
        return std::make_pair(0U, &AArch64::FPR32RegClass);
      if (VTSize == 64)
        return std::make_pair(0U, &AArch64::FPR64RegClass);
      if (VTSize == 128)
        return std::make_pair(0U, &AArch64::FPR128RegClass);
      break;
    }
    // The instructions that this constraint is designed for can
    // only take 128-bit registers so just use that regclass.
    case 'x':
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector())
        return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
      break;
    case 'y':
      // 'y' restricts SVE values to the low Z registers (ZPR_3b class).
      if (!Subtarget->hasFPARMv8())
        break;
      if (VT.isScalableVector())
        return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
      break;
    }
  } else {
    // Multi-character constraints: SVE predicate and reduced-GPR forms.
    if (const auto PC = parsePredicateConstraint(Constraint))
      if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
        return std::make_pair(0U, RegClass);

    if (const auto RGC = parseReducedGprConstraint(Constraint))
      if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
        return std::make_pair(0U, RegClass);
  }
  // "{cc}" (case-insensitive) or any recognised condition-code constraint
  // maps to the NZCV flags register.
  if (StringRef("{cc}").equals_insensitive(Constraint) ||
      parseConstraintCode(Constraint) != AArch64CC::Invalid)
    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    unsigned Size = Constraint.size();
    // Accept "{v0}".."{v31}" (with case-insensitive 'v') as aliases for the
    // FP/SIMD registers.
    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
      int RegNo;
      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
      if (!Failed && RegNo >= 0 && RegNo <= 31) {
        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
        // By default we'll emit v0-v31 for this unless there's a modifier where
        // we'll emit the correct register as well.
        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR64RegClass;
        } else {
          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
          Res.second = &AArch64::FPR128RegClass;
        }
      }
    }
  }

  // Without FP/SIMD support, only the GPR classes are usable; reject any
  // other resolved class.
  if (Res.second && !Subtarget->hasFPARMv8() &&
      !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
      !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
    return std::make_pair(0U, nullptr);

  return Res;
}

/// Return the EVT used to model an inline-asm operand of IR type Ty.
/// With LS64, 512-bit integer operands are modelled as the MVT::i64x8 tuple.
EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
                                                  llvm::Type *Ty,
                                                  bool AllowUnknown) const {
  if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
    return EVT(MVT::i64x8);

  return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.size() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deal with valid constants for various instructions.
  // Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
    if (!isNullConstant(Op))
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }
  case 'S': {
    // An absolute symbolic address or label reference.
    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                          GA->getValueType(0));
    } else if (const BlockAddressSDNode *BA =
                   dyn_cast<BlockAddressSDNode>(Op)) {
      Result =
          DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
    } else
      return;
    break;
  }

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      // Single MOVZ: one 16-bit chunk in either halfword position.
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      // Single MOVN: the 32-bit inverted value fits in one 16-bit chunk.
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      // Single MOVZ: one 16-bit chunk in any of the four positions.
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      // Single MOVN: likewise on the 64-bit inverted value.
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//

/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  // Insert the 64-bit value into the low half of an undef 128-bit vector.
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i64));
}

/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
  // EXT indexes by bytes, so the factor is the element width in bytes.
  EVT EltType = V.getValueType().getVectorElementType();
  return EltType.getSizeInBits() / 8;
}

// Check if a vector is built from one vector via extracted elements of
// another together with an AND mask, ensuring that all elements fit
// within range. This can be reconstructed using AND and NEON's TBL1.
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  assert(!VT.isScalableVector() &&
         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");

  // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
  // directly to TBL1.
  if (VT != MVT::v16i8 && VT != MVT::v8i8)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 8 || NumElts == 16) &&
         "Need to have exactly 8 or 16 elements in vector.");

  SDValue SourceVec;       // Single data vector all elements are taken from.
  SDValue MaskSourceVec;   // Single vector supplying the runtime indices.
  SmallVector<SDValue, 16> AndMaskConstants;

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Every element must be extracted from the same source vector.
    SDValue OperandSourceVec = V.getOperand(0);
    if (!SourceVec)
      SourceVec = OperandSourceVec;
    else if (SourceVec != OperandSourceVec)
      return SDValue();

    // This only looks at shuffles with elements that are
    // a) truncated by a constant AND mask extracted from a mask vector, or
    // b) extracted directly from a mask vector.
    SDValue MaskSource = V.getOperand(1);
    if (MaskSource.getOpcode() == ISD::AND) {
      if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
        return SDValue();

      AndMaskConstants.push_back(MaskSource.getOperand(1));
      MaskSource = MaskSource->getOperand(0);
    } else if (!AndMaskConstants.empty()) {
      // Either all or no operands should have an AND mask.
      return SDValue();
    }

    // An ANY_EXTEND may be inserted between the AND and the source vector
    // extraction. We don't care about that, so we can just skip it.
    if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
      MaskSource = MaskSource.getOperand(0);

    if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // The index driving output lane i must itself come from lane i of the
    // mask vector.
    SDValue MaskIdx = MaskSource.getOperand(1);
    if (!isa<ConstantSDNode>(MaskIdx) ||
        !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
      return SDValue();

    // We only apply this if all elements come from the same vector with the
    // same vector type.
    if (!MaskSourceVec) {
      MaskSourceVec = MaskSource->getOperand(0);
      if (MaskSourceVec.getValueType() != VT)
        return SDValue();
    } else if (MaskSourceVec != MaskSource->getOperand(0)) {
      return SDValue();
    }
  }

  // We need a v16i8 for TBL, so we extend the source with a placeholder vector
  // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
  // insert, we know that the index in the mask must be smaller than the number
  // of elements in the source, or we would have an out-of-bounds access.
  if (NumElts == 8)
    SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
                            DAG.getUNDEF(VT));

  // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
  if (!AndMaskConstants.empty())
    MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
                                DAG.getBuildVector(VT, dl, AndMaskConstants));

  return DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, VT,
      DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
      MaskSourceVec);
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
/// Attempt to turn a BUILD_VECTOR of EXTRACT_VECTOR_ELTs into a shuffle,
/// possibly combined with VEXT/EXT, bitcasts, subvector extraction, or (for
/// 3-4 sources) a TBL3/TBL4. Returns SDValue() if no profitable form exists.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  assert(!VT.isScalableVector() &&
         "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
  unsigned NumElts = VT.getVectorNumElements();

  // Bookkeeping for one distinct source vector of the BUILD_VECTOR.
  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt;
    unsigned MaxElt;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase;
    int WindowScale;

    ShuffleSourceInfo(SDValue Vec)
        : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
             !isa<ConstantSDNode>(V.getOperand(1)) ||
             V.getOperand(0).getValueType().isScalableVector()) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: "
                    "a shuffle can only come from building a vector from "
                    "various elements of other fixed-width vectors, provided "
                    "their indices are constant\n");
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = V.getConstantOperandVal(1);
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // If we have 3 or 4 sources, try to generate a TBL, which will at least be
  // better than moving to/from gpr registers for larger vectors.
  if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
    // Construct a mask for the tbl. We may need to adjust the index for types
    // larger than i8.
    SmallVector<unsigned, 16> Mask;
    unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
    for (unsigned I = 0; I < NumElts; ++I) {
      SDValue V = Op.getOperand(I);
      if (V.isUndef()) {
        // -1 wraps to an out-of-range TBL index, which produces 0 (any value
        // is acceptable for an undef lane).
        for (unsigned OF = 0; OF < OutputFactor; OF++)
          Mask.push_back(-1);
        continue;
      }
      // Set the Mask lanes adjusted for the size of the input and output
      // lanes. The Mask is always i8, so it will set OutputFactor lanes per
      // output element, adjusted in their positions per input and output types.
      unsigned Lane = V.getConstantOperandVal(1);
      for (unsigned S = 0; S < Sources.size(); S++) {
        if (V.getOperand(0) == Sources[S].Vec) {
          unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
          unsigned InputBase = 16 * S + Lane * InputSize / 8;
          for (unsigned OF = 0; OF < OutputFactor; OF++)
            Mask.push_back(InputBase + OF);
          break;
        }
      }
    }

    // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
    // v16i8, and the TBLMask
    SmallVector<SDValue, 16> TBLOperands;
    TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
                                              ? Intrinsic::aarch64_neon_tbl3
                                              : Intrinsic::aarch64_neon_tbl4,
                                          dl, MVT::i32));
    for (unsigned i = 0; i < Sources.size(); i++) {
      SDValue Src = Sources[i].Vec;
      EVT SrcVT = Src.getValueType();
      Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
      assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
             "Expected a legally typed vector");
      // TBL operands must be v16i8; pad 64-bit sources with undef.
      if (SrcVT.is64BitVector())
        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
                          DAG.getUNDEF(MVT::v8i8));
      TBLOperands.push_back(Src);
    }

    SmallVector<SDValue, 16> TBLMask;
    for (unsigned i = 0; i < Mask.size(); i++)
      TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
    assert((Mask.size() == 8 || Mask.size() == 16) &&
           "Expected a v8i8 or v16i8 Mask");
    TBLOperands.push_back(
        DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));

    SDValue Shuffle =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                    Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
    return DAG.getBitcast(VT, Shuffle);
  }

  if (Sources.size() > 2) {
    LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
                      << "sensible when at most two source vectors are "
                      << "involved\n");
    return SDValue();
  }

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy)) {
      SmallestEltTy = SrcEltTy;
    }
  }
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
  uint64_t VTSize = VT.getFixedSizeInBits();
  NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    TypeSize SrcVTSize = SrcVT.getSizeInBits();
    if (SrcVTSize == TypeSize::getFixed(VTSize))
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVTSize.getFixedValue() < VTSize) {
      assert(2 * SrcVTSize == VTSize);
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    if (SrcVTSize.getFixedValue() != 2 * VTSize) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: result vector too small to extract\n");
      return SDValue();
    }

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i64));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i64));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

      if (!SrcVT.is64BitVector()) {
        LLVM_DEBUG(
            dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
                      "for SVE vectors.");
        return SDValue();
      }

      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Imm, dl, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    // On big-endian a plain BITCAST would reorder lanes; NVCAST keeps the
    // register contents unchanged.
    if (DAG.getDataLayout().isBigEndian()) {
      Src.ShuffleVec =
          DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
    } else {
      Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    }
    Src.WindowScale =
        SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // Final check before we try to produce nonsense...
  if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
    LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
    return SDValue();
  }

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], Mask);
  SDValue V;
  if (DAG.getDataLayout().isBigEndian()) {
    V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
  } else {
    V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
  }

  LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
             dbgs() << "Reshuffle, creating node: "; V.dump(););

  return V;
}

// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i32s. This is really a truncate, which we can construct out of (legal)
// concats and truncate nodes.
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
  if (V.getValueType() != MVT::v16i8)
    return SDValue();
  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");

  for (unsigned X = 0; X < 4; X++) {
    // Check the first item in each group is an extract from lane 0 of a v4i32
    // or v4i16.
    SDValue BaseExt = V.getOperand(X * 4);
    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
        BaseExt.getConstantOperandVal(1) != 0)
      return SDValue();
    SDValue Base = BaseExt.getOperand(0);
    // And check the other items are extracts from the same vector.
    for (unsigned Y = 1; Y < 4; Y++) {
      SDValue Ext = V.getOperand(X * 4 + Y);
      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Ext.getOperand(0) != Base ||
          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
          Ext.getConstantOperandVal(1) != Y)
        return SDValue();
    }
  }

  // Turn the buildvector into a series of truncates and concates, which will
  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
  // concat together to produce 2 v8i16. These are both truncated and concat
  // together.
  SDLoc DL(V);
  SDValue Trunc[4] = {
      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
  for (SDValue &V : Trunc)
    if (V.getValueType() == MVT::v4i32)
      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
  SDValue Concat0 =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
  SDValue Concat1 =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
}

/// Check if a vector shuffle corresponds to a DUP instructions with a larger
/// element width than the vector lane type. If that is the case the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp
static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
                          unsigned &DupLaneOp) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for wide DUP are: 16, 32, 64");

  if (BlockSize <= VT.getScalarSizeInBits())
    return false;
  if (BlockSize % VT.getScalarSizeInBits() != 0)
    return false;
  if (VT.getSizeInBits() % BlockSize != 0)
    return false;

  size_t SingleVecNumElements = VT.getVectorNumElements();
  size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
  size_t NumBlocks = VT.getSizeInBits() / BlockSize;

  // We are looking for masks like
  // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
  // might be replaced by 'undefined'. BlockIndices will eventually contain
  // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
  // for the above examples)
  SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
  for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
    for (size_t I = 0; I < NumEltsPerBlock; I++) {
      int Elt = M[BlockIndex * NumEltsPerBlock + I];
      if (Elt < 0)
        continue;
      // For now we don't support shuffles that use the second operand
      if ((unsigned)Elt >= SingleVecNumElements)
        return false;
      if (BlockElts[I] < 0)
        BlockElts[I] = Elt;
      else if (BlockElts[I] != Elt)
        return false;
    }

  // We found a candidate block (possibly with some undefs). It must be a
  // sequence of consecutive integers starting with a value divisible by
  // NumEltsPerBlock with some values possibly replaced by undef-s.

  // Find first non-undef element
  auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
  assert(FirstRealEltIter != BlockElts.end() &&
         "Shuffle with all-undefs must have been caught by previous cases, "
         "e.g. isSplat()");
  // Release-build fallback for the all-undef case the assert rules out.
  if (FirstRealEltIter == BlockElts.end()) {
    DupLaneOp = 0;
    return true;
  }

  // Index of FirstRealElt in BlockElts
  size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();

  if ((unsigned)*FirstRealEltIter < FirstRealIndex)
    return false;
  // BlockElts[0] must have the following value if it isn't undef:
  size_t Elt0 = *FirstRealEltIter - FirstRealIndex;

  // Check the first element
  if (Elt0 % NumEltsPerBlock != 0)
    return false;
  // Check that the sequence indeed consists of consecutive integers (modulo
  // undefs)
  for (size_t I = 0; I < NumEltsPerBlock; I++)
    if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
      return false;

  DupLaneOp = Elt0 / NumEltsPerBlock;
  return true;
}

// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                      unsigned &Imm) {
  // Look for the first non-undef element.
  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

  // Benefit form APInt to handle overflow when calculating expected element.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  // The following shuffle indices must be the successive elements after the
  // first real element.
  bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
    return Elt != ExpectedElt++ && Elt != -1;
  });
  if (FoundWrongElt)
    return false;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element.  E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  Imm = ExpectedElt.getZExtValue();

  // There are two difference cases requiring to reverse input vectors.
  // For example, for vector <4 x i32> we have the following cases,
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // to reverse two input vectors.
  if (Imm < NumElts)
    ReverseEXT = true;
  else
    Imm -= NumElts;

  return true;
}

/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize.  (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
          BlockSize == 128) &&
         "Only possible block sizes for REV are: 16, 32, 64, 128");

  unsigned EltSz = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  // Each index must equal its position with the intra-block order reversed.
  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}

/// Check whether M is a ZIP1/ZIP2 mask; WhichResult selects which of the two
/// results (low-half or high-half interleave) it matches.
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  // Pairs must interleave lane Idx of the first source with lane
  // Idx + NumElts of the second.
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
      return false;
    Idx += 1;
  }

  return true;
}

/// Check whether M is a UZP1/UZP2 mask (even or odd elements of the
/// concatenated sources); WhichResult selects even (0) or odd (1).
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != 2 * i + WhichResult)
      return false;
  }

  return true;
}

/// Check whether M is a TRN1/TRN2 mask (transpose-like interleave of
/// even or odd lanes); WhichResult selects TRN1 (0) or TRN2 (1).
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }
  return true;
}

/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  // Same as isZIPMask, but both lanes of each pair come from the single
  // (duplicated) input, so they share the same expected index.
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
      return false;
    Idx += 1;
  }

  return true;
}

/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned Half = VT.getVectorNumElements() / 2;
  WhichResult = (M[0] == 0 ? 0 : 1);
  // Both halves of the mask must select the same even (or odd) elements of
  // the duplicated input.
  for (unsigned j = 0; j != 2; ++j) {
    unsigned Idx = WhichResult;
    for (unsigned i = 0; i != Half; ++i) {
      int MIdx = M[i + j * Half];
      if (MIdx >= 0 && (unsigned)MIdx != Idx)
        return false;
      Idx += 2;
    }
  }

  return true;
}

/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
      return false;
  }
  return true;
}

/// Check whether the shuffle is an identity copy of one input except for a
/// single element, i.e. can be lowered to an INS (insert lane) instruction.
/// DstIsLeft reports which input is the destination; Anomaly is the index of
/// the single mismatching lane.
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                      bool &DstIsLeft, int &Anomaly) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return false;

  int NumLHSMatch = 0, NumRHSMatch = 0;
  int LastLHSMismatch = -1, LastRHSMismatch = -1;

  for (int i = 0; i < NumInputElements; ++i) {
    // Undef lanes match either input.
    if (M[i] == -1) {
      ++NumLHSMatch;
      ++NumRHSMatch;
      continue;
    }

    if (M[i] == i)
      ++NumLHSMatch;
    else
      LastLHSMismatch = i;

    if (M[i] == i + NumInputElements)
      ++NumRHSMatch;
    else
      LastRHSMismatch = i;
  }

  // Exactly one lane differs from the identity of the chosen side.
  if (NumLHSMatch == NumInputElements - 1) {
    DstIsLeft = true;
    Anomaly = LastLHSMismatch;
    return true;
  } else if (NumRHSMatch == NumInputElements - 1) {
    DstIsLeft = false;
    Anomaly = LastRHSMismatch;
    return true;
  }

  return false;
}

/// Check whether the 128-bit shuffle selects the low half of the first input
/// followed by the low (SplitLHS) or high (!SplitLHS) half of the second.
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  if (VT.getSizeInBits() != 128)
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  // First half must be the identity of the first input.
  for (int I = 0, E = NumElts / 2; I != E; I++) {
    if (Mask[I] != I)
      return false;
  }

  int Offset = NumElts / 2;
  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
    if (Mask[I] != I + SplitLHS * Offset)
      return false;
  }

  return true;
}

/// Try to lower a shuffle to a CONCAT_VECTORS of (halves of) the two inputs.
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
    return SDValue();

  bool SplitV0 = V0.getValueSizeInBits() == 128;

  if (!isConcatMask(Mask, VT, SplitV0))
    return SDValue();

  // Reduce each 128-bit input to the 64-bit half that the mask selects.
  EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (SplitV0) {
    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  if (V1.getValueSizeInBits() == 128) {
    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
// table entry and LHS/RHS are the immediate inputs for this stage of the
// shuffle.
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
                                      SDValue V2, unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  // Decode the table entry: opcode plus two 13-bit sub-shuffle IDs.
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);

  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VREV,
    OP_VDUP0,
    OP_VDUP1,
    OP_VDUP2,
    OP_VDUP3,
    OP_VEXT1,
    OP_VEXT2,
    OP_VEXT3,
    OP_VUZPL,  // VUZP, left result
    OP_VUZPR,  // VUZP, right result
    OP_VZIPL,  // VZIP, left result
    OP_VZIPR,  // VZIP, right result
    OP_VTRNL,  // VTRN, left result
    OP_VTRNR,  // VTRN, right result
    OP_MOVLANE // Move lane. RHSID is the lane to move into
  };

  if (OpNum == OP_COPY) {
    // LHSID encodes either the identity mask <0,1,2,3> (take LHS) or
    // <4,5,6,7> (take RHS), base-9 packed.
    if (LHSID == (1 * 9 + 2) * 9 + 3)
      return LHS;
    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
    return RHS;
  }

  if (OpNum == OP_MOVLANE) {
    // Decompose a PerfectShuffle ID to get the Mask for lane Elt
    auto getPFIDLane = [](unsigned ID, int Elt) -> int {
      assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
      Elt = 3 - Elt;
      while (Elt > 0) {
        ID /= 9;
        Elt--;
      }
      // Digit 8 encodes undef.
      return (ID % 9 == 8) ? -1 : ID % 9;
    };

    // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
    // get the lane to move from the PFID, which is always from the
    // original vectors (V1 or V2).
    SDValue OpLHS = GeneratePerfectShuffle(
        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
    EVT VT = OpLHS.getValueType();
    assert(RHSID < 8 && "Expected a lane index for RHSID!");
    unsigned ExtLane = 0;
    SDValue Input;

    // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
    // convert into a higher type.
    if (RHSID & 0x4) {
      int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
      if (MaskElt == -1)
        MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
      ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
      Input = MaskElt < 2 ? V1 : V2;
      if (VT.getScalarSizeInBits() == 16) {
        Input = DAG.getBitcast(MVT::v2f32, Input);
        OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
      } else {
        assert(VT.getScalarSizeInBits() == 32 &&
               "Expected 16 or 32 bit shuffle elemements");
        Input = DAG.getBitcast(MVT::v2f64, Input);
        OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
      }
    } else {
      int MaskElt = getPFIDLane(ID, RHSID);
      assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
      ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
      Input = MaskElt < 4 ? V1 : V2;
      // Be careful about creating illegal types. Use f16 instead of i16.
      if (VT == MVT::v4i16) {
        Input = DAG.getBitcast(MVT::v4f16, Input);
        OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
      }
    }
    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              Input.getValueType().getVectorElementType(),
                              Input, DAG.getVectorIdxConstant(ExtLane, dl));
    SDValue Ins =
        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
                    Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
    return DAG.getBitcast(VT, Ins);
  }

  // Recursively materialize the two sub-shuffles, then combine them.
  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
                                 RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
                                 RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default:
    llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> REV32
    if (VT.getVectorElementType() == MVT::i16 ||
        VT.getVectorElementType() == MVT::f16 ||
        VT.getVectorElementType() == MVT::bf16)
      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> REV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3: {
    EVT EltTy = VT.getVectorElementType();
    unsigned Opcode;
    if (EltTy == MVT::i8)
      Opcode = AArch64ISD::DUPLANE8;
    else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
      Opcode = AArch64ISD::DUPLANE16;
    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
      Opcode = AArch64ISD::DUPLANE32;
    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
      Opcode = AArch64ISD::DUPLANE64;
    else
      llvm_unreachable("Invalid vector element type?");

    // DUPLANE expects a 128-bit source; widen 64-bit operands with undef.
    if (VT.getSizeInBits() == 64)
      OpLHS = WidenVector(OpLHS, DAG);
    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
  }
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3: {
    // EXT immediate is in bytes: scale the element count by element size.
    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
                       DAG.getConstant(Imm, dl, MVT::i32));
  }
  case OP_VUZPL:
    return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
  case OP_VUZPR:
    return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
  case OP_VZIPL:
    return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
  case OP_VZIPR:
    return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
  case OP_VTRNL:
    return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
  case OP_VTRNR:
    return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
  }
}

/// Lower an arbitrary shuffle to a TBL1/TBL2 intrinsic by building a byte
/// index vector from the shuffle mask.
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                           SelectionDAG &DAG) {
  // Check to see if we can use the TBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  EVT EltVT = Op.getValueType().getVectorElementType();
  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

  // Prefer the defined/non-zero input in the first slot so a TBL1 suffices.
  bool Swap = false;
  if (V1.isUndef() || isZerosVector(V1.getNode())) {
    std::swap(V1, V2);
    Swap = true;
  }

  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
  // out of range values with 0s. We do need to make sure that any out-of-range
  // values are really out-of-range for a v16i8 vector.
  bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
  MVT IndexVT = MVT::v8i8;
  unsigned IndexLen = 8;
  if (Op.getValueSizeInBits() == 128) {
    IndexVT = MVT::v16i8;
    IndexLen = 16;
  }

  // Expand each mask element into per-byte TBL indices.
  SmallVector<SDValue, 8> TBLMask;
  for (int Val : ShuffleMask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      // Compensate for the swap above by re-mapping indices across inputs.
      if (Swap)
        Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
      if (IsUndefOrZero && Offset >= IndexLen)
        Offset = 255;
      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
    }
  }

  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

  SDValue Shuffle;
  if (IsUndefOrZero) {
    if (IndexLen == 8)
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
    Shuffle = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
        DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
  } else {
    if (IndexLen == 8) {
      // Both 64-bit inputs fit in one 128-bit table register.
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
    } else {
      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
      // cannot currently represent the register constraints on the input
      // table registers.
      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
      //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
      //                   IndexLen));
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
          V2Cst,
          DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
    }
  }
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}

/// Map an element type to the corresponding AArch64 DUPLANE node opcode.
static unsigned getDUPLANEOp(EVT EltType) {
  if (EltType == MVT::i8)
    return AArch64ISD::DUPLANE8;
  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
    return AArch64ISD::DUPLANE16;
  if (EltType == MVT::i32 || EltType == MVT::f32)
    return AArch64ISD::DUPLANE32;
  if (EltType == MVT::i64 || EltType == MVT::f64)
    return AArch64ISD::DUPLANE64;

  llvm_unreachable("Invalid vector element type?");
}

/// Build a DUPLANE node duplicating lane \p Lane of \p V, looking through
/// bitcasts, subvector extracts and concats to widen the source (and adjust
/// the lane) where possible.
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
                            unsigned Opcode, SelectionDAG &DAG) {
  // Try to eliminate a bitcasted extract subvector before a DUPLANE.
  auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
    // Match: dup (bitcast (extract_subv X, C)), LaneC
    if (BitCast.getOpcode() != ISD::BITCAST ||
        BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return false;

    // The extract index must align in the destination type. That may not
    // happen if the bitcast is from narrow to wide type.
    SDValue Extract = BitCast.getOperand(0);
    unsigned ExtIdx = Extract.getConstantOperandVal(1);
    unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
    unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
    unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
    if (ExtIdxInBits % CastedEltBitWidth != 0)
      return false;

    // Can't handle cases where vector size is not 128-bit
    if (!Extract.getOperand(0).getValueType().is128BitVector())
      return false;

    // Update the lane value by offsetting with the scaled extract index.
    LaneC += ExtIdxInBits / CastedEltBitWidth;

    // Determine the casted vector type of the wide vector input.
    // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
    // Examples:
    // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
    // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
    unsigned SrcVecNumElts =
        Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
    CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
                              SrcVecNumElts);
    return true;
  };
  MVT CastVT;
  if (getScaledOffsetDup(V, Lane, CastVT)) {
    V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
  } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
             V.getOperand(0).getValueType().is128BitVector()) {
    // The lane is incremented by the index of the extract.
    // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
    Lane += V.getConstantOperandVal(1);
    V = V.getOperand(0);
  } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
    // The lane is decremented if we are splatting from the 2nd operand.
    // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
    unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
    Lane -= Idx * VT.getVectorNumElements() / 2;
    V = WidenVector(V.getOperand(Idx), DAG);
  } else if (VT.getSizeInBits() == 64) {
    // Widen the operand to 128-bit register with undef.
    V = WidenVector(V, DAG);
  }
  return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
}

// Return true if we can get a new shuffle mask by checking the parameter mask
// array to test whether every two adjacent mask values are continuous and
// starting from an even number.
static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
                           SmallVectorImpl<int> &NewMask) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  NewMask.clear();
  for (unsigned i = 0; i < NumElts; i += 2) {
    int M0 = M[i];
    int M1 = M[i + 1];

    // If both elements are undef, new mask is undef too.
    if (M0 == -1 && M1 == -1) {
      NewMask.push_back(-1);
      continue;
    }

    // First element undef: the odd second element alone determines the pair.
    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
      NewMask.push_back(M1 / 2);
      continue;
    }

    // Even first element followed by its successor (or undef).
    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
      NewMask.push_back(M0 / 2);
      continue;
    }

    NewMask.clear();
    return false;
  }

  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  return true;
}

// Try to widen element type to get a new mask value for a better permutation
// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
// UZP1/2, TRN1/2, REV, INS, etc.
11944 // For example: 11945 // shufflevector <4 x i32> %a, <4 x i32> %b, 11946 // <4 x i32> <i32 6, i32 7, i32 2, i32 3> 11947 // is equivalent to: 11948 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> 11949 // Finally, we can get: 11950 // mov v0.d[0], v1.d[1] 11951 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) { 11952 SDLoc DL(Op); 11953 EVT VT = Op.getValueType(); 11954 EVT ScalarVT = VT.getVectorElementType(); 11955 unsigned ElementSize = ScalarVT.getFixedSizeInBits(); 11956 SDValue V0 = Op.getOperand(0); 11957 SDValue V1 = Op.getOperand(1); 11958 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 11959 11960 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ... 11961 // We need to make sure the wider element type is legal. Thus, ElementSize 11962 // should be not larger than 32 bits, and i1 type should also be excluded. 11963 if (ElementSize > 32 || ElementSize == 1) 11964 return SDValue(); 11965 11966 SmallVector<int, 8> NewMask; 11967 if (isWideTypeMask(Mask, VT, NewMask)) { 11968 MVT NewEltVT = VT.isFloatingPoint() 11969 ? MVT::getFloatingPointVT(ElementSize * 2) 11970 : MVT::getIntegerVT(ElementSize * 2); 11971 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); 11972 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { 11973 V0 = DAG.getBitcast(NewVT, V0); 11974 V1 = DAG.getBitcast(NewVT, V1); 11975 return DAG.getBitcast(VT, 11976 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask)); 11977 } 11978 } 11979 11980 return SDValue(); 11981 } 11982 11983 // Try to fold shuffle (tbl2, tbl2) into a single tbl4. 
11984 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, 11985 ArrayRef<int> ShuffleMask, 11986 SelectionDAG &DAG) { 11987 SDValue Tbl1 = Op->getOperand(0); 11988 SDValue Tbl2 = Op->getOperand(1); 11989 SDLoc dl(Op); 11990 SDValue Tbl2ID = 11991 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64); 11992 11993 EVT VT = Op.getValueType(); 11994 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN || 11995 Tbl1->getOperand(0) != Tbl2ID || 11996 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN || 11997 Tbl2->getOperand(0) != Tbl2ID) 11998 return SDValue(); 11999 12000 if (Tbl1->getValueType(0) != MVT::v16i8 || 12001 Tbl2->getValueType(0) != MVT::v16i8) 12002 return SDValue(); 12003 12004 SDValue Mask1 = Tbl1->getOperand(3); 12005 SDValue Mask2 = Tbl2->getOperand(3); 12006 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue()); 12007 for (unsigned I = 0; I < 16; I++) { 12008 if (ShuffleMask[I] < 16) 12009 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]); 12010 else { 12011 auto *C = 12012 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16)); 12013 if (!C) 12014 return SDValue(); 12015 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32); 12016 } 12017 } 12018 12019 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts); 12020 SDValue ID = 12021 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64); 12022 12023 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8, 12024 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2), 12025 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask}); 12026 } 12027 12028 // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros, 12029 // but we don't have an appropriate instruction, 12030 // so custom-lower it as ZIP1-with-zeros. 
SDValue
AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue SrcOp = Op.getOperand(0);
  EVT SrcVT = SrcOp.getValueType();
  assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
         "Unexpected extension factor.");
  unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
  // FIXME: support multi-step zipping?
  if (Scale != 2)
    return SDValue();
  // Interleave the source with zeros: each lane is followed by a zero lane,
  // which is exactly an in-register zero-extension by a factor of two.
  SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
  return DAG.getBitcast(VT,
                        DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
}

SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
  assert(ShuffleMask.size() == VT.getVectorNumElements() &&
         "Unexpected VECTOR_SHUFFLE mask size!");

  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
    return Res;

  if (SVN->isSplat()) {
    int Lane = SVN->getSplatIndex();
    // If this is undef splat, generate it via "just" vdup, if possible.
    if (Lane == -1)
      Lane = 0;

    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
                         V1.getOperand(0));
    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
    // constant. If so, we can just reference the lane's definition directly.
    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
        !isa<ConstantSDNode>(V1.getOperand(Lane)))
      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

    // Otherwise, duplicate from the lane of the input vector.
    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
    return constructDup(V1, Lane, dl, VT, Opcode, DAG);
  }

  // Check if the mask matches a DUP for a wider element
  for (unsigned LaneSize : {64U, 32U, 16U}) {
    unsigned Lane = 0;
    if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
      unsigned Opcode = LaneSize == 64   ? AArch64ISD::DUPLANE64
                        : LaneSize == 32 ? AArch64ISD::DUPLANE32
                                         : AArch64ISD::DUPLANE16;
      // Cast V1 to an integer vector with required lane size
      MVT NewEltTy = MVT::getIntegerVT(LaneSize);
      unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
      MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
      V1 = DAG.getBitcast(NewVecTy, V1);
      // Construct the DUP instruction
      V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
      // Cast back to the original type
      return DAG.getBitcast(VT, V1);
    }
  }

  if (isREVMask(ShuffleMask, VT, 64))
    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
  if (isREVMask(ShuffleMask, VT, 32))
    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
  if (isREVMask(ShuffleMask, VT, 16))
    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

  // Full vector reverse of 16-bit x 8 or 8-bit x 16: REV64 within each
  // doubleword, then EXT to swap the two doublewords.
  if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
       (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
      ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
    SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
    return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
                       DAG.getConstant(8, dl, MVT::i32));
  }

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
    if (ReverseEXT)
      std::swap(V1, V2);
    // The EXT immediate counts bytes, not elements.
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
                       DAG.getConstant(Imm, dl, MVT::i32));
  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
                       DAG.getConstant(Imm, dl, MVT::i32));
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }
  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }
  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }

  // Single-input (v, undef) canonical forms of ZIP/UZP/TRN.
  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }
  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }
  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }

  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
    return Concat;

  // Single differing lane: lower to INS (insert element extracted from the
  // other input).
  bool DstIsLeft;
  int Anomaly;
  int NumInputElements = V1.getValueType().getVectorNumElements();
  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
    SDValue DstVec = DstIsLeft ? V1 : V2;
    SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);

    SDValue SrcVec = V1;
    int SrcLane = ShuffleMask[Anomaly];
    if (SrcLane >= NumInputElements) {
      SrcVec = V2;
      SrcLane -= VT.getVectorNumElements();
    }
    SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);

    EVT ScalarVT = VT.getVectorElementType();

    // Sub-32-bit integer lanes are extracted/inserted as i32.
    if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
      ScalarVT = MVT::i32;

    return DAG.getNode(
        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
        DstLaneV);
  }

  if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
    return NewSD;

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      // 8 encodes an undef lane in the base-9 perfect-shuffle index.
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
                                  dl);
  }

  // Last resort: a fully general TBL-based shuffle.
  return GenerateTBL(Op, ShuffleMask, DAG);
}

SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerToScalableOp(Op, DAG);

  assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
         "Unexpected vector type!");

  // We can handle the constant cases during isel.
  if (isa<ConstantSDNode>(Op.getOperand(0)))
    return Op;

  // There isn't a natural way to handle the general i1 case, so we use some
  // trickery with whilelo.
  SDLoc DL(Op);
  // Sign-extend the i1 to all-zeros/all-ones in i64, so whilelo(0, SplatVal)
  // yields an all-false (SplatVal == 0) or all-true (SplatVal == -1)
  // predicate.
  SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
  SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
                         DAG.getValueType(MVT::i1));
  SDValue ID =
      DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  // nxv1i1 has no whilelo form; build nxv2i1 and take the low subvector.
  if (VT == MVT::nxv1i1)
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
                       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
                                   Zero, SplatVal),
                       Zero);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
}

SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);

  EVT VT = Op.getValueType();
  if (!isTypeLegal(VT) || !VT.isScalableVector())
    return SDValue();

  // Current lowering only supports the SVE-ACLE types.
  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
    return SDValue();

  // The DUPQ operation is independent of element type so normalise to i64s.
  SDValue Idx128 = Op.getOperand(2);

  // DUPQ can be used when idx is in range.
  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
  if (CIdx && (CIdx->getZExtValue() <= 3)) {
    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
    return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
  }

  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));

  // The ACLE says this must produce the same result as:
  //   svtbl(data, svadd_x(svptrue_b64(),
  //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
  //                       index * 2))
  SDValue One = DAG.getConstant(1, DL, MVT::i64);
  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);

  // create the vector 0,1,0,1,...
  SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);

  // create the vector idx64,idx64+1,idx64,idx64+1,...
  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);

  // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}


/// Collapse a constant-splat BUILD_VECTOR into full-width constant and undef
/// bit masks by repeating the splat value across the vector width.
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                               APInt &UndefBits) {
  EVT VT = BVN->getValueType(0);
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;

    for (unsigned i = 0; i < NumSplats; ++i) {
      CnstBits <<= SplatBitSize;
      UndefBits <<= SplatBitSize;
      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
    }

    return true;
  }

  return false;
}

// Try 64-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  // Both 64-bit halves must carry the same value for a 64-bit splat.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;

    if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
      Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);

      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}

// Try 32-bit splatted SIMD immediate.
12346 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 12347 const APInt &Bits, 12348 const SDValue *LHS = nullptr) { 12349 EVT VT = Op.getValueType(); 12350 if (VT.isFixedLengthVector() && 12351 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()) 12352 return SDValue(); 12353 12354 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 12355 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 12356 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 12357 bool isAdvSIMDModImm = false; 12358 uint64_t Shift; 12359 12360 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { 12361 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); 12362 Shift = 0; 12363 } 12364 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { 12365 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); 12366 Shift = 8; 12367 } 12368 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { 12369 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); 12370 Shift = 16; 12371 } 12372 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { 12373 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); 12374 Shift = 24; 12375 } 12376 12377 if (isAdvSIMDModImm) { 12378 SDLoc dl(Op); 12379 SDValue Mov; 12380 12381 if (LHS) 12382 Mov = DAG.getNode(NewOp, dl, MovTy, 12383 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS), 12384 DAG.getConstant(Value, dl, MVT::i32), 12385 DAG.getConstant(Shift, dl, MVT::i32)); 12386 else 12387 Mov = DAG.getNode(NewOp, dl, MovTy, 12388 DAG.getConstant(Value, dl, MVT::i32), 12389 DAG.getConstant(Shift, dl, MVT::i32)); 12390 12391 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 12392 } 12393 } 12394 12395 return SDValue(); 12396 } 12397 12398 // Try 16-bit splatted SIMD immediate. 
12399 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 12400 const APInt &Bits, 12401 const SDValue *LHS = nullptr) { 12402 EVT VT = Op.getValueType(); 12403 if (VT.isFixedLengthVector() && 12404 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()) 12405 return SDValue(); 12406 12407 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 12408 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 12409 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 12410 bool isAdvSIMDModImm = false; 12411 uint64_t Shift; 12412 12413 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { 12414 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); 12415 Shift = 0; 12416 } 12417 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { 12418 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); 12419 Shift = 8; 12420 } 12421 12422 if (isAdvSIMDModImm) { 12423 SDLoc dl(Op); 12424 SDValue Mov; 12425 12426 if (LHS) 12427 Mov = DAG.getNode(NewOp, dl, MovTy, 12428 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS), 12429 DAG.getConstant(Value, dl, MVT::i32), 12430 DAG.getConstant(Shift, dl, MVT::i32)); 12431 else 12432 Mov = DAG.getNode(NewOp, dl, MovTy, 12433 DAG.getConstant(Value, dl, MVT::i32), 12434 DAG.getConstant(Shift, dl, MVT::i32)); 12435 12436 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 12437 } 12438 } 12439 12440 return SDValue(); 12441 } 12442 12443 // Try 32-bit splatted SIMD immediate with shifted ones. 12444 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, 12445 SelectionDAG &DAG, const APInt &Bits) { 12446 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 12447 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 12448 EVT VT = Op.getValueType(); 12449 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 12450 bool isAdvSIMDModImm = false; 12451 uint64_t Shift; 12452 12453 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { 12454 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); 12455 Shift = 264; 12456 } 12457 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { 12458 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); 12459 Shift = 272; 12460 } 12461 12462 if (isAdvSIMDModImm) { 12463 SDLoc dl(Op); 12464 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 12465 DAG.getConstant(Value, dl, MVT::i32), 12466 DAG.getConstant(Shift, dl, MVT::i32)); 12467 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 12468 } 12469 } 12470 12471 return SDValue(); 12472 } 12473 12474 // Try 8-bit splatted SIMD immediate. 12475 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 12476 const APInt &Bits) { 12477 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 12478 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 12479 EVT VT = Op.getValueType(); 12480 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 12481 12482 if (AArch64_AM::isAdvSIMDModImmType9(Value)) { 12483 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); 12484 12485 SDLoc dl(Op); 12486 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 12487 DAG.getConstant(Value, dl, MVT::i32)); 12488 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 12489 } 12490 } 12491 12492 return SDValue(); 12493 } 12494 12495 // Try FP splatted SIMD immediate. 
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  // The constant must be a repeating 64-bit pattern.
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    bool isWide = (VT.getSizeInBits() == 128);
    MVT MovTy;
    bool isAdvSIMDModImm = false;

    if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
      // Type11: f32 FMOV immediate.
      Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
      MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
    }
    else if (isWide &&
             (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
      // Type12: f64 FMOV immediate — only valid for 128-bit vectors.
      Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
      MovTy = MVT::v2f64;
    }

    if (isAdvSIMDModImm) {
      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}

// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                     uint64_t &ConstVal) {
  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  if (!Bvec)
    return false;
  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
  if (!FirstElt)
    return false;
  EVT VT = Bvec->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  // Constant nodes are uniqued in the DAG, so identical constants compare
  // equal by pointer; a non-constant operand yields null and fails too.
  for (unsigned i = 1; i < NumElts; ++i)
    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
      return false;
  ConstVal = FirstElt->getZExtValue();
  return true;
}

// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  SDLoc DL(N);

  SDValue And;
  SDValue Shift;

  SDValue FirstOp = N->getOperand(0);
  unsigned FirstOpc = FirstOp.getOpcode();
  SDValue SecondOp = N->getOperand(1);
  unsigned SecondOpc = SecondOp.getOpcode();

  // Is one of the operands an AND or a BICi? The AND may have been optimised
  // to a BICi in order to use an immediate instead of a register.
  // Is the other operand an shl or lshr? This will have been turned into:
  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
  if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
    And = FirstOp;
    Shift = SecondOp;

  } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
    And = SecondOp;
    Shift = FirstOp;
  } else
    return SDValue();

  bool IsAnd = And.getOpcode() == ISD::AND;
  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;

  // Is the shift amount constant?
  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!C2node)
    return SDValue();

  uint64_t C1;
  if (IsAnd) {
    // Is the and mask vector all constant?
    if (!isAllConstantBuildVector(And.getOperand(1), C1))
      return SDValue();
  } else {
    // Reconstruct the corresponding AND immediate from the two BICi
    // immediates: BICi clears (Imm << ImmShift), so the equivalent AND mask
    // is its complement.
    ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
    ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
    assert(C1nodeImm && C1nodeShift);
    C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
  }

  // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
  // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
  // how much one can shift elements of a particular size?
  uint64_t C2 = C2node->getZExtValue();
  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
  if (C2 > ElemSizeInBits)
    return SDValue();

  // The mask must preserve exactly the bits the shifted-in value does not
  // cover: the high C2 bits for a right shift, the low C2 bits for a left
  // shift.
  APInt C1AsAPInt(ElemSizeInBits, C1);
  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
  if (C1AsAPInt != RequiredC1)
    return SDValue();

  SDValue X = And.getOperand(0);
  SDValue Y = Shift.getOperand(0);

  unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
  SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));

  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  LLVM_DEBUG(N->dump(&DAG));
  LLVM_DEBUG(dbgs() << "into: \n");
  LLVM_DEBUG(ResultSLI->dump(&DAG));

  ++NumShiftInserts;
  return ResultSLI;
}

// Lower a vector OR: first try the shift-insert (SLI/SRI) pattern, then an
// ORR-immediate when one operand is a constant build vector; otherwise keep
// the plain OR.
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerToScalableOp(Op, DAG);

  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
  if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
    return Res;

  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  if (!BVN) {
    // OR commutes, so try swapping the operands.
LHS = Op.getOperand(1);
    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
  }
  if (!BVN)
    return Op;

  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
    SDValue NewOp;

    // Try an ORR-immediate with the defined bits, in 32- then 16-bit
    // element form.
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
                                    DefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
                                    DefBits, &LHS)))
      return NewOp;

    // Undef lanes may take any value, so also try treating them as set.
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
                                    UndefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
                                    UndefBits, &LHS)))
      return NewOp;
  }

  // We can always fall back to a non-immediate OR.
  return Op;
}

// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
static SDValue NormalizeBuildVector(SDValue Op,
                                    SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();

  // Only i8/i16 element vectors need normalising.
  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
    return Op;

  SmallVector<SDValue, 16> Ops;
  for (SDValue Lane : Op->ops()) {
    // For integer vectors, type legalization would have promoted the
    // operands already. Otherwise, if Op is a floating-point splat
    // (with operands cast to integers), then the only possibilities
    // are constants and UNDEFs.
    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
      // Truncate the constant to the element width, then rebuild it as i32.
      APInt LowBits(EltTy.getSizeInBits(),
                    CstLane->getZExtValue());
      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
    } else if (Lane.getNode()->isUndef()) {
      Lane = DAG.getUNDEF(MVT::i32);
    } else {
      assert(Lane.getValueType() == MVT::i32 &&
             "Unexpected BUILD_VECTOR operand type");
    }
    Ops.push_back(Lane);
  }
  return DAG.getBuildVector(VT, dl, Ops);
}

// Try to lower a constant BUILD_VECTOR as a single AdvSIMD modified-immediate
// instruction (MOVI/MVNI/FMOV in their various element sizes), considering
// both the defined bits and the freedom afforded by undef lanes. Returns an
// empty SDValue if no immediate form matches.
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
    SDValue NewOp;
    // MOVI forms, widest element size first.
    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
      return NewOp;

    // MVNI forms: try the bitwise complement of the constant.
    DefBits = ~DefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
      return NewOp;

    // Undef lanes may take any value, so retry with them treated as set.
    DefBits = UndefBits;
    if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
        (NewOp =
tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 12740 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 12741 return NewOp; 12742 12743 DefBits = ~UndefBits; 12744 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 12745 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 12746 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 12747 return NewOp; 12748 } 12749 12750 return SDValue(); 12751 } 12752 12753 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 12754 SelectionDAG &DAG) const { 12755 EVT VT = Op.getValueType(); 12756 12757 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) { 12758 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) { 12759 SDLoc DL(Op); 12760 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); 12761 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT); 12762 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second); 12763 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps); 12764 return convertFromScalableVector(DAG, Op.getValueType(), Seq); 12765 } 12766 12767 // Revert to common legalisation for all other variants. 12768 return SDValue(); 12769 } 12770 12771 // Try to build a simple constant vector. 12772 Op = NormalizeBuildVector(Op, DAG); 12773 // Thought this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so, 12774 // abort. 12775 if (Op.getOpcode() != ISD::BUILD_VECTOR) 12776 return SDValue(); 12777 12778 // Certain vector constants, used to express things like logical NOT and 12779 // arithmetic NEG, are passed through unmodified. This allows special 12780 // patterns for these operations to match, which will lower these constants 12781 // to whatever is proven necessary. 
12782 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 12783 if (BVN->isConstant()) { 12784 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { 12785 unsigned BitSize = VT.getVectorElementType().getSizeInBits(); 12786 APInt Val(BitSize, 12787 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); 12788 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes())) 12789 return Op; 12790 } 12791 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode()) 12792 if (Const->isZero() && !Const->isNegative()) 12793 return Op; 12794 } 12795 12796 if (SDValue V = ConstantBuildVector(Op, DAG)) 12797 return V; 12798 12799 // Scan through the operands to find some interesting properties we can 12800 // exploit: 12801 // 1) If only one value is used, we can use a DUP, or 12802 // 2) if only the low element is not undef, we can just insert that, or 12803 // 3) if only one constant value is used (w/ some non-constant lanes), 12804 // we can splat the constant value into the whole vector then fill 12805 // in the non-constant lanes. 12806 // 4) FIXME: If different constant values are used, but we can intelligently 12807 // select the values we'll be overwriting for the non-constant 12808 // lanes such that we can directly materialize the vector 12809 // some other way (MOVI, e.g.), we can be sneaky. 12810 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. 
12811 SDLoc dl(Op); 12812 unsigned NumElts = VT.getVectorNumElements(); 12813 bool isOnlyLowElement = true; 12814 bool usesOnlyOneValue = true; 12815 bool usesOnlyOneConstantValue = true; 12816 bool isConstant = true; 12817 bool AllLanesExtractElt = true; 12818 unsigned NumConstantLanes = 0; 12819 unsigned NumDifferentLanes = 0; 12820 unsigned NumUndefLanes = 0; 12821 SDValue Value; 12822 SDValue ConstantValue; 12823 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap; 12824 unsigned ConsecutiveValCount = 0; 12825 SDValue PrevVal; 12826 for (unsigned i = 0; i < NumElts; ++i) { 12827 SDValue V = Op.getOperand(i); 12828 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12829 AllLanesExtractElt = false; 12830 if (V.isUndef()) { 12831 ++NumUndefLanes; 12832 continue; 12833 } 12834 if (i > 0) 12835 isOnlyLowElement = false; 12836 if (!isIntOrFPConstant(V)) 12837 isConstant = false; 12838 12839 if (isIntOrFPConstant(V)) { 12840 ++NumConstantLanes; 12841 if (!ConstantValue.getNode()) 12842 ConstantValue = V; 12843 else if (ConstantValue != V) 12844 usesOnlyOneConstantValue = false; 12845 } 12846 12847 if (!Value.getNode()) 12848 Value = V; 12849 else if (V != Value) { 12850 usesOnlyOneValue = false; 12851 ++NumDifferentLanes; 12852 } 12853 12854 if (PrevVal != V) { 12855 ConsecutiveValCount = 0; 12856 PrevVal = V; 12857 } 12858 12859 // Keep different values and its last consecutive count. 
For example, 12860 // 12861 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, 12862 // t24, t24, t24, t24, t24, t24, t24, t24 12863 // t23 = consecutive count 8 12864 // t24 = consecutive count 8 12865 // ------------------------------------------------------------------ 12866 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24, 12867 // t24, t24, t24, t24, t24, t24, t24, t24 12868 // t23 = consecutive count 5 12869 // t24 = consecutive count 9 12870 DifferentValueMap[V] = ++ConsecutiveValCount; 12871 } 12872 12873 if (!Value.getNode()) { 12874 LLVM_DEBUG( 12875 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); 12876 return DAG.getUNDEF(VT); 12877 } 12878 12879 // Convert BUILD_VECTOR where all elements but the lowest are undef into 12880 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector 12881 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. 12882 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) { 12883 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " 12884 "SCALAR_TO_VECTOR node\n"); 12885 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 12886 } 12887 12888 if (AllLanesExtractElt) { 12889 SDNode *Vector = nullptr; 12890 bool Even = false; 12891 bool Odd = false; 12892 // Check whether the extract elements match the Even pattern <0,2,4,...> or 12893 // the Odd pattern <1,3,5,...>. 12894 for (unsigned i = 0; i < NumElts; ++i) { 12895 SDValue V = Op.getOperand(i); 12896 const SDNode *N = V.getNode(); 12897 if (!isa<ConstantSDNode>(N->getOperand(1))) { 12898 Even = false; 12899 Odd = false; 12900 break; 12901 } 12902 SDValue N0 = N->getOperand(0); 12903 12904 // All elements are extracted from the same vector. 12905 if (!Vector) { 12906 Vector = N0.getNode(); 12907 // Check that the type of EXTRACT_VECTOR_ELT matches the type of 12908 // BUILD_VECTOR. 
12909 if (VT.getVectorElementType() != 12910 N0.getValueType().getVectorElementType()) 12911 break; 12912 } else if (Vector != N0.getNode()) { 12913 Odd = false; 12914 Even = false; 12915 break; 12916 } 12917 12918 // Extracted values are either at Even indices <0,2,4,...> or at Odd 12919 // indices <1,3,5,...>. 12920 uint64_t Val = N->getConstantOperandVal(1); 12921 if (Val == 2 * i) { 12922 Even = true; 12923 continue; 12924 } 12925 if (Val - 1 == 2 * i) { 12926 Odd = true; 12927 continue; 12928 } 12929 12930 // Something does not match: abort. 12931 Odd = false; 12932 Even = false; 12933 break; 12934 } 12935 if (Even || Odd) { 12936 SDValue LHS = 12937 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 12938 DAG.getConstant(0, dl, MVT::i64)); 12939 SDValue RHS = 12940 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 12941 DAG.getConstant(NumElts, dl, MVT::i64)); 12942 12943 if (Even && !Odd) 12944 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, 12945 RHS); 12946 if (Odd && !Even) 12947 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, 12948 RHS); 12949 } 12950 } 12951 12952 // Use DUP for non-constant splats. For f32 constant splats, reduce to 12953 // i32 and try again. 12954 if (usesOnlyOneValue) { 12955 if (!isConstant) { 12956 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 12957 Value.getValueType() != VT) { 12958 LLVM_DEBUG( 12959 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); 12960 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 12961 } 12962 12963 // This is actually a DUPLANExx operation, which keeps everything vectory. 
12964 12965 SDValue Lane = Value.getOperand(1); 12966 Value = Value.getOperand(0); 12967 if (Value.getValueSizeInBits() == 64) { 12968 LLVM_DEBUG( 12969 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " 12970 "widening it\n"); 12971 Value = WidenVector(Value, DAG); 12972 } 12973 12974 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 12975 return DAG.getNode(Opcode, dl, VT, Value, Lane); 12976 } 12977 12978 if (VT.getVectorElementType().isFloatingPoint()) { 12979 SmallVector<SDValue, 8> Ops; 12980 EVT EltTy = VT.getVectorElementType(); 12981 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || 12982 EltTy == MVT::f64) && "Unsupported floating-point vector type"); 12983 LLVM_DEBUG( 12984 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " 12985 "BITCASTS, and try again\n"); 12986 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 12987 for (unsigned i = 0; i < NumElts; ++i) 12988 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 12989 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 12990 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 12991 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; 12992 Val.dump();); 12993 Val = LowerBUILD_VECTOR(Val, DAG); 12994 if (Val.getNode()) 12995 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 12996 } 12997 } 12998 12999 // If we need to insert a small number of different non-constant elements and 13000 // the vector width is sufficiently large, prefer using DUP with the common 13001 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, 13002 // skip the constant lane handling below. 
13003 bool PreferDUPAndInsert = 13004 !isConstant && NumDifferentLanes >= 1 && 13005 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && 13006 NumDifferentLanes >= NumConstantLanes; 13007 13008 // If there was only one constant value used and for more than one lane, 13009 // start by splatting that value, then replace the non-constant lanes. This 13010 // is better than the default, which will perform a separate initialization 13011 // for each lane. 13012 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { 13013 // Firstly, try to materialize the splat constant. 13014 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue); 13015 unsigned BitSize = VT.getScalarSizeInBits(); 13016 APInt ConstantValueAPInt(1, 0); 13017 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue)) 13018 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize); 13019 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) && 13020 !ConstantValueAPInt.isAllOnes()) { 13021 Val = ConstantBuildVector(Val, DAG); 13022 if (!Val) 13023 // Otherwise, materialize the constant and splat it. 13024 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 13025 } 13026 13027 // Now insert the non-constant lanes. 13028 for (unsigned i = 0; i < NumElts; ++i) { 13029 SDValue V = Op.getOperand(i); 13030 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 13031 if (!isIntOrFPConstant(V)) 13032 // Note that type legalization likely mucked about with the VT of the 13033 // source operand, so we may have to convert it here before inserting. 13034 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 13035 } 13036 return Val; 13037 } 13038 13039 // This will generate a load from the constant pool. 
13040 if (isConstant) { 13041 LLVM_DEBUG( 13042 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " 13043 "expansion\n"); 13044 return SDValue(); 13045 } 13046 13047 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from 13048 // v4i32s. This is really a truncate, which we can construct out of (legal) 13049 // concats and truncate nodes. 13050 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG)) 13051 return M; 13052 13053 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 13054 if (NumElts >= 4) { 13055 if (SDValue Shuffle = ReconstructShuffle(Op, DAG)) 13056 return Shuffle; 13057 13058 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG)) 13059 return Shuffle; 13060 } 13061 13062 if (PreferDUPAndInsert) { 13063 // First, build a constant vector with the common element. 13064 SmallVector<SDValue, 8> Ops(NumElts, Value); 13065 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); 13066 // Next, insert the elements that do not match the common value. 13067 for (unsigned I = 0; I < NumElts; ++I) 13068 if (Op.getOperand(I) != Value) 13069 NewVector = 13070 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, 13071 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); 13072 13073 return NewVector; 13074 } 13075 13076 // If vector consists of two different values, try to generate two DUPs and 13077 // (CONCAT_VECTORS or VECTOR_SHUFFLE). 13078 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) { 13079 SmallVector<SDValue, 2> Vals; 13080 // Check the consecutive count of the value is the half number of vector 13081 // elements. In this case, we can use CONCAT_VECTORS. 
For example, 13082 // 13083 // canUseVECTOR_CONCAT = true; 13084 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, 13085 // t24, t24, t24, t24, t24, t24, t24, t24 13086 // 13087 // canUseVECTOR_CONCAT = false; 13088 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24, 13089 // t24, t24, t24, t24, t24, t24, t24, t24 13090 bool canUseVECTOR_CONCAT = true; 13091 for (auto Pair : DifferentValueMap) { 13092 // Check different values have same length which is NumElts / 2. 13093 if (Pair.second != NumElts / 2) 13094 canUseVECTOR_CONCAT = false; 13095 Vals.push_back(Pair.first); 13096 } 13097 13098 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and 13099 // CONCAT_VECTORs. For example, 13100 // 13101 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23, 13102 // t24, t24, t24, t24, t24, t24, t24, t24 13103 // ==> 13104 // t26: v8i8 = AArch64ISD::DUP t23 13105 // t28: v8i8 = AArch64ISD::DUP t24 13106 // t29: v16i8 = concat_vectors t26, t28 13107 if (canUseVECTOR_CONCAT) { 13108 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); 13109 if (isTypeLegal(SubVT) && SubVT.isVector() && 13110 SubVT.getVectorNumElements() >= 2) { 13111 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]); 13112 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]); 13113 SDValue DUP1 = 13114 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG); 13115 SDValue DUP2 = 13116 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG); 13117 SDValue CONCAT_VECTORS = 13118 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2); 13119 return CONCAT_VECTORS; 13120 } 13121 } 13122 13123 // Let's try to generate VECTOR_SHUFFLE. 
For example, 13124 // 13125 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26 13126 // ==> 13127 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26 13128 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25 13129 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28 13130 if (NumElts >= 8) { 13131 SmallVector<int, 16> MaskVec; 13132 // Build mask for VECTOR_SHUFLLE. 13133 SDValue FirstLaneVal = Op.getOperand(0); 13134 for (unsigned i = 0; i < NumElts; ++i) { 13135 SDValue Val = Op.getOperand(i); 13136 if (FirstLaneVal == Val) 13137 MaskVec.push_back(i); 13138 else 13139 MaskVec.push_back(i + NumElts); 13140 } 13141 13142 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]); 13143 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]); 13144 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1); 13145 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2); 13146 SDValue VECTOR_SHUFFLE = 13147 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec); 13148 return VECTOR_SHUFFLE; 13149 } 13150 } 13151 13152 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 13153 // know the default expansion would otherwise fall back on something even 13154 // worse. For a vector with one or two non-undef values, that's 13155 // scalar_to_vector for the elements followed by a shuffle (provided the 13156 // shuffle is valid for the target) and materialization element by element 13157 // on the stack followed by a load for everything else. 
  if (!isConstant && !usesOnlyOneValue) {
    LLVM_DEBUG(
        dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
                  "of INSERT_VECTOR_ELT\n");

    SDValue Vec = DAG.getUNDEF(VT);
    SDValue Op0 = Op.getOperand(0);
    unsigned i = 0;

    // Use SCALAR_TO_VECTOR for lane zero to
    // a) Avoid a RMW dependency on the full vector register, and
    // b) Allow the register coalescer to fold away the copy if the
    //    value is already in an S or D register, and we're forced to emit an
    //    INSERT_SUBREG that we can't fold anywhere.
    //
    // We also allow types like i8 and i16 which are illegal scalar but legal
    // vector element types. After type-legalization the inserted value is
    // extended (i32) and it is safe to cast them to the vector type by ignoring
    // the upper bits of the lowest lane (e.g. v8i8, v4i16).
    if (!Op0.isUndef()) {
      LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
      ++i;
    }
    LLVM_DEBUG(if (i < NumElts) dbgs()
                   << "Creating nodes for the other vector elements:\n";);
    // Insert the remaining non-undef lanes one at a time; undef lanes are
    // simply left as whatever the partially-built vector already holds.
    for (; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  LLVM_DEBUG(
      dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
                "better alternative\n");
  return SDValue();
}

/// Lower CONCAT_VECTORS. Fixed-length vectors that must use SVE are handed
/// off to the SVE path; otherwise only legal scalable result types are
/// expected here. Concats of more than two operands are reduced pairwise
/// into a tree of two-operand CONCAT_VECTORS nodes.
SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthConcatVectorsToSVE(Op, DAG);

  assert(Op.getValueType().isScalableVector() &&
         isTypeLegal(Op.getValueType()) &&
         "Expected legal scalable vector type!");

  if (isTypeLegal(Op.getOperand(0).getValueType())) {
    unsigned NumOperands = Op->getNumOperands();
    assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
           "Unexpected number of operands in CONCAT_VECTORS");

    // A two-operand concat of legal subvector types is already in its
    // final form.
    if (NumOperands == 2)
      return Op;

    // Concat each pair of subvectors and pack into the lower half of the array.
    SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
    while (ConcatOps.size() > 1) {
      for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
        SDValue V1 = ConcatOps[I];
        SDValue V2 = ConcatOps[I + 1];
        EVT SubVT = V1.getValueType();
        EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
        ConcatOps[I / 2] =
            DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
      }
      ConcatOps.resize(ConcatOps.size() / 2);
    }
    return ConcatOps[0];
  }

  return SDValue();
}

/// Lower INSERT_VECTOR_ELT. Predicate (i1-element) vectors are promoted to an
/// integer vector, the insert performed there, and the result truncated back.
/// Inserts with a constant, in-range lane index are otherwise legal as-is.
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthInsertVectorElt(Op, DAG);

  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDLoc DL(Op);
    SDValue ExtendedVector =
        DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
    // Scalar operands narrower than 32 bits are extended to i32; wider ones
    // use the promoted vector's own scalar type.
    SDValue ExtendedValue =
        DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
                             VectorVT.getScalarType().getSizeInBits() < 32
                                 ? MVT::i32
                                 : VectorVT.getScalarType());
    ExtendedVector =
        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
                    ExtendedValue, Op.getOperand(2));
    return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
  }

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  return Op;
}

/// Lower EXTRACT_VECTOR_ELT. Predicate vectors are any-extended first; V64
/// source vectors are widened to V128 and the extract performed on the wide
/// value.
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    // We can't directly extract from an SVE predicate; extend it first.
    // (This isn't the only possible lowering, but it's straightforward.)
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDLoc DL(Op);
    SDValue Extend =
        DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
    MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
                                  Extend, Op.getOperand(1));
    return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
  }

  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return LowerFixedLengthExtractVectorElt(Op, DAG);

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16 || VT == MVT::v8bf16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
      VT != MVT::v4bf16)
    return SDValue();

  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and perform the extraction on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  // i8/i16 element extracts produce an i32 result; wider elements keep their
  // own type.
  EVT ExtrTy = WideTy.getVectorElementType();
  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
    ExtrTy = MVT::i32;

  // For extractions, we just return the result directly.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
                     Op.getOperand(1));
}

/// Lower EXTRACT_SUBVECTOR for fixed-length result types, from either a
/// scalable or a fixed-length source vector.
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getValueType().isFixedLengthVector() &&
         "Only cases that extract a fixed length vector are supported!");

  EVT InVT = Op.getOperand(0).getValueType();
  unsigned Idx = Op.getConstantOperandVal(1);
  unsigned Size = Op.getValueSizeInBits();

  // If we don't have legal types yet, do nothing
  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  if (InVT.isScalableVector()) {
    // This will be matched by custom code during ISelDAGToDAG.
    if (Idx == 0 && isPackedVectorType(InVT, DAG))
      return Op;

    return SDValue();
  }

  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
  if (Idx == 0 && InVT.getSizeInBits() <= 128)
    return Op;

  // If this is extracting the upper 64-bits of a 128-bit vector, we match
  // that directly.
  if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
      InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
    return Op;

  if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
    SDLoc DL(Op);

    // Rotate the source with VECTOR_SPLICE so the requested subvector lands
    // at element 0, then convert back to the fixed-length result type.
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
    SDValue NewInVec =
        convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));

    SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
                                 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
    return convertFromScalableVector(DAG, Op.getValueType(), Splice);
  }

  return SDValue();
}

/// Lower INSERT_SUBVECTOR into a scalable vector. Handles predicate vectors
/// via half extraction/reinsertion + UZP1, scalable half-sized subvectors via
/// unpack/UZP1, and fixed-length subvectors inserted at index 0 via a
/// predicated VSELECT.
SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(Op.getValueType().isScalableVector() &&
         "Only expect to lower inserts into scalable vectors!");

  EVT InVT = Op.getOperand(1).getValueType();
  unsigned Idx = Op.getConstantOperandVal(2);

  SDValue Vec0 = Op.getOperand(0);
  SDValue Vec1 = Op.getOperand(1);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (InVT.isScalableVector()) {
    if (!isTypeLegal(VT))
      return SDValue();

    // Break down insert_subvector into simpler parts.
    if (VT.getVectorElementType() == MVT::i1) {
      unsigned NumElts = VT.getVectorMinNumElements();
      EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());

      // Split the destination predicate into halves, insert into the half
      // the index addresses, and interleave the halves back together.
      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
                       DAG.getVectorIdxConstant(0, DL));
      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
                       DAG.getVectorIdxConstant(NumElts / 2, DL));
      if (Idx < (NumElts / 2)) {
        SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
                                    DAG.getVectorIdxConstant(Idx, DL));
        return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
      } else {
        SDValue NewHi =
            DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
                        DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
        return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
      }
    }

    // Ensure the subvector is half the size of the main vector.
    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
      return SDValue();

    // Here narrow and wide refers to the vector element types. After "casting"
    // both vectors must have the same bit length and so because the subvector
    // has fewer elements, those elements need to be bigger.
    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());

    // NOP cast operands to the largest legal vector of the same element count.
    if (VT.isFloatingPoint()) {
      Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
      Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
    } else {
      // Legal integer vectors are already their largest so Vec0 is fine as is.
      Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
    }

    // To replace the top/bottom half of vector V with vector SubV we widen the
    // preserved half of V, concatenate this to SubV (the order depending on the
    // half being replaced) and then narrow the result.
    SDValue Narrow;
    if (Idx == 0) {
      SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
    } else {
      assert(Idx == InVT.getVectorMinNumElements() &&
             "Invalid subvector index!");
      SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
    }

    return getSVESafeBitCast(VT, Narrow, DAG);
  }

  if (Idx == 0 && isPackedVectorType(VT, DAG)) {
    // This will be matched by custom code during ISelDAGToDAG.
    if (Vec0.isUndef())
      return Op;

    // Select the inserted fixed-length lanes from Vec1 and the remaining
    // lanes from Vec0 using a predicate covering exactly the inserted
    // elements.
    std::optional<unsigned> PredPattern =
        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
    auto PredTy = VT.changeVectorElementType(MVT::i1);
    SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
    SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
    return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
  }

  return SDValue();
}

/// Return true if Op is a splat (DUP/SPLAT_VECTOR/constant BUILD_VECTOR) of a
/// power-of-two value, or of the negation of one. On success SplatVal holds
/// the (possibly negated-back) power of two and Negated records whether the
/// original value was negative.
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
  if (Op.getOpcode() != AArch64ISD::DUP &&
      Op.getOpcode() != ISD::SPLAT_VECTOR &&
      Op.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  if (Op.getOpcode() == ISD::BUILD_VECTOR &&
      !isAllConstantBuildVector(Op, SplatVal))
    return false;

  if (Op.getOpcode() != ISD::BUILD_VECTOR &&
      !isa<ConstantSDNode>(Op->getOperand(0)))
    return false;

  SplatVal = Op->getConstantOperandVal(0);
  // Sign-extend from 32 bits for sub-i64 element types so negative splats are
  // represented correctly in the 64-bit SplatVal.
  if (Op.getValueType().getVectorElementType() != MVT::i64)
    SplatVal = (int32_t)SplatVal;

  Negated = false;
  if (isPowerOf2_64(SplatVal))
    return true;

  Negated = true;
  if (isPowerOf2_64(-SplatVal)) {
    SplatVal = -SplatVal;
    return true;
  }

  return false;
}

/// Lower vector SDIV/UDIV. Signed division by a power-of-two splat becomes a
/// predicated arithmetic shift (negated afterwards if the divisor was
/// negative); i8/i16 element types are widened to 32-bit, divided, and
/// narrowed back with UZP1.
SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
    return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);

  assert(VT.isScalableVector() && "Expected a scalable vector.");

  bool Signed = Op.getOpcode() == ISD::SDIV;
  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

  bool Negated;
  uint64_t SplatVal;
  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
    SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
    SDValue Res =
        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
                    DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
    if (Negated)
      Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);

    return Res;
  }

  if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
    return LowerToPredicatedOp(Op, DAG, PredOpcode);

  // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
  // operations, and truncate the result.
  EVT WidenedVT;
  if (VT == MVT::nxv16i8)
    WidenedVT = MVT::nxv8i16;
  else if (VT == MVT::nxv8i16)
    WidenedVT = MVT::nxv4i32;
  else
    llvm_unreachable("Unexpected Custom DIV operation");

  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
  return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
}

/// Return true if a VECTOR_SHUFFLE with mask M on type VT can be lowered to a
/// single native instruction (splat, REV, EXT, TRN/UZP/ZIP, INS, concat, or a
/// cheap perfect shuffle).
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  // Currently no fixed length shuffles that require SVE are legal.
  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
    return false;

  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned Cost = getPerfectShuffleCost(M);
    if (Cost <= 1)
      return true;
  }

  bool DummyBool;
  int DummyInt;
  unsigned DummyUnsigned;

  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
          isZIPMask(M, VT, DummyUnsigned) ||
          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
          isConcatMask(M, VT, VT.getSizeInBits() == 128));
}

bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
                                                   EVT VT) const {
  // Just delegate to the generic legality, clear masks aren't special.
  return isShuffleMaskLegal(M, VT);
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits / 2 for a narrowing right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}

/// Lower TRUNCATE. i1 results become `(x & 1) != 0`; fixed-length vector
/// truncates that need SVE are forwarded to the SVE path; everything else is
/// left for the default expansion.
SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT.getScalarType() == MVT::i1) {
    // Lower i1 truncate to `(x & 1) != 0`.
    SDLoc dl(Op);
    EVT OpVT = Op.getOperand(0).getValueType();
    SDValue Zero = DAG.getConstant(0, dl, OpVT);
    SDValue One = DAG.getConstant(1, dl, OpVT);
    SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
    return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
  }

  if (!VT.isVector() || VT.isScalableVector())
    return SDValue();

  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthVectorTruncateToSVE(Op, DAG);

  return SDValue();
}

/// Lower vector SHL/SRA/SRL with a vector shift amount. Constant splat
/// amounts become immediate-form shift nodes; variable amounts use the NEON
/// (u|s)shl intrinsics, with right shifts expressed as left shifts by a
/// negated amount.
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;
  unsigned EltSize = VT.getScalarSizeInBits();

  switch (Op.getOpcode()) {
  case ISD::SHL:
    if (VT.isScalableVector() ||
        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);

    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
                                       MVT::i32),
                       Op.getOperand(0), Op.getOperand(1));
  case ISD::SRA:
  case ISD::SRL:
    if (VT.isScalableVector() ||
        useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
      unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
                                                : AArch64ISD::SRL_PRED;
      return LowerToPredicatedOp(Op, DAG, Opc);
    }

    // Right shift immediate
    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
      unsigned Opc =
          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    }

    // Right shift register.  Note, there is not a shift right register
    // instruction, but the shift left register instruction takes a signed
    // value, where negative numbers specify a right shift.
    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // negate the shift amount
    SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
                    NegShift);
    return NegShiftLeft;
  }

  llvm_unreachable("unexpected shift opcode");
}

/// Emit a single NEON vector compare for condition CC, using the
/// compare-against-zero forms when RHS is a zero splat and otherwise the
/// two-register forms (swapping operands for the "less" conditions). Returns
/// SDValue() for conditions that cannot be emitted as one compare (e.g. FP
/// LE/LT when NaNs matter).
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    const SDLoc &dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  APInt SplatValue;
  APInt SplatUndef;
  unsigned SplatBitSize = 0;
  bool HasAnyUndefs;

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
                                            SplatBitSize, HasAnyUndefs);

  bool IsZero = IsCnst && SplatValue == 0;
  bool IsOne =
      IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
  bool IsMinusOne = IsCnst && SplatValue.isAllOnes();

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      // No direct "not equal" compare; emit FCMEQ and invert.
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNOT(dl, Fcmeq, VT);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LE:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the LS implementation.
      [[fallthrough]];
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
      [[fallthrough]];
    case AArch64CC::MI:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
    }
  }

  switch (CC) {
  default:
    return SDValue();
  case AArch64CC::NE: {
    // No direct "not equal" compare; emit CMEQ and invert.
    SDValue Cmeq;
    if (IsZero)
      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    else
      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
    return DAG.getNOT(dl, Cmeq, VT);
  }
  case AArch64CC::EQ:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
  case AArch64CC::GE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
  case AArch64CC::GT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
    if (IsMinusOne)
      // x > -1  <=>  x >= 0.
      // NOTE(review): CMGEz is the unary compare-against-zero form elsewhere
      // in this function, yet RHS is passed as a second operand here — looks
      // like a stray operand; verify against the node definition.
      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
  case AArch64CC::LE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
  case AArch64CC::LS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
  case AArch64CC::LO:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
  case AArch64CC::LT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
    if (IsOne)
      // x < 1  <=>  x <= 0.
      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
  case AArch64CC::HI:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
  case AArch64CC::HS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
  }
}

/// Lower vector SETCC. Scalable and SVE-bound fixed-length compares go to the
/// predicated path; NEON compares are built from one or two
/// EmitVectorComparison nodes per changeVectorFPCCToAArch64CC.
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isScalableVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);

  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
                                   !Subtarget->isNeonAvailable()))
    return LowerFixedLengthVectorSetccToSVE(Op, DAG);

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  SDLoc dl(Op);

  if (LHS.getValueType().getVectorElementType().isInteger()) {
    assert(LHS.getValueType() == RHS.getValueType());
    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
    SDValue Cmp =
        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
  }

  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  // Make v4f16 (only) fcmp operations utilise vector instructions
  // v8f16 support will be a litle more complicated
  if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
    if (LHS.getValueType().getVectorNumElements() == 4) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
      // Build a v4i16 setcc over the extended operands and redirect all users
      // of the original node to it; the compare below is then done at v4i32.
      SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
      DAG.ReplaceAllUsesWith(Op, NewSetcc);
      CmpVT = MVT::v4i32;
    } else
      return SDValue();
  }

  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
         LHS.getValueType().getVectorElementType() != MVT::f128);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean.  Some of them require two branches to implement.
  AArch64CC::CondCode CC1, CC2;
  bool ShouldInvert;
  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
  SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
  if (!Cmp.getNode())
    return SDValue();

  if (CC2 != AArch64CC::AL) {
    // Two-compare conditions: OR the results of both compares together.
    SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
    if (!Cmp2.getNode())
      return SDValue();

    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
  }

  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

  if (ShouldInvert)
    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());

  return Cmp;
}

/// Emit a NEON across-vector reduction node Op over ScalarOp's vector operand
/// and extract lane 0 of the result as the scalar value.
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
                                  SelectionDAG &DAG) {
  SDValue VecOp = ScalarOp.getOperand(0);
  auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
                     DAG.getConstant(0, DL, MVT::i64));
}

/// Lower VECREDUCE_AND/OR/XOR of a fixed-length vector. i1 vectors are
/// extended and mapped to umin/umax/add reductions; other element types are
/// split down to 64 bits, bitcast to a scalar, and reduced with shift+op
/// pairs.
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
                                      SDLoc DL, SelectionDAG &DAG) {
  unsigned ScalarOpcode;
  switch (Opcode) {
  case ISD::VECREDUCE_AND:
    ScalarOpcode = ISD::AND;
    break;
  case ISD::VECREDUCE_OR:
    ScalarOpcode = ISD::OR;
    break;
  case ISD::VECREDUCE_XOR:
    ScalarOpcode = ISD::XOR;
    break;
  default:
    llvm_unreachable("Expected bitwise vector reduction");
    return SDValue();
  }

  EVT VecVT = Vec.getValueType();
  assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
         "Expected power-of-2 length vector");

  EVT ElemVT = VecVT.getVectorElementType();

  SDValue Result;
  unsigned NumElems = VecVT.getVectorNumElements();

  // Special case for boolean reductions
  if (ElemVT == MVT::i1) {
    // Split large vectors into smaller ones
    if (NumElems > 16) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      EVT HalfVT = Lo.getValueType();
      SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
      return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
    }

    // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
    // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
    // this element size leads to the best codegen, since e.g. setcc results
    // might need to be truncated otherwise.
    EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));

    // any_ext doesn't work with umin/umax, so only use it for uadd.
    unsigned ExtendOp =
        ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
    SDValue Extended = DAG.getNode(
        ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
    // AND of sign-extended bools is the minimum lane, OR the maximum, and
    // XOR the parity of the add (only the low bit is kept below).
    switch (ScalarOpcode) {
    case ISD::AND:
      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
      break;
    case ISD::OR:
      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
      break;
    case ISD::XOR:
      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
      break;
    default:
      llvm_unreachable("Unexpected Opcode");
    }

    Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
  } else {
    // Iteratively split the vector in half and combine using the bitwise
    // operation until it fits in a 64 bit register.
    while (VecVT.getSizeInBits() > 64) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
      VecVT = Lo.getValueType();
      NumElems = VecVT.getVectorNumElements();
      Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
    }

    EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());

    // Do the remaining work on a scalar since it allows the code generator to
    // combine the shift and bitwise operation into one instruction and since
    // integer instructions can have higher throughput than vector instructions.
    SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);

    // Iteratively combine the lower and upper halves of the scalar using the
    // bitwise operation, halving the relevant region of the scalar in each
    // iteration, until the relevant region is just one element of the original
    // vector.
    for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
      SDValue ShiftAmount =
          DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
      SDValue Shifted =
          DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
      Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
    }

    Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
  }

  return DAG.getAnyExtOrTrunc(Result, DL, VT);
}

/// Lower VECREDUCE_*. Reductions that must (or should) use SVE are dispatched
/// to the predicated reduction nodes; the rest use NEON across-vector
/// reductions or the bitwise-reduce helper above.
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);

  // Try to lower fixed length reductions to SVE.
  EVT SrcVT = Src.getValueType();
  bool OverrideNEON = !Subtarget->isNeonAvailable() ||
                      Op.getOpcode() == ISD::VECREDUCE_AND ||
                      Op.getOpcode() == ISD::VECREDUCE_OR ||
                      Op.getOpcode() == ISD::VECREDUCE_XOR ||
                      Op.getOpcode() == ISD::VECREDUCE_FADD ||
                      (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                       SrcVT.getVectorElementType() == MVT::i64);
  if (SrcVT.isScalableVector() ||
      useSVEForFixedLengthVectorVT(
          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {

    if (SrcVT.getVectorElementType() == MVT::i1)
      return LowerPredReductionToSVE(Op, DAG);

    switch (Op.getOpcode()) {
    case ISD::VECREDUCE_ADD:
      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
    case ISD::VECREDUCE_AND:
      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
    case ISD::VECREDUCE_OR:
      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
    case ISD::VECREDUCE_SMAX:
      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
    case ISD::VECREDUCE_SMIN:
      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
    case ISD::VECREDUCE_UMAX:
      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
    case ISD::VECREDUCE_UMIN:
      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
    case ISD::VECREDUCE_XOR:
      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
    case ISD::VECREDUCE_FADD:
      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMAX:
      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMIN:
      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMAXIMUM:
      return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMINIMUM:
      return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
    default:
      llvm_unreachable("Unhandled fixed length reduction");
    }
  }

  // Lower NEON reductions.
  SDLoc dl(Op);
  switch (Op.getOpcode()) {
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
    return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
                                  Op.getValueType(), dl, DAG);
  case ISD::VECREDUCE_ADD:
    return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
  case ISD::VECREDUCE_SMAX:
    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
  case ISD::VECREDUCE_SMIN:
    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
  case ISD::VECREDUCE_UMAX:
    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
  case ISD::VECREDUCE_UMIN:
    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
  default:
    llvm_unreachable("Unhandled reduction");
  }
}

SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                    SelectionDAG &DAG) const {
  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  // No point replacing if we don't have the relevant instruction/libcall anyway
  if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
    return SDValue();

  // LSE has an atomic load-clear instruction,
but not a load-and. 14073 SDLoc dl(Op); 14074 MVT VT = Op.getSimpleValueType(); 14075 assert(VT != MVT::i128 && "Handled elsewhere, code replicated."); 14076 SDValue RHS = Op.getOperand(2); 14077 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 14078 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); 14079 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), 14080 Op.getOperand(0), Op.getOperand(1), RHS, 14081 AN->getMemOperand()); 14082 } 14083 14084 SDValue 14085 AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, 14086 SelectionDAG &DAG) const { 14087 14088 SDLoc dl(Op); 14089 // Get the inputs. 14090 SDNode *Node = Op.getNode(); 14091 SDValue Chain = Op.getOperand(0); 14092 SDValue Size = Op.getOperand(1); 14093 MaybeAlign Align = 14094 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 14095 EVT VT = Node->getValueType(0); 14096 14097 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 14098 "no-stack-arg-probe")) { 14099 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 14100 Chain = SP.getValue(1); 14101 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 14102 if (Align) 14103 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 14104 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 14105 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 14106 SDValue Ops[2] = {SP, Chain}; 14107 return DAG.getMergeValues(Ops, dl); 14108 } 14109 14110 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 14111 14112 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 14113 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), 14114 PtrVT, 0); 14115 14116 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 14117 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); 14118 if (Subtarget->hasCustomCallingConv()) 14119 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 14120 14121 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, 14122 
DAG.getConstant(4, dl, MVT::i64)); 14123 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); 14124 Chain = 14125 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), 14126 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), 14127 DAG.getRegisterMask(Mask), Chain.getValue(1)); 14128 // To match the actual intent better, we should read the output from X15 here 14129 // again (instead of potentially spilling it to the stack), but rereading Size 14130 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined 14131 // here. 14132 14133 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, 14134 DAG.getConstant(4, dl, MVT::i64)); 14135 14136 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 14137 Chain = SP.getValue(1); 14138 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 14139 if (Align) 14140 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 14141 DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); 14142 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 14143 14144 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); 14145 14146 SDValue Ops[2] = {SP, Chain}; 14147 return DAG.getMergeValues(Ops, dl); 14148 } 14149 14150 SDValue 14151 AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, 14152 SelectionDAG &DAG) const { 14153 // Get the inputs. 14154 SDNode *Node = Op.getNode(); 14155 SDValue Chain = Op.getOperand(0); 14156 SDValue Size = Op.getOperand(1); 14157 14158 MaybeAlign Align = 14159 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 14160 SDLoc dl(Op); 14161 EVT VT = Node->getValueType(0); 14162 14163 // Construct the new SP value in a GPR. 
  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
  Chain = SP.getValue(1);
  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
  if (Align)
    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));

  // Set the real SP to the new value with a probing loop.
  Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
  SDValue Ops[2] = {SP, Chain};
  return DAG.getMergeValues(Ops, dl);
}

// Dispatch DYNAMIC_STACKALLOC lowering: Windows targets use the chkstk-style
// probe call, functions with inline stack probes use PROBED_ALLOCA, and
// everything else falls back to the default expansion.
SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  if (Subtarget->isTargetWindows())
    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
  else if (hasInlineStackProbe(MF))
    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
  else
    return SDValue();
}

// When x and y are extended, lower:
//   avgfloor(x, y) -> (x + y) >> 1
//   avgceil(x, y)  -> (x + y + 1) >> 1
//
// Otherwise, lower to:
//   avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
//   avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x || y) & 1)
SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
                                        unsigned NewOp) const {
  if (Subtarget->hasSVE2())
    return LowerToPredicatedOp(Op, DAG, NewOp);

  SDLoc dl(Op);
  SDValue OpA = Op->getOperand(0);
  SDValue OpB = Op->getOperand(1);
  EVT VT = Op.getValueType();
  bool IsCeil =
      (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
  bool IsSigned =
      (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
  unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;

  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

  // Top bit known zero => value fits after zero-extension.
  auto IsZeroExtended = [&DAG](SDValue &Node) {
    KnownBits Known = DAG.computeKnownBits(Node, 0);
    return Known.Zero.isSignBitSet();
  };

  // More than one known sign bit => value fits after sign-extension.
  auto IsSignExtended = [&DAG](SDValue &Node) {
    return (DAG.ComputeNumSignBits(Node, 0) > 1);
  };

  SDValue ConstantOne = DAG.getConstant(1, dl, VT);
  if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
      (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
    // Extended operands: the simple (x + y [+ 1]) >> 1 form cannot overflow.
    SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
    if (IsCeil)
      Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
    return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
  }

  SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
  SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);

  // Carry term: (x & y & 1) for floor, ((x | y) & 1) for ceil.
  SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
  tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
  return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
}

SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT != MVT::i64 && "Expected illegal VSCALE node");

  SDLoc DL(Op);
  APInt MulImm = Op.getConstantOperandAPInt(0);
  return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
                            VT);
}

/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
template <unsigned NumVecs>
static bool
setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
  Info.opc = ISD::INTRINSIC_VOID;
  // Retrieve EC from first vector argument.
  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
  ElementCount EC = VT.getVectorElementCount();
#ifndef NDEBUG
  // Check the assumption that all input vectors are the same type.
  for (unsigned I = 0; I < NumVecs; ++I)
    assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
           "Invalid type.");
#endif
  // memVT is `NumVecs * VT`.
  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
                                EC * NumVecs);
  // The pointer is always the last argument of the st<N> intrinsic.
  Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
  Info.offset = 0;
  Info.align.reset();
  Info.flags = MachineMemOperand::MOStore;
  return true;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                               const CallInst &I,
                                               MachineFunction &MF,
                                               unsigned Intrinsic) const {
  auto &DL = I.getModule()->getDataLayout();
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_st2:
    return setInfoSVEStN<2>(*this, DL, Info, I);
  case Intrinsic::aarch64_sve_st3:
    return setInfoSVEStN<3>(*this, DL, Info, I);
  case Intrinsic::aarch64_sve_st4:
    return setInfoSVEStN<4>(*this, DL, Info, I);
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_ld1x2:
  case Intrinsic::aarch64_neon_ld1x3:
  case Intrinsic::aarch64_neon_ld1x4: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Describe the total footprint as N x i64 (total bits / 64).
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::aarch64_neon_ld2lane:
  case Intrinsic::aarch64_neon_ld3lane:
  case Intrinsic::aarch64_neon_ld4lane:
  case Intrinsic::aarch64_neon_ld2r:
  case Intrinsic::aarch64_neon_ld3r:
  case Intrinsic::aarch64_neon_ld4r: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // ldx return struct with the same vec type
    Type *RetTy = I.getType();
    auto *StructTy = cast<StructType>(RetTy);
    unsigned NumElts = StructTy->getNumElements();
    Type *VecTy = StructTy->getElementType(0);
    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
  case Intrinsic::aarch64_neon_st1x2:
  case Intrinsic::aarch64_neon_st1x3:
  case Intrinsic::aarch64_neon_st1x4: {
    Info.opc = ISD::INTRINSIC_VOID;
    unsigned NumElts = 0;
    // Sum the sizes of the leading vector arguments; the first non-vector
    // argument terminates the scan.
    for (const Value *Arg : I.args()) {
      Type *ArgTy = Arg->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::aarch64_neon_st2lane:
  case Intrinsic::aarch64_neon_st3lane:
  case Intrinsic::aarch64_neon_st4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    unsigned NumElts = 0;
    // all the vector type is same
    Type *VecTy = I.getArgOperand(0)->getType();
    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();

    // One element is stored per vector argument (lane store).
    for (const Value *Arg : I.args()) {
      Type *ArgTy = Arg->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += 1;
    }

    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    Type *ValTy = I.getParamElementType(0);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    Type *ValTy = I.getParamElementType(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::aarch64_ldaxp:
  case Intrinsic::aarch64_ldxp:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::aarch64_stlxp:
  case Intrinsic::aarch64_stxp:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::aarch64_sve_ldnt1: {
    Type *ElTy = cast<VectorType>(I.getType())->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(I.getType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ElTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
    return true;
  }
  case Intrinsic::aarch64_sve_stnt1: {
    Type *ElTy =
        cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ElTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    Value *Dst = I.getArgOperand(0);
    Value *Val = I.getArgOperand(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Val->getType());
    Info.ptrVal = Dst;
    Info.offset = 0;
    Info.align = I.getParamAlign(0).valueOrOne();
    Info.flags = MachineMemOperand::MOStore;
    // The size of the memory being operated on is unknown at this point
    Info.size = MemoryLocation::UnknownSize;
    return true;
  }
  default:
    break;
  }

  return false;
}

bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                  ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
    return false;

  // If we're reducing the load width in order to avoid having to use an extra
  // instruction to do extension then it's probably a good idea.
  if (ExtTy != ISD::NON_EXTLOAD)
    return true;
  // Don't reduce load width if it would prevent us from combining a shift into
  // the offset.
  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
  assert(Mem);
  const SDValue &Base = Mem->getBasePtr();
  if (Base.getOpcode() == ISD::ADD &&
      Base.getOperand(1).getOpcode() == ISD::SHL &&
      Base.getOperand(1).hasOneUse() &&
      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
    // It's unknown whether a scalable vector has a power-of-2 bitwidth.
    if (Mem->getMemoryVT().isScalableVector())
      return false;
    // The shift can be combined if it matches the size of the value being
    // loaded (and so reducing the width would make it not match).
    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
    if (ShiftAmount == Log2_32(LoadBytes))
      return false;
  }
  // We have no reason to disallow reducing the load width, so allow it.
  return true;
}

// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
  EVT VT = Extend.getValueType();
  if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
    SDValue Extract = Extend.getOperand(0);
    // Look through an intervening any_extend of the extracted element.
    if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
      Extract = Extract.getOperand(0);
    if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
      EVT VecVT = Extract.getOperand(0).getValueType();
      if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
        return false;
    }
  }
  return true;
}

// Truncations from 64-bit GPR to 32-bit GPR is free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
  uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
  return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  uint64_t NumBits1 = VT1.getFixedSizeInBits();
  uint64_t NumBits2 = VT2.getFixedSizeInBits();
  return NumBits1 > NumBits2;
}

/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and it's user can form a FMA instruction
/// because we prefer FMSUB/FMADD.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
  if (I->getOpcode() != Instruction::FMul)
    return true;

  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();

  if (!(User->getOpcode() == Instruction::FSub ||
        User->getOpcode() == Instruction::FAdd))
    return true;

  const TargetOptions &Options = getTargetMachine().Options;
  const Function *F = I->getFunction();
  const DataLayout &DL = F->getParent()->getDataLayout();
  Type *Ty = User->getOperand(0)->getType();

  // Not profitable exactly when an FMA could be formed and fusion is allowed.
  return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
            Options.UnsafeFPMath));
}

// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2)) {
    return true;
  }

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}

bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
  if (isa<FPExtInst>(Ext))
    return false;

  // Vector types are not free.
  if (Ext->getType()->isVectorTy())
    return false;

  // The extension is free only if EVERY use can fold it; any use that cannot
  // makes it non-free.
  for (const Use &U : Ext->uses()) {
    // The extension is free if we can fold it with a left shift in an
    // addressing mode or an arithmetic operation: add, sub, and cmp.

    // Is there a shift?
    const Instruction *Instr = cast<Instruction>(U.getUser());

    // Is this a constant shift?
    switch (Instr->getOpcode()) {
    case Instruction::Shl:
      if (!isa<ConstantInt>(Instr->getOperand(1)))
        return false;
      break;
    case Instruction::GetElementPtr: {
      gep_type_iterator GTI = gep_type_begin(Instr);
      auto &DL = Ext->getModule()->getDataLayout();
      std::advance(GTI, U.getOperandNo()-1);
      Type *IdxTy = GTI.getIndexedType();
      // This extension will end up with a shift because of the scaling factor.
      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
      // Get the shift amount based on the scaling factor:
      // log2(sizeof(IdxTy)) - log2(8).
      if (IdxTy->isScalableTy())
        return false;
      uint64_t ShiftAmt =
          llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
          3;
      // Is the constant foldable in the shift of the addressing mode?
      // I.e., shift amount is between 1 and 4 inclusive.
      if (ShiftAmt == 0 || ShiftAmt > 4)
        return false;
      break;
    }
    case Instruction::Trunc:
      // Check if this is a noop.
      // trunc(sext ty1 to ty2) to ty1.
      if (Instr->getType() == Ext->getOperand(0)->getType())
        continue;
      [[fallthrough]];
    default:
      return false;
    }

    // At this point we can use the bfm family, so this extension is free
    // for that use.
  }
  return true;
}

// Return true if V is a shufflevector whose mask selects the same element for
// every lane (a splat).
static bool isSplatShuffle(Value *V) {
  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
    return all_equal(Shuf->getShuffleMask());
  return false;
}

/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
                                     bool AllowSplat = false) {
  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
    auto *FullTy = FullV->getType();
    auto *HalfTy = HalfV->getType();
    return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
           2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
  };

  auto extractHalf = [](Value *FullV, Value *HalfV) {
    auto *FullVT = cast<FixedVectorType>(FullV->getType());
    auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  };

  ArrayRef<int> M1, M2;
  Value *S1Op1 = nullptr, *S2Op1 = nullptr;
  if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
      !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
    return false;

  // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
  // it is not checked as an extract below.
  if (AllowSplat && isSplatShuffle(Op1))
    S1Op1 = nullptr;
  if (AllowSplat && isSplatShuffle(Op2))
    S2Op1 = nullptr;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
      (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int M1Start = 0;
  int M2Start = 0;
  int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
  if ((S1Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
      (S2Op1 &&
       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
    return false;

  if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
      (M2Start != 0 && M2Start != (NumElements / 2)))
    return false;
  // Both extracts (when both are real extracts) must take the same half.
  if (S1Op1 && S2Op1 && M1Start != M2Start)
    return false;

  return true;
}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}

/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {
  Value *VectorOperand = nullptr;
  ConstantInt *ElementIndex = nullptr;
  // Must be an extract of lane 1 of a 2-element fixed vector.
  return match(Op, m_ExtractElt(m_Value(VectorOperand),
                                m_ConstantInt(ElementIndex))) &&
         ElementIndex->getValue() == 1 &&
         isa<FixedVectorType>(VectorOperand->getType()) &&
         cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
}

/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}

// Decide whether the pointer operand of a masked gather/scatter — a
// scalar_base + vector_offsets GEP — should be sunk to the user's block.
// Also queues the offsets' extend for sinking when that would allow 32-bit
// offset vectors.
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
  // Restrict ourselves to the form CodeGenPrepare typically constructs.
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
  if (!GEP || GEP->getNumOperands() != 2)
    return false;

  Value *Base = GEP->getOperand(0);
  Value *Offsets = GEP->getOperand(1);

  // We only care about scalar_base+vector_offsets.
  if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
    return false;

  // Sink extends that would allow us to use 32-bit offset vectors.
  if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
    auto *OffsetsInst = cast<Instruction>(Offsets);
    if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
        OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
      Ops.push_back(&GEP->getOperandUse(1));
  }

  // Sink the GEP.
  return true;
}

/// We want to sink following cases:
/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
  if (match(Op, m_VScale()))
    return true;
  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
    return true;
  }
  return false;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::aarch64_neon_smull:
    case Intrinsic::aarch64_neon_umull:
      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
                                   /*AllowSplat=*/true)) {
        Ops.push_back(&II->getOperandUse(0));
        Ops.push_back(&II->getOperandUse(1));
        return true;
      }
      [[fallthrough]];

    case Intrinsic::fma:
      if (isa<VectorType>(I->getType()) &&
          cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
          !Subtarget->hasFullFP16())
        return false;
      [[fallthrough]];
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_sqdmulh:
    case Intrinsic::aarch64_neon_sqrdmulh:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(0)))
        Ops.push_back(&II->getOperandUse(0));
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      return !Ops.empty();
    case Intrinsic::aarch64_neon_fmlal:
    case Intrinsic::aarch64_neon_fmlal2:
    case Intrinsic::aarch64_neon_fmlsl:
    case Intrinsic::aarch64_neon_fmlsl2:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      if (isSplatShuffle(II->getOperand(2)))
        Ops.push_back(&II->getOperandUse(2));
      return !Ops.empty();
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
      if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
          Ops.push_back(&II->getOperandUse(0));
      return !Ops.empty();
    case Intrinsic::aarch64_sme_write_horiz:
    case Intrinsic::aarch64_sme_write_vert:
    case Intrinsic::aarch64_sme_writeq_horiz:
    case Intrinsic::aarch64_sme_writeq_vert: {
      // Sink an add feeding the tile slice index (operand 1).
      auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(&II->getOperandUse(1));
      return true;
    }
    case Intrinsic::aarch64_sme_read_horiz:
    case Intrinsic::aarch64_sme_read_vert:
    case Intrinsic::aarch64_sme_readq_horiz:
    case Intrinsic::aarch64_sme_readq_vert:
    case Intrinsic::aarch64_sme_ld1b_vert:
    case Intrinsic::aarch64_sme_ld1h_vert:
    case Intrinsic::aarch64_sme_ld1w_vert:
    case Intrinsic::aarch64_sme_ld1d_vert:
    case Intrinsic::aarch64_sme_ld1q_vert:
    case Intrinsic::aarch64_sme_st1b_vert:
    case Intrinsic::aarch64_sme_st1h_vert:
    case Intrinsic::aarch64_sme_st1w_vert:
    case Intrinsic::aarch64_sme_st1d_vert:
    case Intrinsic::aarch64_sme_st1q_vert:
    case Intrinsic::aarch64_sme_ld1b_horiz:
    case Intrinsic::aarch64_sme_ld1h_horiz:
    case Intrinsic::aarch64_sme_ld1w_horiz:
    case Intrinsic::aarch64_sme_ld1d_horiz:
    case Intrinsic::aarch64_sme_ld1q_horiz:
    case Intrinsic::aarch64_sme_st1b_horiz:
    case Intrinsic::aarch64_sme_st1h_horiz:
    case Intrinsic::aarch64_sme_st1w_horiz:
    case Intrinsic::aarch64_sme_st1d_horiz:
    case Intrinsic::aarch64_sme_st1q_horiz: {
      // Same as above, but the slice index is operand 3 for these intrinsics.
      auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(&II->getOperandUse(3));
      return true;
    }
    case Intrinsic::aarch64_neon_pmull:
      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
        return false;
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;
    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
                                     II->getArgOperand(1)))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      Ops.push_back(&II->getArgOperandUse(1));
      return true;
    case Intrinsic::masked_gather:
      if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      return true;
    case Intrinsic::masked_scatter:
      if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
        return false;
      Ops.push_back(&II->getArgOperandUse(1));
      return true;
    default:
      return false;
    }
  }

  // Sink vscales closer to uses for better isel
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::Add:
  case Instruction::Sub:
    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
        Ops.push_back(&I->getOperandUse(Op));
        return true;
      }
    }
    break;
  default:
    break;
  }

  if (!I->getType()->isVectorTy())
    return false;

  switch (I->getOpcode()) {
  case Instruction::Sub:
  case Instruction::Add: {
    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
      return false;

    // If the exts' operands extract either
the lower or upper elements, we 14913 // can sink them too. 14914 auto Ext1 = cast<Instruction>(I->getOperand(0)); 14915 auto Ext2 = cast<Instruction>(I->getOperand(1)); 14916 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) { 14917 Ops.push_back(&Ext1->getOperandUse(0)); 14918 Ops.push_back(&Ext2->getOperandUse(0)); 14919 } 14920 14921 Ops.push_back(&I->getOperandUse(0)); 14922 Ops.push_back(&I->getOperandUse(1)); 14923 14924 return true; 14925 } 14926 case Instruction::Or: { 14927 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> 14928 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) 14929 if (Subtarget->hasNEON()) { 14930 Instruction *OtherAnd, *IA, *IB; 14931 Value *MaskValue; 14932 // MainAnd refers to And instruction that has 'Not' as one of its operands 14933 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)), 14934 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))), 14935 m_Instruction(IA)))))) { 14936 if (match(OtherAnd, 14937 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) { 14938 Instruction *MainAnd = I->getOperand(0) == OtherAnd 14939 ? cast<Instruction>(I->getOperand(1)) 14940 : cast<Instruction>(I->getOperand(0)); 14941 14942 // Both Ands should be in same basic block as Or 14943 if (I->getParent() != MainAnd->getParent() || 14944 I->getParent() != OtherAnd->getParent()) 14945 return false; 14946 14947 // Non-mask operands of both Ands should also be in same basic block 14948 if (I->getParent() != IA->getParent() || 14949 I->getParent() != IB->getParent()) 14950 return false; 14951 14952 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 
1 : 0)); 14953 Ops.push_back(&I->getOperandUse(0)); 14954 Ops.push_back(&I->getOperandUse(1)); 14955 14956 return true; 14957 } 14958 } 14959 } 14960 14961 return false; 14962 } 14963 case Instruction::Mul: { 14964 int NumZExts = 0, NumSExts = 0; 14965 for (auto &Op : I->operands()) { 14966 // Make sure we are not already sinking this operand 14967 if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) 14968 continue; 14969 14970 if (match(&Op, m_SExt(m_Value()))) { 14971 NumSExts++; 14972 continue; 14973 } else if (match(&Op, m_ZExt(m_Value()))) { 14974 NumZExts++; 14975 continue; 14976 } 14977 14978 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); 14979 14980 // If the Shuffle is a splat and the operand is a zext/sext, sinking the 14981 // operand and the s/zext can help create indexed s/umull. This is 14982 // especially useful to prevent i64 mul being scalarized. 14983 if (Shuffle && isSplatShuffle(Shuffle) && 14984 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { 14985 Ops.push_back(&Shuffle->getOperandUse(0)); 14986 Ops.push_back(&Op); 14987 if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) 14988 NumSExts++; 14989 else 14990 NumZExts++; 14991 continue; 14992 } 14993 14994 if (!Shuffle) 14995 continue; 14996 14997 Value *ShuffleOperand = Shuffle->getOperand(0); 14998 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); 14999 if (!Insert) 15000 continue; 15001 15002 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); 15003 if (!OperandInstr) 15004 continue; 15005 15006 ConstantInt *ElementConstant = 15007 dyn_cast<ConstantInt>(Insert->getOperand(2)); 15008 // Check that the insertelement is inserting into element 0 15009 if (!ElementConstant || !ElementConstant->isZero()) 15010 continue; 15011 15012 unsigned Opcode = OperandInstr->getOpcode(); 15013 if (Opcode == Instruction::SExt) 15014 NumSExts++; 15015 else if (Opcode == Instruction::ZExt) 15016 NumZExts++; 15017 else { 15018 // If we find 
that the top bits are known 0, then we can sink and allow 15019 // the backend to generate a umull. 15020 unsigned Bitwidth = I->getType()->getScalarSizeInBits(); 15021 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); 15022 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout(); 15023 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) 15024 continue; 15025 NumZExts++; 15026 } 15027 15028 Ops.push_back(&Shuffle->getOperandUse(0)); 15029 Ops.push_back(&Op); 15030 } 15031 15032 // Is it profitable to sink if we found two of the same type of extends. 15033 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); 15034 } 15035 default: 15036 return false; 15037 } 15038 return false; 15039 } 15040 15041 static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, 15042 bool IsLittleEndian) { 15043 Value *Op = ZExt->getOperand(0); 15044 auto *SrcTy = cast<FixedVectorType>(Op->getType()); 15045 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); 15046 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth(); 15047 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64) 15048 return false; 15049 15050 assert(DstWidth % SrcWidth == 0 && 15051 "TBL lowering is not supported for a ZExt instruction with this " 15052 "source & destination element type."); 15053 unsigned ZExtFactor = DstWidth / SrcWidth; 15054 unsigned NumElts = SrcTy->getNumElements(); 15055 IRBuilder<> Builder(ZExt); 15056 SmallVector<int> Mask; 15057 // Create a mask that selects <0,...,Op[i]> for each lane of the destination 15058 // vector to replace the original ZExt. This can later be lowered to a set of 15059 // tbl instructions. 
15060 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { 15061 if (IsLittleEndian) { 15062 if (i % ZExtFactor == 0) 15063 Mask.push_back(i / ZExtFactor); 15064 else 15065 Mask.push_back(NumElts); 15066 } else { 15067 if ((i + 1) % ZExtFactor == 0) 15068 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); 15069 else 15070 Mask.push_back(NumElts); 15071 } 15072 } 15073 15074 auto *FirstEltZero = Builder.CreateInsertElement( 15075 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); 15076 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); 15077 Result = Builder.CreateBitCast(Result, DstTy); 15078 if (DstTy != ZExt->getType()) 15079 Result = Builder.CreateZExt(Result, ZExt->getType()); 15080 ZExt->replaceAllUsesWith(Result); 15081 ZExt->eraseFromParent(); 15082 return true; 15083 } 15084 15085 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { 15086 IRBuilder<> Builder(TI); 15087 SmallVector<Value *> Parts; 15088 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements(); 15089 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType()); 15090 auto *DstTy = cast<FixedVectorType>(TI->getType()); 15091 assert(SrcTy->getElementType()->isIntegerTy() && 15092 "Non-integer type source vector element is not supported"); 15093 assert(DstTy->getElementType()->isIntegerTy(8) && 15094 "Unsupported destination vector element type"); 15095 unsigned SrcElemTySz = 15096 cast<IntegerType>(SrcTy->getElementType())->getBitWidth(); 15097 unsigned DstElemTySz = 15098 cast<IntegerType>(DstTy->getElementType())->getBitWidth(); 15099 assert((SrcElemTySz % DstElemTySz == 0) && 15100 "Cannot lower truncate to tbl instructions for a source element size " 15101 "that is not divisible by the destination element size"); 15102 unsigned TruncFactor = SrcElemTySz / DstElemTySz; 15103 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) && 15104 "Unsupported source vector element type size"); 15105 Type *VecTy = 
FixedVectorType::get(Builder.getInt8Ty(), 16); 15106 15107 // Create a mask to choose every nth byte from the source vector table of 15108 // bytes to create the truncated destination vector, where 'n' is the truncate 15109 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose 15110 // 0,8,16,..Y*8th bytes for the little-endian format 15111 SmallVector<Constant *, 16> MaskConst; 15112 for (int Itr = 0; Itr < 16; Itr++) { 15113 if (Itr < NumElements) 15114 MaskConst.push_back(Builder.getInt8( 15115 IsLittleEndian ? Itr * TruncFactor 15116 : Itr * TruncFactor + (TruncFactor - 1))); 15117 else 15118 MaskConst.push_back(Builder.getInt8(255)); 15119 } 15120 15121 int MaxTblSz = 128 * 4; 15122 int MaxSrcSz = SrcElemTySz * NumElements; 15123 int ElemsPerTbl = 15124 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz); 15125 assert(ElemsPerTbl <= 16 && 15126 "Maximum elements selected using TBL instruction cannot exceed 16!"); 15127 15128 int ShuffleCount = 128 / SrcElemTySz; 15129 SmallVector<int> ShuffleLanes; 15130 for (int i = 0; i < ShuffleCount; ++i) 15131 ShuffleLanes.push_back(i); 15132 15133 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles 15134 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated, 15135 // call TBL & save the result in a vector of TBL results for combining later. 
15136 SmallVector<Value *> Results; 15137 while (ShuffleLanes.back() < NumElements) { 15138 Parts.push_back(Builder.CreateBitCast( 15139 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); 15140 15141 if (Parts.size() == 4) { 15142 auto *F = Intrinsic::getDeclaration(TI->getModule(), 15143 Intrinsic::aarch64_neon_tbl4, VecTy); 15144 Parts.push_back(ConstantVector::get(MaskConst)); 15145 Results.push_back(Builder.CreateCall(F, Parts)); 15146 Parts.clear(); 15147 } 15148 15149 for (int i = 0; i < ShuffleCount; ++i) 15150 ShuffleLanes[i] += ShuffleCount; 15151 } 15152 15153 assert((Parts.empty() || Results.empty()) && 15154 "Lowering trunc for vectors requiring different TBL instructions is " 15155 "not supported!"); 15156 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD 15157 // registers 15158 if (!Parts.empty()) { 15159 Intrinsic::ID TblID; 15160 switch (Parts.size()) { 15161 case 1: 15162 TblID = Intrinsic::aarch64_neon_tbl1; 15163 break; 15164 case 2: 15165 TblID = Intrinsic::aarch64_neon_tbl2; 15166 break; 15167 case 3: 15168 TblID = Intrinsic::aarch64_neon_tbl3; 15169 break; 15170 } 15171 15172 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); 15173 Parts.push_back(ConstantVector::get(MaskConst)); 15174 Results.push_back(Builder.CreateCall(F, Parts)); 15175 } 15176 15177 // Extract the destination vector from TBL result(s) after combining them 15178 // where applicable. Currently, at most two TBLs are supported. 
15179 assert(Results.size() <= 2 && "Trunc lowering does not support generation of " 15180 "more than 2 tbl instructions!"); 15181 Value *FinalResult = Results[0]; 15182 if (Results.size() == 1) { 15183 if (ElemsPerTbl < 16) { 15184 SmallVector<int> FinalMask(ElemsPerTbl); 15185 std::iota(FinalMask.begin(), FinalMask.end(), 0); 15186 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask); 15187 } 15188 } else { 15189 SmallVector<int> FinalMask(ElemsPerTbl * Results.size()); 15190 if (ElemsPerTbl < 16) { 15191 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0); 15192 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16); 15193 } else { 15194 std::iota(FinalMask.begin(), FinalMask.end(), 0); 15195 } 15196 FinalResult = 15197 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask); 15198 } 15199 15200 TI->replaceAllUsesWith(FinalResult); 15201 TI->eraseFromParent(); 15202 } 15203 15204 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( 15205 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const { 15206 // shuffle_vector instructions are serialized when targeting SVE, 15207 // see LowerSPLAT_VECTOR. This peephole is not beneficial. 15208 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors()) 15209 return false; 15210 15211 // Try to optimize conversions using tbl. This requires materializing constant 15212 // index vectors, which can increase code size and add loads. Skip the 15213 // transform unless the conversion is in a loop block guaranteed to execute 15214 // and we are not optimizing for size. 
15215 Function *F = I->getParent()->getParent(); 15216 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || 15217 F->hasOptSize()) 15218 return false; 15219 15220 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); 15221 auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); 15222 if (!SrcTy || !DstTy) 15223 return false; 15224 15225 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be 15226 // lowered to tbl instructions to insert the original i8 elements 15227 // into i8x lanes. This is enabled for cases where it is beneficial. 15228 auto *ZExt = dyn_cast<ZExtInst>(I); 15229 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) { 15230 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits(); 15231 if (DstWidth % 8 != 0) 15232 return false; 15233 15234 auto *TruncDstType = 15235 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy)); 15236 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and 15237 // the remaining ZExt folded into the user, don't use tbl lowering. 
15238 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits(); 15239 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType, 15240 TargetTransformInfo::getCastContextHint(I), 15241 TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) { 15242 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits()) 15243 return false; 15244 15245 DstTy = TruncDstType; 15246 } 15247 15248 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian()); 15249 } 15250 15251 auto *UIToFP = dyn_cast<UIToFPInst>(I); 15252 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && 15253 DstTy->getElementType()->isFloatTy()) { 15254 IRBuilder<> Builder(I); 15255 auto *ZExt = cast<ZExtInst>( 15256 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy))); 15257 auto *UI = Builder.CreateUIToFP(ZExt, DstTy); 15258 I->replaceAllUsesWith(UI); 15259 I->eraseFromParent(); 15260 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()), 15261 Subtarget->isLittleEndian()); 15262 } 15263 15264 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui 15265 // followed by a truncate lowered to using tbl.4. 
15266 auto *FPToUI = dyn_cast<FPToUIInst>(I); 15267 if (FPToUI && 15268 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && 15269 SrcTy->getElementType()->isFloatTy() && 15270 DstTy->getElementType()->isIntegerTy(8)) { 15271 IRBuilder<> Builder(I); 15272 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0), 15273 VectorType::getInteger(SrcTy)); 15274 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy); 15275 I->replaceAllUsesWith(TruncI); 15276 I->eraseFromParent(); 15277 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian()); 15278 return true; 15279 } 15280 15281 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate 15282 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits 15283 // per lane of the input that is represented using 1,2,3 or 4 128-bit table 15284 // registers 15285 auto *TI = dyn_cast<TruncInst>(I); 15286 if (TI && DstTy->getElementType()->isIntegerTy(8) && 15287 ((SrcTy->getElementType()->isIntegerTy(32) || 15288 SrcTy->getElementType()->isIntegerTy(64)) && 15289 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) { 15290 createTblForTrunc(TI, Subtarget->isLittleEndian()); 15291 return true; 15292 } 15293 15294 return false; 15295 } 15296 15297 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 15298 Align &RequiredAligment) const { 15299 if (!LoadedType.isSimple() || 15300 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 15301 return false; 15302 // Cyclone supports unaligned accesses. 15303 RequiredAligment = Align(1); 15304 unsigned NumBits = LoadedType.getSizeInBits(); 15305 return NumBits == 32 || NumBits == 64; 15306 } 15307 15308 /// A helper function for determining the number of interleaved accesses we 15309 /// will generate when lowering accesses of the given type. 
unsigned AArch64TargetLowering::getNumInterleavedAccesses(
    VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
  unsigned VecSize = 128;
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
  unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
  if (UseScalable)
    // With SVE the unit of splitting is the (min) SVE register, never < 128.
    VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
  // Round the total bit size up to a whole number of VecSize-bit accesses.
  return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
}

/// Tag memory accesses carrying Falkor strided-access metadata with the
/// target-specific MOStridedAccess flag (Falkor only).
MachineMemOperand::Flags
AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
    return MOStridedAccess;
  return MachineMemOperand::MONone;
}

bool AArch64TargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
  auto EC = VecTy->getElementCount();
  unsigned MinElts = EC.getKnownMinValue();

  UseScalable = false;

  if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
    return false;

  if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
    return false;

  // Ensure that the predicate for this number of elements is available.
  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (MinElts < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
    return false;

  if (EC.isScalable()) {
    UseScalable = true;
    return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
  }

  // Fixed-width vector: decide whether to lower via SVE (predicated)
  // structured accesses instead of NEON.
  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  if (!Subtarget->isNeonAvailable() ||
      (Subtarget->useSVEForFixedLengthVectors() &&
       (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
         isPowerOf2_32(MinElts) && VecSize > 128)))) {
    UseScalable = true;
    return true;
  }

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}

/// Return the scalable vector type with the same element type as \p VTy and
/// a 128-bit known-minimum size (element count = 128 / element size).
static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
  if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 2);

  if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 4);

  if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 8);

  if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 8);

  if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 2);

  if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 4);

  if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 8);

  if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 16);

  llvm_unreachable("Cannot handle input vector type");
}

/// Return the declaration of the structured-load intrinsic for \p Factor
/// (2-4): SVE ld2/ld3/ld4 with sret when \p Scalable, NEON ld2/ld3/ld4
/// otherwise.
static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
                                           bool Scalable, Type *LDVTy,
                                           Type *PtrTy) {
  assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
  static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
                                            Intrinsic::aarch64_sve_ld3_sret,
                                            Intrinsic::aarch64_sve_ld4_sret};
  static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
                                             Intrinsic::aarch64_neon_ld3,
                                             Intrinsic::aarch64_neon_ld4};
  if (Scalable)
    return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});

  return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
}

/// Return the declaration of the structured-store intrinsic for \p Factor
/// (2-4): SVE st2/st3/st4 when \p Scalable, NEON st2/st3/st4 otherwise.
static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
                                            bool Scalable, Type *STVTy,
                                            Type *PtrTy) {
  assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
  static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
                                             Intrinsic::aarch64_sve_st3,
                                             Intrinsic::aarch64_sve_st4};
  static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
                                              Intrinsic::aarch64_neon_st3,
                                              Intrinsic::aarch64_neon_st4};
  if (Scalable)
    return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});

  return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
}

/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  const DataLayout &DL = LI->getModule()->getDataLayout();

  VectorType *VTy = Shuffles[0]->getType();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  bool UseScalable;
  if (!Subtarget->hasNEON() ||
      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);

  auto *FVTy = cast<FixedVectorType>(VTy);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  Type *EltTy = FVTy->getElementType();
  if (EltTy->isPointerTy())
    FVTy =
        FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());

  // If we're going to generate more than one load, reset the sub-vector type
  // to something legal.
  FVTy = FixedVectorType::get(FVTy->getElementType(),
                              FVTy->getNumElements() / NumLoads);

  auto *LDVTy =
      UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  Type *PtrTy = LI->getPointerOperandType();
  Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
                                 LDVTy->getElementCount());

  Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
                                                UseScalable, LDVTy, PtrTy);

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  Value *PTrue = nullptr;
  if (UseScalable) {
    std::optional<unsigned> PgPattern =
        getSVEPredPatternFromNumElements(FVTy->getNumElements());
    // When the exact SVE register size is known and matches the vector size,
    // an all-lanes predicate can be used instead of a counted pattern.
    if (Subtarget->getMinSVEVectorSizeInBits() ==
            Subtarget->getMaxSVEVectorSizeInBits() &&
        Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
      PgPattern = AArch64SVEPredPattern::all;

    auto *PTruePat =
        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
                                    {PTruePat});
  }

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {

    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
                                            FVTy->getNumElements() * Factor);

    CallInst *LdN;
    if (UseScalable)
      LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
    else
      LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");

    // Extract and store the sub-vectors returned by the load intrinsic.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SVI = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(LdN, Index);

      // For SVE, extract the fixed-width part from the scalable result.
      if (UseScalable)
        SubVec = Builder.CreateExtractVector(
            FVTy, SubVec,
            ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
                                         FVTy->getNumElements()));

      SubVecs[SVI].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Scan forward from \p It (skipping debug/pseudo instructions, at most 20
/// real instructions) looking for a store whose address, after stripping
/// constant in-bounds offsets, shares \p Ptr's base and is exactly 16 bytes
/// away — i.e. a store that could pair with one at \p Ptr.
template <typename Iter>
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
  int MaxLookupDist = 20;
  unsigned IdxWidth = DL.getIndexSizeInBits(0);
  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  const Value *PtrA1 =
      Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);

  while (++It != End) {
    if (It->isDebugOrPseudoInst())
      continue;
    if (MaxLookupDist-- == 0)
      break;
    if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
      const Value *PtrB1 =
          SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
              DL, OffsetB);
      if (PtrA1 == PtrB1 &&
          (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
                  .abs() == 16)
        return true;
    }
  }

  return false;
}

/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3).
/// Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                  ShuffleVectorInst *SVI,
                                                  unsigned Factor) const {

  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  bool UseScalable;

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() ||
      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);
    unsigned NumOpElts =
        cast<FixedVectorType>(Op0->getType())->getNumElements();

    // Convert to the corresponding integer vector.
    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // If we're going to generate more than one store, reset the lane length
  // and sub-vector type to something legal.
  LaneLen /= NumStores;
  SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

  auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
                            : SubVecTy;

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  auto Mask = SVI->getShuffleMask();

  // Sanity check if all the indices are NOT in range.
  // If mask is `poison`, `Mask` may be a vector of -1s.
  // If all of them are `poison`, OOB read will happen later.
  if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    return false;
  }
  // A 64bit st2 which does not start at element 0 will involve adding extra
  // ext elements making the st2 unprofitable, and if there is a nearby store
  // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
  // zip;ldp pair which has higher throughput.
  if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
      (Mask[0] != 0 ||
       hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
                            DL) ||
       hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
                            BaseAddr, DL)))
    return false;

  Type *PtrTy = SI->getPointerOperandType();
  Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
                                 STVTy->getElementCount());

  Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
                                                 UseScalable, STVTy, PtrTy);

  Value *PTrue = nullptr;
  if (UseScalable) {
    std::optional<unsigned> PgPattern =
        getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
    // If the hardware vector length exactly matches the sub-vector size, an
    // all-lanes predicate covers precisely the required elements.
    if (Subtarget->getMinSVEVectorSizeInBits() ==
            Subtarget->getMaxSVEVectorSizeInBits() &&
        Subtarget->getMinSVEVectorSizeInBits() ==
            DL.getTypeSizeInBits(SubVecTy))
      PgPattern = AArch64SVEPredPattern::all;

    auto *PTruePat =
        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
                                    {PTruePat});
  }

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {

    SmallVector<Value *, 5> Ops;

    // Split the shufflevector operands into sub vectors for the new stN call.
    for (unsigned i = 0; i < Factor; i++) {
      Value *Shuffle;
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffle = Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
      } else {
        // First lane of this sub-vector is undef: derive the start index from
        // the first defined lane further along the same interleave slot.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
          if (Mask[IdxJ] >= 0) {
            StartMask = Mask[IdxJ] - j;
            break;
          }
        }
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Shuffle = Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
      }

      if (UseScalable)
        Shuffle = Builder.CreateInsertVector(
            STVTy, UndefValue::get(STVTy), Shuffle,
            ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));

      Ops.push_back(Shuffle);
    }

    if (UseScalable)
      Ops.push_back(PTrue);

    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    Ops.push_back(BaseAddr);
    Builder.CreateCall(StNFunc, Ops);
  }
  return true;
}

bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
    IntrinsicInst *DI, LoadInst *LI) const {
  // Only deinterleave2 supported at present.
  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
    return false;

  // Only a factor of 2 supported at present.
  const unsigned Factor = 2;

  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
  const DataLayout &DL = DI->getModule()->getDataLayout();
  bool UseScalable;
  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
    return false;

  // TODO: Add support for using SVE instructions with fixed types later, using
  // the code from lowerInterleavedLoad to obtain the correct container type.
  if (UseScalable && !VTy->isScalableTy())
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);

  VectorType *LdTy =
      VectorType::get(VTy->getElementType(),
                      VTy->getElementCount().divideCoefficientBy(NumLoads));

  Type *PtrTy = LI->getPointerOperandType();
  Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
                                                UseScalable, LdTy, PtrTy);

  IRBuilder<> Builder(LI);

  Value *Pred = nullptr;
  if (UseScalable)
    Pred =
        Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());

  Value *BaseAddr = LI->getPointerOperand();
  Value *Result;
  if (NumLoads > 1) {
    // The wide load was legalized into several narrow ldN calls; gather each
    // call's two results back into full-width Left/Right vectors.
    Value *Left = PoisonValue::get(VTy);
    Value *Right = PoisonValue::get(VTy);

    for (unsigned I = 0; I < NumLoads; ++I) {
      Value *Offset = Builder.getInt64(I * Factor);

      Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
      Value *LdN = nullptr;
      if (UseScalable)
        LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
      else
        LdN = Builder.CreateCall(LdNFunc, Address, "ldN");

      Value *Idx =
          Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
      Left = Builder.CreateInsertVector(
          VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
      Right = Builder.CreateInsertVector(
          VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
    }

    Result = PoisonValue::get(DI->getType());
    Result = Builder.CreateInsertValue(Result, Left, 0);
    Result = Builder.CreateInsertValue(Result, Right, 1);
  } else {
    if (UseScalable)
      Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
    else
      Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
  }

  DI->replaceAllUsesWith(Result);
  return true;
}

bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
    IntrinsicInst *II, StoreInst *SI) const {
  // Only interleave2 supported at present.
  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
    return false;

  // Only a factor of 2 supported at present.
  const unsigned Factor = 2;

  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
  const DataLayout &DL = II->getModule()->getDataLayout();
  bool UseScalable;
  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
    return false;

  // TODO: Add support for using SVE instructions with fixed types later, using
  // the code from lowerInterleavedStore to obtain the correct container type.
  if (UseScalable && !VTy->isScalableTy())
    return false;

  unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);

  VectorType *StTy =
      VectorType::get(VTy->getElementType(),
                      VTy->getElementCount().divideCoefficientBy(NumStores));

  Type *PtrTy = SI->getPointerOperandType();
  Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
                                                 UseScalable, StTy, PtrTy);

  IRBuilder<> Builder(SI);

  Value *BaseAddr = SI->getPointerOperand();
  Value *Pred = nullptr;

  if (UseScalable)
    Pred =
        Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());

  Value *L = II->getOperand(0);
  Value *R = II->getOperand(1);

  for (unsigned I = 0; I < NumStores; ++I) {
    Value *Address = BaseAddr;
    if (NumStores > 1) {
      // Legalized into multiple stN calls: slice this iteration's chunk out of
      // each input and address the store at the matching offset.
      Value *Offset = Builder.getInt64(I * Factor);
      Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});

      Value *Idx =
          Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
      L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
      R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
    }

    if (UseScalable)
      Builder.CreateCall(StNFunc, {L, R, Pred, Address});
    else
      Builder.CreateCall(StNFunc, {L, R, Address});
  }

  return true;
}

EVT AArch64TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memset of 32-byte and above. It would have
  // taken one instruction to materialize the v2i64 zero and one store (with
  // restrictive addressing mode). Just do i64 stores.
  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  // A misaligned access is acceptable if it is either naturally aligned or
  // the target reports that misaligned accesses of this type are fast.
  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
    if (Op.isAligned(AlignCheck))
      return true;
    unsigned Fast;
    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
                                          MachineMemOperand::MONone, &Fast) &&
           Fast;
  };

  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
      AlignmentIsAcceptable(MVT::v16i8, Align(16)))
    return MVT::v16i8;
  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
    return MVT::f128;
  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
    return MVT::i64;
  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
    return MVT::i32;
  return MVT::Other;
}

LLT AArch64TargetLowering::getOptimalMemOpLLT(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memset of 32-byte and above. It would have
  // taken one instruction to materialize the v2i64 zero and one store (with
  // restrictive addressing mode). Just do i64 stores.
  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  // A misaligned access is acceptable if it is either naturally aligned or
  // the target reports that misaligned accesses of this type are fast.
  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
    if (Op.isAligned(AlignCheck))
      return true;
    unsigned Fast;
    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
                                          MachineMemOperand::MONone, &Fast) &&
           Fast;
  };

  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
    return LLT::fixed_vector(2, 64);
  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
    return LLT::scalar(128);
  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
    return LLT::scalar(64);
  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
    return LLT::scalar(32);
  return LLT();
}

// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  // std::abs(INT64_MIN) is UB; reject it up front.
  if (Immed == std::numeric_limits<int64_t>::min()) {
    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
                      << ": avoid UB for INT64_MIN\n");
    return false;
  }
  // Same encoding for add/sub, just flip the sign.
  Immed = std::abs(Immed);
  bool IsLegal = ((Immed >> 12) == 0 ||
                  ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
  LLVM_DEBUG(dbgs() << "Is " << Immed
                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
  return IsLegal;
}

// Return false to prevent folding
// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
// if the folding leads to worse code.
bool AArch64TargetLowering::isMulAddWithConstProfitable(
    SDValue AddNode, SDValue ConstNode) const {
  // Let the DAGCombiner decide for vector types and large types.
  const EVT VT = AddNode.getValueType();
  if (VT.isVector() || VT.getScalarSizeInBits() > 64)
    return true;

  // It is worse if c1 is legal add immediate, while c1*c2 is not
  // and has to be composed by at least two instructions.
  const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
  const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
  const int64_t C1 = C1Node->getSExtValue();
  const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
  if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
    return true;
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  // Adapt to the width of a register.
  unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
  AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
  if (Insn.size() > 1)
    return false;

  // Default to true and let the DAGCombiner decide.
  return true;
}

// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  return isLegalAddImmediate(Immed);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                  const AddrMode &AMode,
                                                  Type *Ty, unsigned AS,
                                                  Instruction *I) const {
  // AArch64 has five basic addressing modes:
  //  reg
  //  reg + 9-bit signed offset
  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //  reg1 + reg2
  //  reg + SIZE_IN_BYTES * reg

  // No global is ever allowed as a base.
  if (AMode.BaseGV)
    return false;

  // No reg+reg+imm addressing.
  if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
    return false;

  // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
  // `2*ScaledReg` into `BaseReg + ScaledReg`
  AddrMode AM = AMode;
  if (AM.Scale && !AM.HasBaseReg) {
    if (AM.Scale == 1) {
      AM.HasBaseReg = true;
      AM.Scale = 0;
    } else if (AM.Scale == 2) {
      AM.HasBaseReg = true;
      AM.Scale = 1;
    } else {
      return false;
    }
  }

  // A base register is required in all addressing modes.
  if (!AM.HasBaseReg)
    return false;

  if (Ty->isScalableTy()) {
    if (isa<ScalableVectorType>(Ty)) {
      // Scalable vectors: no immediate offset; a scaled register is only
      // allowed when the scale equals the element size in bytes.
      uint64_t VecElemNumBytes =
          DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
      return AM.HasBaseReg && !AM.BaseOffs &&
             (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
    }

    // Other scalable types: base register only.
    return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
  }

  // check reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
    NumBytes = NumBits / 8;
    if (!isPowerOf2_64(NumBits))
      NumBytes = 0;
  }

  return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
                                                          AM.Scale);
}

// Check whether the 2 offsets belong to the same imm24 range, and their high
// 12bits are same, then their high part can be decoded with the offset of add.
int64_t
AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
                                                      int64_t MaxOffset) const {
  int64_t HighPart = MinOffset & ~0xfffULL;
  if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
    // Rebase the value to an integer multiple of imm12.
    return HighPart;
  }

  return 0;
}

bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
  // Consider splitting large offset of struct or array.
  return true;
}

bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
    const MachineFunction &MF, EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f16:
    // f16 FMA is only profitable with full FP16 support.
    return Subtarget->hasFullFP16();
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
                                                       Type *Ty) const {
  switch (Ty->getScalarType()->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  default:
    return false;
  }
}

bool AArch64TargetLowering::generateFMAsInMachineCombiner(
    EVT VT, CodeGenOptLevel OptLevel) const {
  return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
         !useSVEForFixedLengthVectorVT(VT);
}

const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg ScratchRegs[] = {
    AArch64::X16, AArch64::X17, AArch64::LR, 0
  };
  return ScratchRegs;
}

ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AArch64::FPCR};
  return RCRegs;
}

bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                     CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  SDValue ShiftLHS = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
  // combine it with shift 'N' to let it be lowered to UBFX except:
  // ((x >> C) & mask) << C.
  if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
      isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
    uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
    if (isMask_64(TruncMask)) {
      SDValue AndLHS = ShiftLHS.getOperand(0);
      if (AndLHS.getOpcode() == ISD::SRL) {
        if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
          if (N->getOpcode() == ISD::SHL)
            if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
              // Only commute when the SHL undoes the SRL exactly.
              return SRLC->getZExtValue() == SHLC->getZExtValue();
          return false;
        }
      }
    }
  }
  return true;
}

bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
    const SDNode *N) const {
  assert(N->getOpcode() == ISD::XOR &&
         (N->getOperand(0).getOpcode() == ISD::SHL ||
          N->getOperand(0).getOpcode() == ISD::SRL) &&
         "Expected XOR(SHIFT) pattern");

  // Only commute if the entire NOT mask is a hidden shifted mask.
  auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
  if (XorC && ShiftC) {
    unsigned MaskIdx, MaskLen;
    if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
      unsigned ShiftAmt = ShiftC->getZExtValue();
      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
      if (N->getOperand(0).getOpcode() == ISD::SHL)
        return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
      return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
    }
  }

  return false;
}

bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  // Don't allow multiuse shift folding with the same shift amount.
  if (!N->getOperand(0)->hasOneUse())
    return false;

  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
  }

  return true;
}

bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
    unsigned BinOpcode, EVT VT) const {
  return VT.isScalableVector() && isTypeLegal(VT);
}

bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                              Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return false;

  int64_t Val = Imm.getSExtValue();
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
    return true;

  // Count how many MOVK instructions a MOVZ/MOVN-based expansion would need.
  if ((int64_t)Val < 0)
    Val = ~Val;
  if (BitSize == 32)
    Val &= (1LL << 32) - 1;

  unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
  // MOVZ is free so return true for one or fewer MOVK.
  return Shift < 3;
}

bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                    unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Only extracts of the low or high half are cheap.
  return (Index == 0 || Index == ResVT.getVectorMinNumElements());
}

/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   cmge X, X, #0
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!Subtarget->hasNEON() || !VT.isVector())
    return SDValue();

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
    return SDValue();

  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}

// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
//
// i32 vecreduce_add(
//  v16i32 abs(
//    v16i32 sub(
//     v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
// =================>
// i32 vecreduce_add(
//  v4i32 UADDLP(
//    v8i16 add(
//      v8i16 zext(
//        v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
//      v8i16 zext(
//        v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
                                                    SelectionDAG &DAG) {
  // Assumed i32 vecreduce_add
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SDValue VecReduceOp0 = N->getOperand(0);
  unsigned Opcode = VecReduceOp0.getOpcode();
  // Assumed v16i32 abs
  if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
    return SDValue();

  SDValue ABS = VecReduceOp0;
  // Assumed v16i32 sub
  if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
      ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
    return SDValue();

  SDValue SUB = ABS->getOperand(0);
  unsigned Opcode0 = SUB->getOperand(0).getOpcode();
  unsigned Opcode1 = SUB->getOperand(1).getOpcode();
  // Assumed v16i32 type
  if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
      SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
    return SDValue();

  // Assumed zext or sext; both operands must use the same extension.
  bool IsZExt = false;
  if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
    IsZExt = true;
  } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
    IsZExt = false;
  } else
    return SDValue();

  SDValue EXT0 = SUB->getOperand(0);
  SDValue EXT1 = SUB->getOperand(1);
  // Assumed zext's operand has v16i8 type
  if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
      EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
    return SDValue();

  // Pattern is detected. Let's convert it to sequence of nodes.
  SDLoc DL(N);

  // First, create the node pattern of UABD/SABD on the high halves.
  SDValue UABDHigh8Op0 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
                  DAG.getConstant(8, DL, MVT::i64));
  SDValue UABDHigh8Op1 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
                  DAG.getConstant(8, DL, MVT::i64));
  SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
                                  UABDHigh8Op0, UABDHigh8Op1);
  SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);

  // Second, create the node pattern of UABAL on the low halves.
  SDValue UABDLo8Op0 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
                  DAG.getConstant(0, DL, MVT::i64));
  SDValue UABDLo8Op1 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
                  DAG.getConstant(0, DL, MVT::i64));
  SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
                                UABDLo8Op0, UABDLo8Op1);
  SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
  SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);

  // Third, create the node of UADDLP.
  SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);

  // Fourth, create the node of VECREDUCE_ADD.
  return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
}

// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
// If we have vectors larger than v16i8 we extract v16i8 vectors,
// Follow the same steps above to get DOT instructions concatenate them
// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
                                          const AArch64Subtarget *ST) {
  // Without the dot-product extension, fall back to the UADDLP combine.
  if (!ST->hasDotProd())
    return performVecReduceAddCombineWithUADDLP(N, DAG);

  SDValue Op0 = N->getOperand(0);
  if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
      Op0.getValueType().getVectorElementType() != MVT::i32)
    return SDValue();

  unsigned ExtOpcode = Op0.getOpcode();
  SDValue A = Op0;
  SDValue B;
  if (ExtOpcode == ISD::MUL) {
    A = Op0.getOperand(0);
    B = Op0.getOperand(1);
    if (A.getOpcode() != B.getOpcode() ||
        A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
      return SDValue();
    ExtOpcode = A.getOpcode();
  }
  if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
    return SDValue();

  EVT Op0VT = A.getOperand(0).getValueType();
  bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
  bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
  if (!IsValidElementCount || !IsValidSize)
    return SDValue();

  SDLoc DL(Op0);
  // For non-mla reductions B can be set to 1. For MLA we take the operand of
  // the extend B.
  if (!B)
    B = DAG.getConstant(1, DL, Op0VT);
  else
    B = B.getOperand(0);

  unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
  unsigned NumOfVecReduce;
  EVT TargetType;
  if (IsMultipleOf16) {
    NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
    TargetType = MVT::v4i32;
  } else {
    NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
    TargetType = MVT::v2i32;
  }
  auto DotOpcode =
      (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
  // Handle the case where we need to generate only one Dot operation.
  if (NumOfVecReduce == 1) {
    SDValue Zeros = DAG.getConstant(0, DL, TargetType);
    SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
                              A.getOperand(0), B);
    return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
  }
  // Generate Dot instructions that are multiple of 16.
  unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
  SmallVector<SDValue, 4> SDotVec16;
  unsigned I = 0;
  for (; I < VecReduce16Num; I += 1) {
    SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
    SDValue Op0 =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
                    DAG.getConstant(I * 16, DL, MVT::i64));
    SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
                              DAG.getConstant(I * 16, DL, MVT::i64));
    SDValue Dot =
        DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
    SDotVec16.push_back(Dot);
  }
  // Concatenate dot operations.
  EVT SDot16EVT =
      EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
  SDValue ConcatSDot16 =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
  SDValue VecReduceAdd16 =
      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
  unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
  if (VecReduce8Num == 0)
    return VecReduceAdd16;

  // Generate the remainder Dot operation that is multiple of 8.
  SmallVector<SDValue, 4> SDotVec8;
  SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
  SDValue Vec8Op0 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
                  DAG.getConstant(I * 16, DL, MVT::i64));
  SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
                                DAG.getConstant(I * 16, DL, MVT::i64));
  SDValue Dot =
      DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
  SDValue VecReudceAdd8 =
      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
                     VecReudceAdd8);
}

// Given an (integer) vecreduce, we know the order of the inputs does not
// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
  auto DetectAddExtract = [&](SDValue A) {
    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
    // UADDLP(x) if found.
    if (A.getOpcode() != ISD::ADD)
      return SDValue();
    EVT VT = A.getValueType();
    SDValue Op0 = A.getOperand(0);
    SDValue Op1 = A.getOperand(1);
    // NOTE(review): `Op0.getOpcode() != Op0.getOpcode()` compares Op0's opcode
    // against itself and is always false, so mismatched Op0/Op1 extend opcodes
    // are not rejected here; this likely should read
    // `Op0.getOpcode() != Op1.getOpcode()` — verify against upstream LLVM.
    if (Op0.getOpcode() != Op0.getOpcode() ||
        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
         Op0.getOpcode() != ISD::SIGN_EXTEND))
      return SDValue();
    SDValue Ext0 = Op0.getOperand(0);
    SDValue Ext1 = Op1.getOperand(0);
    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext0.getOperand(0) != Ext1.getOperand(0))
      return SDValue();
    // Check that the type is twice the add types, and the extract are from
    // upper/lower parts of the same source.
    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
        VT.getVectorNumElements() * 2)
      return SDValue();
    if ((Ext0.getConstantOperandVal(1) != 0 ||
         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
        (Ext1.getConstantOperandVal(1) != 0 ||
         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
      return SDValue();
    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ?
AArch64ISD::UADDLP 16524 : AArch64ISD::SADDLP; 16525 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0)); 16526 }; 16527 16528 if (SDValue R = DetectAddExtract(A)) 16529 return R; 16530 16531 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse()) 16532 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG)) 16533 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, 16534 A.getOperand(1)); 16535 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse()) 16536 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG)) 16537 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R, 16538 A.getOperand(0)); 16539 return SDValue(); 16540 } 16541 16542 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { 16543 SDValue A = N->getOperand(0); 16544 if (A.getOpcode() == ISD::ADD) 16545 if (SDValue R = performUADDVAddCombine(A, DAG)) 16546 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R); 16547 return SDValue(); 16548 } 16549 16550 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, 16551 TargetLowering::DAGCombinerInfo &DCI, 16552 const AArch64Subtarget *Subtarget) { 16553 if (DCI.isBeforeLegalizeOps()) 16554 return SDValue(); 16555 16556 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget); 16557 } 16558 16559 SDValue 16560 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 16561 SelectionDAG &DAG, 16562 SmallVectorImpl<SDNode *> &Created) const { 16563 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 16564 if (isIntDivCheap(N->getValueType(0), Attr)) 16565 return SDValue(N,0); // Lower SDIV as SDIV 16566 16567 EVT VT = N->getValueType(0); 16568 16569 // For scalable and fixed types, mark them as cheap so we can handle it much 16570 // later. This allows us to handle larger than legal types. 
16571 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) 16572 return SDValue(N, 0); 16573 16574 // fold (sdiv X, pow2) 16575 if ((VT != MVT::i32 && VT != MVT::i64) || 16576 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) 16577 return SDValue(); 16578 16579 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created); 16580 } 16581 16582 SDValue 16583 AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, 16584 SelectionDAG &DAG, 16585 SmallVectorImpl<SDNode *> &Created) const { 16586 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 16587 if (isIntDivCheap(N->getValueType(0), Attr)) 16588 return SDValue(N, 0); // Lower SREM as SREM 16589 16590 EVT VT = N->getValueType(0); 16591 16592 // For scalable and fixed types, mark them as cheap so we can handle it much 16593 // later. This allows us to handle larger than legal types. 16594 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) 16595 return SDValue(N, 0); 16596 16597 // fold (srem X, pow2) 16598 if ((VT != MVT::i32 && VT != MVT::i64) || 16599 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) 16600 return SDValue(); 16601 16602 unsigned Lg2 = Divisor.countr_zero(); 16603 if (Lg2 == 0) 16604 return SDValue(); 16605 16606 SDLoc DL(N); 16607 SDValue N0 = N->getOperand(0); 16608 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 16609 SDValue Zero = DAG.getConstant(0, DL, VT); 16610 SDValue CCVal, CSNeg; 16611 if (Lg2 == 1) { 16612 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL); 16613 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); 16614 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp); 16615 16616 Created.push_back(Cmp.getNode()); 16617 Created.push_back(And.getNode()); 16618 } else { 16619 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC); 16620 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 16621 16622 SDValue Negs = 
DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0); 16623 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne); 16624 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne); 16625 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal, 16626 Negs.getValue(1)); 16627 16628 Created.push_back(Negs.getNode()); 16629 Created.push_back(AndPos.getNode()); 16630 Created.push_back(AndNeg.getNode()); 16631 } 16632 16633 return CSNeg; 16634 } 16635 16636 static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) { 16637 switch(getIntrinsicID(S.getNode())) { 16638 default: 16639 break; 16640 case Intrinsic::aarch64_sve_cntb: 16641 return 8; 16642 case Intrinsic::aarch64_sve_cnth: 16643 return 16; 16644 case Intrinsic::aarch64_sve_cntw: 16645 return 32; 16646 case Intrinsic::aarch64_sve_cntd: 16647 return 64; 16648 } 16649 return {}; 16650 } 16651 16652 /// Calculates what the pre-extend type is, based on the extension 16653 /// operation node provided by \p Extend. 16654 /// 16655 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the 16656 /// pre-extend type is pulled directly from the operand, while other extend 16657 /// operations need a bit more inspection to get this information. 
///
/// \param Extend The SDNode from the DAG that represents the extend operation
///
/// \returns The type representing the \p Extend source type, or \p MVT::Other
/// if no valid type can be determined
static EVT calculatePreExtendType(SDValue Extend) {
  switch (Extend.getOpcode()) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    return Extend.getOperand(0).getValueType();
  case ISD::AssertSext:
  case ISD::AssertZext:
  case ISD::SIGN_EXTEND_INREG: {
    // These carry the source type as a VT operand.
    VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
    if (!TypeNode)
      return MVT::Other;
    return TypeNode->getVT();
  }
  case ISD::AND: {
    // An AND with a full low-bit mask acts as a zero-extend from the masked
    // width; recognize the i8/i16/i32 masks.
    ConstantSDNode *Constant =
        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
    if (!Constant)
      return MVT::Other;

    uint32_t Mask = Constant->getZExtValue();

    if (Mask == UCHAR_MAX)
      return MVT::i8;
    else if (Mask == USHRT_MAX)
      return MVT::i16;
    else if (Mask == UINT_MAX)
      return MVT::i32;

    return MVT::Other;
  }
  default:
    return MVT::Other;
  }
}

/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
/// SExt/ZExt rather than the scalar SExt/ZExt
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
  EVT VT = BV.getValueType();
  if (BV.getOpcode() != ISD::BUILD_VECTOR &&
      BV.getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();

  // Use the first item in the buildvector/shuffle to get the size of the
  // extend, and make sure it looks valid.
  SDValue Extend = BV->getOperand(0);
  unsigned ExtendOpcode = Extend.getOpcode();
  bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
                ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
                ExtendOpcode == ISD::AssertSext;
  if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
      ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
    return SDValue();
  // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
  // calculatePreExtendType will work without issue.
  if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
      ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
    return SDValue();

  // Restrict valid pre-extend data type to exactly half the result width.
  EVT PreExtendType = calculatePreExtendType(Extend);
  if (PreExtendType == MVT::Other ||
      PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
    return SDValue();

  // Make sure all other operands are equally extended.
  for (SDValue Op : drop_begin(BV->ops())) {
    if (Op.isUndef())
      continue;
    unsigned Opc = Op.getOpcode();
    bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
                     Opc == ISD::AssertSext;
    if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
      return SDValue();
  }

  SDValue NBV;
  SDLoc DL(BV);
  if (BV.getOpcode() == ISD::BUILD_VECTOR) {
    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
    // Scalar BUILD_VECTOR operands must be at least i32.
    EVT PreExtendLegalType =
        PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
    SmallVector<SDValue, 8> NewOps;
    for (SDValue Op : BV->ops())
      NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
                                    : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
                                                           PreExtendLegalType));
    NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
  } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
    NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
                               BV.getOperand(1).isUndef()
                                   ? DAG.getUNDEF(PreExtendVT)
                                   : BV.getOperand(1).getOperand(0),
                               cast<ShuffleVectorSDNode>(BV)->getMask());
  }
  // Re-apply the extend on the narrower vector.
  return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
}

/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
  // If the value type isn't a vector, none of the operands are going to be dups
  EVT VT = Mul->getValueType(0);
  if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
    return SDValue();

  SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
  SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);

  // Neither operands have been changed, don't make any further changes
  if (!Op0 && !Op1)
    return SDValue();

  SDLoc DL(Mul);
  return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
                     Op1 ? Op1 : Mul->getOperand(1));
}

// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
// Same for other types with equivalent constants.
16785 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { 16786 EVT VT = N->getValueType(0); 16787 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 && 16788 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16) 16789 return SDValue(); 16790 if (N->getOperand(0).getOpcode() != ISD::AND || 16791 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL) 16792 return SDValue(); 16793 16794 SDValue And = N->getOperand(0); 16795 SDValue Srl = And.getOperand(0); 16796 16797 APInt V1, V2, V3; 16798 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) || 16799 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) || 16800 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3)) 16801 return SDValue(); 16802 16803 unsigned HalfSize = VT.getScalarSizeInBits() / 2; 16804 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || 16805 V3 != (HalfSize - 1)) 16806 return SDValue(); 16807 16808 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), 16809 EVT::getIntegerVT(*DAG.getContext(), HalfSize), 16810 VT.getVectorElementCount() * 2); 16811 16812 SDLoc DL(N); 16813 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0)); 16814 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In); 16815 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); 16816 } 16817 16818 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, 16819 TargetLowering::DAGCombinerInfo &DCI, 16820 const AArch64Subtarget *Subtarget) { 16821 16822 if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) 16823 return Ext; 16824 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) 16825 return Ext; 16826 16827 if (DCI.isBeforeLegalizeOps()) 16828 return SDValue(); 16829 16830 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y, 16831 // and in MachineCombiner pass, add+mul will be combined into madd. 16832 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X. 
16833 SDLoc DL(N); 16834 EVT VT = N->getValueType(0); 16835 SDValue N0 = N->getOperand(0); 16836 SDValue N1 = N->getOperand(1); 16837 SDValue MulOper; 16838 unsigned AddSubOpc; 16839 16840 auto IsAddSubWith1 = [&](SDValue V) -> bool { 16841 AddSubOpc = V->getOpcode(); 16842 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) { 16843 SDValue Opnd = V->getOperand(1); 16844 MulOper = V->getOperand(0); 16845 if (AddSubOpc == ISD::SUB) 16846 std::swap(Opnd, MulOper); 16847 if (auto C = dyn_cast<ConstantSDNode>(Opnd)) 16848 return C->isOne(); 16849 } 16850 return false; 16851 }; 16852 16853 if (IsAddSubWith1(N0)) { 16854 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper); 16855 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal); 16856 } 16857 16858 if (IsAddSubWith1(N1)) { 16859 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper); 16860 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal); 16861 } 16862 16863 // The below optimizations require a constant RHS. 16864 if (!isa<ConstantSDNode>(N1)) 16865 return SDValue(); 16866 16867 ConstantSDNode *C = cast<ConstantSDNode>(N1); 16868 const APInt &ConstValue = C->getAPIntValue(); 16869 16870 // Allow the scaling to be folded into the `cnt` instruction by preventing 16871 // the scaling to be obscured here. This makes it easier to pattern match. 16872 if (IsSVECntIntrinsic(N0) || 16873 (N0->getOpcode() == ISD::TRUNCATE && 16874 (IsSVECntIntrinsic(N0->getOperand(0))))) 16875 if (ConstValue.sge(1) && ConstValue.sle(16)) 16876 return SDValue(); 16877 16878 // Multiplication of a power of two plus/minus one can be done more 16879 // cheaply as shift+add/sub. For now, this is true unilaterally. If 16880 // future CPUs have a cheaper MADD instruction, this may need to be 16881 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and 16882 // 64-bit is 5 cycles, so this is always a win. 
16883 // More aggressively, some multiplications N0 * C can be lowered to 16884 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, 16885 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8) 16886 // TODO: lower more cases. 16887 16888 // TrailingZeroes is used to test if the mul can be lowered to 16889 // shift+add+shift. 16890 unsigned TrailingZeroes = ConstValue.countr_zero(); 16891 if (TrailingZeroes) { 16892 // Conservatively do not lower to shift+add+shift if the mul might be 16893 // folded into smul or umul. 16894 if (N0->hasOneUse() && (isSignExtended(N0, DAG) || 16895 isZeroExtended(N0, DAG))) 16896 return SDValue(); 16897 // Conservatively do not lower to shift+add+shift if the mul might be 16898 // folded into madd or msub. 16899 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || 16900 N->use_begin()->getOpcode() == ISD::SUB)) 16901 return SDValue(); 16902 } 16903 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub 16904 // and shift+add+shift. 16905 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); 16906 unsigned ShiftAmt; 16907 16908 auto Shl = [&](SDValue N0, unsigned N1) { 16909 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64); 16910 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS); 16911 }; 16912 auto Add = [&](SDValue N0, SDValue N1) { 16913 return DAG.getNode(ISD::ADD, DL, VT, N0, N1); 16914 }; 16915 auto Sub = [&](SDValue N0, SDValue N1) { 16916 return DAG.getNode(ISD::SUB, DL, VT, N0, N1); 16917 }; 16918 auto Negate = [&](SDValue N) { 16919 SDValue Zero = DAG.getConstant(0, DL, VT); 16920 return DAG.getNode(ISD::SUB, DL, VT, Zero, N); 16921 }; 16922 16923 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg: 16924 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as 16925 // the (2^N - 1) can't be execused via a single instruction. 
16926 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) { 16927 unsigned BitWidth = C.getBitWidth(); 16928 for (unsigned i = 1; i < BitWidth / 2; i++) { 16929 APInt Rem; 16930 APInt X(BitWidth, (1 << i) + 1); 16931 APInt::sdivrem(C, X, N, Rem); 16932 APInt NVMinus1 = N - 1; 16933 if (Rem == 0 && NVMinus1.isPowerOf2()) { 16934 M = X; 16935 return true; 16936 } 16937 } 16938 return false; 16939 }; 16940 16941 if (ConstValue.isNonNegative()) { 16942 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) 16943 // (mul x, 2^N - 1) => (sub (shl x, N), x) 16944 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M)) 16945 // (mul x, (2^M + 1) * (2^N + 1)) 16946 // => MV = (add (shl x, M), x); (add (shl MV, N), MV) 16947 APInt SCVMinus1 = ShiftedConstValue - 1; 16948 APInt SCVPlus1 = ShiftedConstValue + 1; 16949 APInt CVPlus1 = ConstValue + 1; 16950 APInt CVM, CVN; 16951 if (SCVMinus1.isPowerOf2()) { 16952 ShiftAmt = SCVMinus1.logBase2(); 16953 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes); 16954 } else if (CVPlus1.isPowerOf2()) { 16955 ShiftAmt = CVPlus1.logBase2(); 16956 return Sub(Shl(N0, ShiftAmt), N0); 16957 } else if (SCVPlus1.isPowerOf2()) { 16958 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; 16959 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes)); 16960 } else if (Subtarget->hasALULSLFast() && 16961 isPowPlusPlusConst(ConstValue, CVM, CVN)) { 16962 APInt CVMMinus1 = CVM - 1; 16963 APInt CVNMinus1 = CVN - 1; 16964 unsigned ShiftM1 = CVMMinus1.logBase2(); 16965 unsigned ShiftN1 = CVNMinus1.logBase2(); 16966 // LSLFast implicate that Shifts <= 3 places are fast 16967 if (ShiftM1 <= 3 && ShiftN1 <= 3) { 16968 SDValue MVal = Add(Shl(N0, ShiftM1), N0); 16969 return Add(Shl(MVal, ShiftN1), MVal); 16970 } 16971 } 16972 } else { 16973 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 16974 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 16975 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N)) 16976 APInt SCVPlus1 = 
-ShiftedConstValue + 1; 16977 APInt CVNegPlus1 = -ConstValue + 1; 16978 APInt CVNegMinus1 = -ConstValue - 1; 16979 if (CVNegPlus1.isPowerOf2()) { 16980 ShiftAmt = CVNegPlus1.logBase2(); 16981 return Sub(N0, Shl(N0, ShiftAmt)); 16982 } else if (CVNegMinus1.isPowerOf2()) { 16983 ShiftAmt = CVNegMinus1.logBase2(); 16984 return Negate(Add(Shl(N0, ShiftAmt), N0)); 16985 } else if (SCVPlus1.isPowerOf2()) { 16986 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; 16987 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt)); 16988 } 16989 } 16990 16991 return SDValue(); 16992 } 16993 16994 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 16995 SelectionDAG &DAG) { 16996 // Take advantage of vector comparisons producing 0 or -1 in each lane to 16997 // optimize away operation when it's from a constant. 16998 // 16999 // The general transformation is: 17000 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 17001 // AND(VECTOR_CMP(x,y), constant2) 17002 // constant2 = UNARYOP(constant) 17003 17004 // Early exit if this isn't a vector operation, the operand of the 17005 // unary operation isn't a bitwise AND, or if the sizes of the operations 17006 // aren't the same. 17007 EVT VT = N->getValueType(0); 17008 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 17009 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 17010 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 17011 return SDValue(); 17012 17013 // Now check that the other operand of the AND is a constant. We could 17014 // make the transformation for non-constant splats as well, but it's unclear 17015 // that would be a benefit as it would not eliminate any operations, just 17016 // perform one more step in scalar code before moving to the vector unit. 17017 if (BuildVectorSDNode *BV = 17018 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 17019 // Bail out if the vector isn't a constant. 
17020 if (!BV->isConstant()) 17021 return SDValue(); 17022 17023 // Everything checks out. Build up the new and improved node. 17024 SDLoc DL(N); 17025 EVT IntVT = BV->getValueType(0); 17026 // Create a new constant of the appropriate type for the transformed 17027 // DAG. 17028 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 17029 // The AND node needs bitcasts to/from an integer vector type around it. 17030 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 17031 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 17032 N->getOperand(0)->getOperand(0), MaskConst); 17033 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 17034 return Res; 17035 } 17036 17037 return SDValue(); 17038 } 17039 17040 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, 17041 const AArch64Subtarget *Subtarget) { 17042 // First try to optimize away the conversion when it's conditionally from 17043 // a constant. Vectors only. 17044 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) 17045 return Res; 17046 17047 EVT VT = N->getValueType(0); 17048 if (VT != MVT::f32 && VT != MVT::f64) 17049 return SDValue(); 17050 17051 // Only optimize when the source and destination types have the same width. 17052 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) 17053 return SDValue(); 17054 17055 // If the result of an integer load is only used by an integer-to-float 17056 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. 17057 // This eliminates an "integer-to-vector-move" UOP and improves throughput. 17058 SDValue N0 = N->getOperand(0); 17059 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) && 17060 N0.hasOneUse() && 17061 // Do not change the width of a volatile load. 
17062 !cast<LoadSDNode>(N0)->isVolatile()) { 17063 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 17064 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 17065 LN0->getPointerInfo(), LN0->getAlign(), 17066 LN0->getMemOperand()->getFlags()); 17067 17068 // Make sure successors of the original load stay after it by updating them 17069 // to use the new Chain. 17070 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); 17071 17072 unsigned Opcode = 17073 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; 17074 return DAG.getNode(Opcode, SDLoc(N), VT, Load); 17075 } 17076 17077 return SDValue(); 17078 } 17079 17080 /// Fold a floating-point multiply by power of two into floating-point to 17081 /// fixed-point conversion. 17082 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, 17083 TargetLowering::DAGCombinerInfo &DCI, 17084 const AArch64Subtarget *Subtarget) { 17085 if (!Subtarget->isNeonAvailable()) 17086 return SDValue(); 17087 17088 if (!N->getValueType(0).isSimple()) 17089 return SDValue(); 17090 17091 SDValue Op = N->getOperand(0); 17092 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) 17093 return SDValue(); 17094 17095 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector()) 17096 return SDValue(); 17097 17098 SDValue ConstVec = Op->getOperand(1); 17099 if (!isa<BuildVectorSDNode>(ConstVec)) 17100 return SDValue(); 17101 17102 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 17103 uint32_t FloatBits = FloatTy.getSizeInBits(); 17104 if (FloatBits != 32 && FloatBits != 64 && 17105 (FloatBits != 16 || !Subtarget->hasFullFP16())) 17106 return SDValue(); 17107 17108 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 17109 uint32_t IntBits = IntTy.getSizeInBits(); 17110 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 17111 return SDValue(); 17112 17113 // Avoid conversions where iN is larger than the float (e.g., float -> i64). 
17114 if (IntBits > FloatBits) 17115 return SDValue(); 17116 17117 BitVector UndefElements; 17118 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 17119 int32_t Bits = IntBits == 64 ? 64 : 32; 17120 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 17121 if (C == -1 || C == 0 || C > Bits) 17122 return SDValue(); 17123 17124 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger(); 17125 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy)) 17126 return SDValue(); 17127 17128 if (N->getOpcode() == ISD::FP_TO_SINT_SAT || 17129 N->getOpcode() == ISD::FP_TO_UINT_SAT) { 17130 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT(); 17131 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits) 17132 return SDValue(); 17133 } 17134 17135 SDLoc DL(N); 17136 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT || 17137 N->getOpcode() == ISD::FP_TO_SINT_SAT); 17138 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 17139 : Intrinsic::aarch64_neon_vcvtfp2fxu; 17140 SDValue FixConv = 17141 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 17142 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 17143 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 17144 // We can handle smaller integers by generating an extra trunc. 17145 if (IntBits < FloatBits) 17146 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 17147 17148 return FixConv; 17149 } 17150 17151 /// Fold a floating-point divide by power of two into fixed-point to 17152 /// floating-point conversion. 
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned Opc = Op->getOpcode();
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      !Op.getOperand(0).getValueType().isSimple() ||
      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  int32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  int32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
  if (IntBits > FloatBits)
    return SDValue();

  // The divisor must be a power-of-two splat; C is its log2 (the fixed-point
  // fraction width).
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
  if (C == -1 || C == 0 || C > FloatBits)
    return SDValue();

  // Pick the integer input type matching the lane count and float width.
  MVT ResTy;
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  switch (NumLanes) {
  default:
    return SDValue();
  case 2:
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
    break;
  case 4:
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
    break;
  }

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc DL(N);
  SDValue ConvInput = Op.getOperand(0);
  bool IsSigned = Opc == ISD::SINT_TO_FP;
  // Widen the integer input first if it is narrower than the float.
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                            ResTy, ConvInput);

  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                     DAG.getConstant(C, DL, MVT::i32));
}

// Try to turn (or (and ...) (and ...)) into an AArch64 BSP (bit select).
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               const AArch64TargetLowering &TLI) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  if (!VT.isVector())
    return SDValue();

  // The combining code currently only works for NEON vectors. In particular,
  // it does not work for SVE when dealing with vectors wider than 128 bits.
  // It also doesn't work for streaming mode because it causes generating
  // bsl instructions that are invalid in streaming mode.
  if (TLI.useSVEForFixedLengthVectorVT(
          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() != ISD::AND)
    return SDValue();

  // InstCombine does (not (neg a)) => (add a -1).
  // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
  // Loop over all combinations of AND operands.
  for (int i = 1; i >= 0; --i) {
    for (int j = 1; j >= 0; --j) {
      SDValue O0 = N0->getOperand(i);
      SDValue O1 = N1->getOperand(j);
      SDValue Sub, Add, SubSibling, AddSibling;

      // Find a SUB and an ADD operand, one from each AND.
      if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
        Sub = O0;
        Add = O1;
        SubSibling = N0->getOperand(1 - i);
        AddSibling = N1->getOperand(1 - j);
      } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
        Add = O0;
        Sub = O1;
        AddSibling = N0->getOperand(1 - i);
        SubSibling = N1->getOperand(1 - j);
      } else
        continue;

      // The SUB must be a negate: (sub 0, a).
      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
        continue;

      // Constant ones is always righthand operand of the Add.
      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
        continue;

      // Both masks must be derived from the same value 'a'.
      if (Sub.getOperand(1) != Add.getOperand(0))
        continue;

      return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
    }
  }

  // (or (and a b) (and (not a) c)) => (bsl a b c)
  // We only have to look for constant vectors here since the general, variable
  // case can be handled in TableGen.
  unsigned Bits = VT.getScalarSizeInBits();
  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
  for (int i = 1; i >= 0; --i)
    for (int j = 1; j >= 0; --j) {
      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
      if (!BVN0 || !BVN1)
        continue;

      // The two constant vectors must be element-wise complements of each
      // other (within the element's bit width).
      bool FoundMatch = true;
      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
        if (!CN0 || !CN1 ||
            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
          FoundMatch = false;
          break;
        }
      }

      if (FoundMatch)
        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
                           N0->getOperand(1 - i), N1->getOperand(1 - j));
    }

  return SDValue();
}

// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
// convert to csel(ccmp(.., cc0)), depending on cc1:

// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
// =>
// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
//
// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
// =>
// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue CSel0 = N->getOperand(0);
  SDValue CSel1 = N->getOperand(1);

  if (CSel0.getOpcode() != AArch64ISD::CSEL ||
      CSel1.getOpcode() != AArch64ISD::CSEL)
    return SDValue();

  if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
    return SDValue();

  // Both CSELs must be the CSET idiom: select between 0 and 1.
  if (!isNullConstant(CSel0.getOperand(0)) ||
      !isOneConstant(CSel0.getOperand(1)) ||
      !isNullConstant(CSel1.getOperand(0)) ||
      !isOneConstant(CSel1.getOperand(1)))
    return SDValue();

  SDValue Cmp0 = CSel0.getOperand(3);
  SDValue Cmp1 = CSel1.getOperand(3);
  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
    return SDValue();
  // Canonicalize so that Cmp1 (which becomes the CCMP) is a SUBS if either is.
  if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
      Cmp0.getOpcode() == AArch64ISD::SUBS) {
    std::swap(Cmp0, Cmp1);
    std::swap(CC0, CC1);
  }

  if (Cmp1.getOpcode() != AArch64ISD::SUBS)
    return SDValue();

  SDLoc DL(N);
  SDValue CCmp, Condition;
  unsigned NZCV;

  if (N->getOpcode() == ISD::AND) {
    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
    Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
    NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
  } else {
    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
    Condition = DAG.getConstant(CC0, DL, MVT_CC);
    NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
  }

  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);

  auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
  if (Op1 && Op1->getAPIntValue().isNegative() &&
      Op1->getAPIntValue().sgt(-32)) {
    // CCMP accepts a constant in the range [0, 31]; if Op1 is a constant in
    // the range [-31, -1], we can select CCMN instead to avoid the extra mov.
    SDValue AbsOp1 =
        DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
    CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
                       NZCVOp, Condition, Cmp0);
  } else {
    CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
                       Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
  }
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
                     CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
                     CCmp);
}

static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17392 const AArch64Subtarget *Subtarget, 17393 const AArch64TargetLowering &TLI) { 17394 SelectionDAG &DAG = DCI.DAG; 17395 EVT VT = N->getValueType(0); 17396 17397 if (SDValue R = performANDORCSELCombine(N, DAG)) 17398 return R; 17399 17400 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 17401 return SDValue(); 17402 17403 if (SDValue Res = tryCombineToBSL(N, DCI, TLI)) 17404 return Res; 17405 17406 return SDValue(); 17407 } 17408 17409 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) { 17410 if (!MemVT.getVectorElementType().isSimple()) 17411 return false; 17412 17413 uint64_t MaskForTy = 0ull; 17414 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) { 17415 case MVT::i8: 17416 MaskForTy = 0xffull; 17417 break; 17418 case MVT::i16: 17419 MaskForTy = 0xffffull; 17420 break; 17421 case MVT::i32: 17422 MaskForTy = 0xffffffffull; 17423 break; 17424 default: 17425 return false; 17426 break; 17427 } 17428 17429 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR) 17430 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0))) 17431 return Op0->getAPIntValue().getLimitedValue() == MaskForTy; 17432 17433 return false; 17434 } 17435 17436 static bool isAllInactivePredicate(SDValue N) { 17437 // Look through cast. 17438 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) 17439 N = N.getOperand(0); 17440 17441 return ISD::isConstantSplatVectorAllZeros(N.getNode()); 17442 } 17443 17444 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { 17445 unsigned NumElts = N.getValueType().getVectorMinNumElements(); 17446 17447 // Look through cast. 17448 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) { 17449 N = N.getOperand(0); 17450 // When reinterpreting from a type with fewer elements the "new" elements 17451 // are not active, so bail if they're likely to be used. 
17452 if (N.getValueType().getVectorMinNumElements() < NumElts) 17453 return false; 17454 } 17455 17456 if (ISD::isConstantSplatVectorAllOnes(N.getNode())) 17457 return true; 17458 17459 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size 17460 // or smaller than the implicit element type represented by N. 17461 // NOTE: A larger element count implies a smaller element type. 17462 if (N.getOpcode() == AArch64ISD::PTRUE && 17463 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all) 17464 return N.getValueType().getVectorMinNumElements() >= NumElts; 17465 17466 // If we're compiling for a specific vector-length, we can check if the 17467 // pattern's VL equals that of the scalable vector at runtime. 17468 if (N.getOpcode() == AArch64ISD::PTRUE) { 17469 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 17470 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); 17471 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); 17472 if (MaxSVESize && MinSVESize == MaxSVESize) { 17473 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock; 17474 unsigned PatNumElts = 17475 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0)); 17476 return PatNumElts == (NumElts * VScale); 17477 } 17478 } 17479 17480 return false; 17481 } 17482 17483 static SDValue performReinterpretCastCombine(SDNode *N) { 17484 SDValue LeafOp = SDValue(N, 0); 17485 SDValue Op = N->getOperand(0); 17486 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST && 17487 LeafOp.getValueType() != Op.getValueType()) 17488 Op = Op->getOperand(0); 17489 if (LeafOp.getValueType() == Op.getValueType()) 17490 return Op; 17491 return SDValue(); 17492 } 17493 17494 static SDValue performSVEAndCombine(SDNode *N, 17495 TargetLowering::DAGCombinerInfo &DCI) { 17496 if (DCI.isBeforeLegalizeOps()) 17497 return SDValue(); 17498 17499 SelectionDAG &DAG = DCI.DAG; 17500 SDValue Src = N->getOperand(0); 17501 unsigned Opc = Src->getOpcode(); 17502 17503 // Zero/any 
extend of an unsigned unpack 17504 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { 17505 SDValue UnpkOp = Src->getOperand(0); 17506 SDValue Dup = N->getOperand(1); 17507 17508 if (Dup.getOpcode() != ISD::SPLAT_VECTOR) 17509 return SDValue(); 17510 17511 SDLoc DL(N); 17512 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0)); 17513 if (!C) 17514 return SDValue(); 17515 17516 uint64_t ExtVal = C->getZExtValue(); 17517 17518 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool { 17519 return ((ExtVal == 0xFF && VT == MVT::i8) || 17520 (ExtVal == 0xFFFF && VT == MVT::i16) || 17521 (ExtVal == 0xFFFFFFFF && VT == MVT::i32)); 17522 }; 17523 17524 // If the mask is fully covered by the unpack, we don't need to push 17525 // a new AND onto the operand 17526 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType(); 17527 if (MaskAndTypeMatch(EltTy)) 17528 return Src; 17529 17530 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check 17531 // to see if the mask is all-ones of size MemTy. 
17532 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp); 17533 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD || 17534 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) { 17535 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType(); 17536 if (MaskAndTypeMatch(EltTy)) 17537 return Src; 17538 } 17539 17540 // Truncate to prevent a DUP with an over wide constant 17541 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits()); 17542 17543 // Otherwise, make sure we propagate the AND to the operand 17544 // of the unpack 17545 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0), 17546 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32)); 17547 17548 SDValue And = DAG.getNode(ISD::AND, DL, 17549 UnpkOp->getValueType(0), UnpkOp, Dup); 17550 17551 return DAG.getNode(Opc, DL, N->getValueType(0), And); 17552 } 17553 17554 // If both sides of AND operations are i1 splat_vectors then 17555 // we can produce just i1 splat_vector as the result. 17556 if (isAllActivePredicate(DAG, N->getOperand(0))) 17557 return N->getOperand(1); 17558 if (isAllActivePredicate(DAG, N->getOperand(1))) 17559 return N->getOperand(0); 17560 17561 if (!EnableCombineMGatherIntrinsics) 17562 return SDValue(); 17563 17564 SDValue Mask = N->getOperand(1); 17565 17566 if (!Src.hasOneUse()) 17567 return SDValue(); 17568 17569 EVT MemVT; 17570 17571 // SVE load instructions perform an implicit zero-extend, which makes them 17572 // perfect candidates for combining. 
17573 switch (Opc) { 17574 case AArch64ISD::LD1_MERGE_ZERO: 17575 case AArch64ISD::LDNF1_MERGE_ZERO: 17576 case AArch64ISD::LDFF1_MERGE_ZERO: 17577 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); 17578 break; 17579 case AArch64ISD::GLD1_MERGE_ZERO: 17580 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 17581 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 17582 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 17583 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 17584 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 17585 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 17586 case AArch64ISD::GLDFF1_MERGE_ZERO: 17587 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: 17588 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: 17589 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: 17590 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: 17591 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: 17592 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: 17593 case AArch64ISD::GLDNT1_MERGE_ZERO: 17594 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); 17595 break; 17596 default: 17597 return SDValue(); 17598 } 17599 17600 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) 17601 return Src; 17602 17603 return SDValue(); 17604 } 17605 17606 // Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d) 17607 static SDValue performANDSETCCCombine(SDNode *N, 17608 TargetLowering::DAGCombinerInfo &DCI) { 17609 17610 // This function performs an optimization on a specific pattern involving 17611 // an AND operation and SETCC (Set Condition Code) node. 

  SDValue SetCC = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  // Checks if the current node (N) is used by any SELECT instruction and
  // returns an empty SDValue to avoid applying the optimization to prevent
  // incorrect results
  for (auto U : N->uses())
    if (U->getOpcode() == ISD::SELECT)
      return SDValue();

  // Check if the operand is a SETCC node with floating-point comparison
  if (SetCC.getOpcode() == ISD::SETCC &&
      SetCC.getOperand(0).getValueType() == MVT::f32) {

    SDValue Cmp;
    AArch64CC::CondCode CC;

    // Check if the DAG is after legalization and if we can emit the conjunction
    if (!DCI.isBeforeLegalize() &&
        (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {

      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);

      // Materialise the conjunction result as CSINC(0, 0, !cc), i.e. a cset.
      SDLoc DL(N);
      return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
                         DAG.getConstant(0, DL, VT),
                         DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
    }
  }
  return SDValue();
}

/// Combine for ISD::AND: csel/ccmp fusion, fcmp conjunction fusion, SVE
/// specific folds, and finally AND-with-constant -> BIC-immediate selection
/// for NEON vectors.
static SDValue performANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (SDValue R = performANDORCSELCombine(N, DAG))
    return R;

  if (SDValue R = performANDSETCCCombine(N, DCI))
    return R;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (VT.isScalableVector())
    return performSVEAndCombine(N, DCI);

  // The combining code below works only for NEON vectors. In particular, it
  // does not work for SVE when dealing with vectors wider than 128 bits.
  if (!VT.is64BitVector() && !VT.is128BitVector())
    return SDValue();

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  if (!BVN)
    return SDValue();

  // AND does not accept an immediate, so check if we can use a BIC immediate
  // instruction instead. We do this here instead of using a (and x, (mvni imm))
  // pattern in isel, because some immediates may be lowered to the preferred
  // (and x, (movi imm)) form, even though an mvni representation also exists.
  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
    SDValue NewOp;

    // Any bits known to already be 0 need not be cleared again, which can help
    // reduce the size of the immediate to one supported by the instruction.
    KnownBits Known = DAG.computeKnownBits(LHS);
    APInt ZeroSplat(VT.getSizeInBits(), 0);
    for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
      ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
                   << (Known.Zero.getBitWidth() * I);

    DefBits = ~(DefBits | ZeroSplat);
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    DefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    DefBits, &LHS)))
      return NewOp;

    UndefBits = ~(UndefBits | ZeroSplat);
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    UndefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    UndefBits, &LHS)))
      return NewOp;
  }

  return SDValue();
}

/// Reassociation combine for FADD feeding/fed by complex multiply-add
/// intrinsics. Only fires when reassociation fast-math flags allow it.
static SDValue performFADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (!N->getFlags().hasAllowReassociation())
    return SDValue();

  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
  auto ReassocComplex = [&](SDValue A, SDValue B) {
    if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
      return SDValue();
    unsigned Opc = A.getConstantOperandVal(0);
    if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
        Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
        Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
        Opc != Intrinsic::aarch64_neon_vcmla_rot270)
      return SDValue();
    SDValue VCMLA = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
        DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
        A.getOperand(2), A.getOperand(3));
    VCMLA->setFlags(A->getFlags());
    return VCMLA;
  };
  // Try both operand orders (FADD is commutative).
  if (SDValue R = ReassocComplex(LHS, RHS))
    return R;
  if (SDValue R = ReassocComplex(RHS, LHS))
    return R;

  return SDValue();
}

/// Return true if Opcode/VT has an AArch64 pairwise-add (faddp/addp) form.
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
  switch (Opcode) {
  case ISD::STRICT_FADD:
  case ISD::FADD:
    return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
  case ISD::ADD:
    return VT == MVT::i64;
  default:
    return false;
  }
}

static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
                        AArch64CC::CondCode Cond);

/// Return true if N is an operation that sets the SVE condition flags:
/// a SETCC or one of the "while" predicate-generating intrinsics.
static bool isPredicateCCSettingOp(SDValue N) {
  if ((N.getOpcode() == ISD::SETCC) ||
      (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
       (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
        N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
        // get_active_lane_mask is lowered to a whilelo instruction.
        N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
    return true;

  return false;
}

// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
// ... into: "ptrue p, all" + PTEST
static SDValue
performFirstTrueTestVectorCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  // Make sure PTEST can be legalised with illegal types.
  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N0.getValueType();

  // Only applies to lane-0 extracts from scalable i1 (predicate) vectors.
  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
      !isNullConstant(N->getOperand(1)))
    return SDValue();

  // Restricted the DAG combine to only cases where we're extracting from a
  // flag-setting operation.
  if (!isPredicateCCSettingOp(N0))
    return SDValue();

  // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
  SelectionDAG &DAG = DCI.DAG;
  SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
}

// Materialize : Idx = (add (mul vscale, NumEls), -1)
//               i1 = extract_vector_elt t37, Constant:i64<Idx>
//     ... into: "ptrue p, all" + PTEST
static SDValue
performLastTrueTestVectorCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  // Make sure PTEST is legal for the types involved.
  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT OpVT = N0.getValueType();

  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  // Idx == (add (mul vscale, NumEls), -1)
  SDValue Idx = N->getOperand(1);
  if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
    return SDValue();

  SDValue VS = Idx.getOperand(0);
  if (VS.getOpcode() != ISD::VSCALE)
    return SDValue();

  // The VSCALE multiplier must match the predicate's minimum element count,
  // i.e. the index denotes the last lane.
  unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
  if (VS.getConstantOperandVal(0) != NumEls)
    return SDValue();

  // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
  SelectionDAG &DAG = DCI.DAG;
  SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
}

/// Combine for ISD::EXTRACT_VECTOR_ELT: SVE predicate PTEST folds, extract
/// of DUP, and the pairwise-add rewrite.
static SDValue
performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               const AArch64Subtarget *Subtarget) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
    return Res;
  if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
    return Res;

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

  EVT VT = N->getValueType(0);
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
  bool IsStrict = N0->isStrictFPOpcode();

  // extract(dup x) -> x
  if (N0.getOpcode() == AArch64ISD::DUP)
    return VT.isInteger() ?
DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
                          : N0.getOperand(0);

  // Rewrite for pairwise fadd pattern
  //   (f32 (extract_vector_elt
  //           (fadd (vXf32 Other)
  //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
  // ->
  //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
  //              (extract_vector_elt (vXf32 Other) 1))
  // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
  // we can only do this when it's used only by the extract_vector_elt.
  if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
      (!IsStrict || N0.hasOneUse())) {
    SDLoc DL(N0);
    // Strict FP ops carry the chain as operand 0; skip it.
    SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
    SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);

    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
    SDValue Other = N00;

    // And handle the commutative case.
    if (!Shuffle) {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
      Other = N01;
    }

    if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
        Other == Shuffle->getOperand(0)) {
      SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
                                     DAG.getConstant(0, DL, MVT::i64));
      SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
                                     DAG.getConstant(1, DL, MVT::i64));
      if (!IsStrict)
        return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);

      // For strict_fadd we need uses of the final extract_vector to be replaced
      // with the strict_fadd, but we also need uses of the chain output of the
      // original strict_fadd to use the chain output of the new strict_fadd as
      // otherwise it may not be deleted.
      SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
                                {VT, MVT::Other},
                                {N0->getOperand(0), Extract1, Extract2});
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

/// Collection of combines for ISD::CONCAT_VECTORS on fixed-width vectors.
static SDValue performConcatVectorsCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();

  if (VT.isScalableVector())
    return SDValue();

  // Optimize concat_vectors of truncated vectors, where the intermediate
  // type is illegal, to avoid said illegality, e.g.,
  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
  //                          (v2i16 (truncate (v2i64)))))
  // ->
  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
  //                                    (v4i32 (bitcast (v2i64))),
  //                                    <0, 2, 4, 6>)))
  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
  // on both input and result type, so we might generate worse code.
  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
      N1Opc == ISD::TRUNCATE) {
    SDValue N00 = N0->getOperand(0);
    SDValue N10 = N1->getOperand(0);
    EVT N00VT = N00.getValueType();

    if (N00VT == N10.getValueType() &&
        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
      // Select the even elements of the concatenated bitcast inputs.
      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
      for (size_t i = 0; i < Mask.size(); ++i)
        Mask[i] = i * 2;
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getVectorShuffle(
                             MidVT, dl,
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
    }
  }

  if (N->getOperand(0).getValueType() == MVT::v4i8) {
    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
    // loads to prevent having to go through the v4i8 load legalization that
    // needs to extend each element into a larger type.
    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
          if (V.getValueType() != MVT::v4i8)
            return false;
          if (V.isUndef())
            return true;
          LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
          return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
                 LD->getExtensionType() == ISD::NON_EXTLOAD;
        })) {
      EVT NVT =
          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
      SmallVector<SDValue> Ops;

      for (unsigned i = 0; i < N->getNumOperands(); i++) {
        SDValue V = N->getOperand(i);
        if (V.isUndef())
          Ops.push_back(DAG.getUNDEF(MVT::f32));
        else {
          // Re-issue each v4i8 load as an f32 load of the same memory and
          // keep the chain users pointing at the new load.
          LoadSDNode *LD = cast<LoadSDNode>(V);
          SDValue NewLoad =
              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
                          LD->getMemOperand());
          DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
          Ops.push_back(NewLoad);
        }
      }
      return DAG.getBitcast(N->getValueType(0),
                            DAG.getBuildVector(NVT, dl, Ops));
    }
  }

  // Canonicalise concat_vectors to replace concatenations of truncated nots
  // with nots of concatenated truncates. This in some cases allows for multiple
  // redundant negations to be eliminated.
  //  (concat_vectors (v4i16 (truncate (not (v4i32)))),
  //                  (v4i16 (truncate (not (v4i32)))))
  // ->
  //  (not (concat_vectors (v4i16 (truncate (v4i32))),
  //                       (v4i16 (truncate (v4i32)))))
  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
      N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
      N->isOnlyUserOf(N1.getNode())) {
    auto isBitwiseVectorNegate = [](SDValue V) {
      return V->getOpcode() == ISD::XOR &&
             ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
    };
    SDValue N00 = N0->getOperand(0);
    SDValue N10 = N1->getOperand(0);
    if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
        isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
      return DAG.getNOT(
          dl,
          DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                      DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
                                  N00->getOperand(0)),
                      DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
                                  N10->getOperand(0))),
          VT);
    }
  }

  // Wait till after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
  // extracted subvectors from the same original vectors. Combine these into a
  // single avg that operates on the two original vectors.
  // avgceil is the target independent name for rhadd, avgfloor is a hadd.
18031 // Example: 18032 // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>), 18033 // extract_subvector (v16i8 OpB, <0>))), 18034 // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>), 18035 // extract_subvector (v16i8 OpB, <8>))))) 18036 // -> 18037 // (v16i8(avgceils(v16i8 OpA, v16i8 OpB))) 18038 if (N->getNumOperands() == 2 && N0Opc == N1Opc && 18039 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS || 18040 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) { 18041 SDValue N00 = N0->getOperand(0); 18042 SDValue N01 = N0->getOperand(1); 18043 SDValue N10 = N1->getOperand(0); 18044 SDValue N11 = N1->getOperand(1); 18045 18046 EVT N00VT = N00.getValueType(); 18047 EVT N10VT = N10.getValueType(); 18048 18049 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR && 18050 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR && 18051 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR && 18052 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) { 18053 SDValue N00Source = N00->getOperand(0); 18054 SDValue N01Source = N01->getOperand(0); 18055 SDValue N10Source = N10->getOperand(0); 18056 SDValue N11Source = N11->getOperand(0); 18057 18058 if (N00Source == N10Source && N01Source == N11Source && 18059 N00Source.getValueType() == VT && N01Source.getValueType() == VT) { 18060 assert(N0.getValueType() == N1.getValueType()); 18061 18062 uint64_t N00Index = N00.getConstantOperandVal(1); 18063 uint64_t N01Index = N01.getConstantOperandVal(1); 18064 uint64_t N10Index = N10.getConstantOperandVal(1); 18065 uint64_t N11Index = N11.getConstantOperandVal(1); 18066 18067 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 && 18068 N10Index == N00VT.getVectorNumElements()) 18069 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source); 18070 } 18071 } 18072 } 18073 18074 auto IsRSHRN = [](SDValue Shr) { 18075 if (Shr.getOpcode() != AArch64ISD::VLSHR) 18076 return false; 18077 SDValue Op = Shr.getOperand(0); 18078 EVT VT = Op.getValueType(); 18079 unsigned ShtAmt = 
Shr.getConstantOperandVal(1); 18080 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD) 18081 return false; 18082 18083 APInt Imm; 18084 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift) 18085 Imm = APInt(VT.getScalarSizeInBits(), 18086 Op.getOperand(1).getConstantOperandVal(0) 18087 << Op.getOperand(1).getConstantOperandVal(1)); 18088 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP && 18089 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0))) 18090 Imm = APInt(VT.getScalarSizeInBits(), 18091 Op.getOperand(1).getConstantOperandVal(0)); 18092 else 18093 return false; 18094 18095 if (Imm != 1ULL << (ShtAmt - 1)) 18096 return false; 18097 return true; 18098 }; 18099 18100 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y)) 18101 if (N->getNumOperands() == 2 && IsRSHRN(N0) && 18102 ((IsRSHRN(N1) && 18103 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) || 18104 N1.isUndef())) { 18105 SDValue X = N0.getOperand(0).getOperand(0); 18106 SDValue Y = N1.isUndef() ? 
DAG.getUNDEF(X.getValueType()) 18107 : N1.getOperand(0).getOperand(0); 18108 EVT BVT = 18109 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); 18110 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y); 18111 SDValue Add = DAG.getNode( 18112 ISD::ADD, dl, BVT, CC, 18113 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT)); 18114 SDValue Shr = 18115 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1)); 18116 return Shr; 18117 } 18118 18119 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b) 18120 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 && 18121 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) && 18122 N0.getOperand(1) == N1.getOperand(1)) { 18123 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0), 18124 DAG.getUNDEF(N0.getValueType())); 18125 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1), 18126 DAG.getUNDEF(N0.getValueType())); 18127 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1); 18128 } 18129 18130 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 18131 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 18132 // canonicalise to that. 18133 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) { 18134 assert(VT.getScalarSizeInBits() == 64); 18135 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 18136 DAG.getConstant(0, dl, MVT::i64)); 18137 } 18138 18139 // Canonicalise concat_vectors so that the right-hand vector has as few 18140 // bit-casts as possible before its real operation. The primary matching 18141 // destination for these operations will be the narrowing "2" instructions, 18142 // which depend on the operation being performed on this right-hand vector. 
  // For example,
  //  (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
  // becomes
  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

  if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
    return SDValue();
  SDValue RHS = N1->getOperand(0);
  MVT RHSTy = RHS.getValueType().getSimpleVT();
  // If the RHS is not a vector, this is not the pattern we're looking for.
  if (!RHSTy.isVector())
    return SDValue();

  LLVM_DEBUG(
      dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

  // Rebuild the concat in the RHS's un-bitcast element type (bitcasting the
  // LHS to match), then cast the whole result back to the original type.
  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                  RHSTy.getVectorNumElements() * 2);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
                                 RHS));
}

// Fold extract_subvector(splat_vector(const)) -> splat_vector(const) for
// scalable i1 vectors. DAGCombiner has the same fold, but its legality check
// rejects it here because the non-constant case requires custom lowering.
static SDValue
performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue V = N->getOperand(0);

  // NOTE: This combine exists in DAGCombiner, but that version's legality check
  // blocks this combine because the non-const case requires custom lowering.
  //
  // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
  if (V.getOpcode() == ISD::SPLAT_VECTOR)
    if (isa<ConstantSDNode>(V.getOperand(0)))
      return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));

  return SDValue();
}

// Turn a half-width, "aligned" (low- or high-half) insert_subvector into a
// concat_vectors, which the rest of the backend matches much more readily.
static SDValue
performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  uint64_t IdxVal = N->getConstantOperandVal(2);
  EVT VecVT = Vec.getValueType();
  EVT SubVT = SubVec.getValueType();

  // Only do this for legal fixed vector types.
  if (!VecVT.isFixedLengthVector() ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
      !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
    return SDValue();

  // Ignore widening patterns.
  if (IdxVal == 0 && Vec.isUndef())
    return SDValue();

  // Subvector must be half the width and an "aligned" insertion.
  unsigned NumSubElts = SubVT.getVectorNumElements();
  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
      (IdxVal != 0 && IdxVal != NumSubElts))
    return SDValue();

  // Fold insert_subvector -> concat_vectors
  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
  SDValue Lo, Hi;
  if (IdxVal == 0) {
    Lo = SubVec;
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                     DAG.getVectorIdxConstant(NumSubElts, DL));
  } else {
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                     DAG.getVectorIdxConstant(0, DL));
    Hi = SubVec;
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
}

static SDValue tryCombineFixedPointConvert(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Wait until after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
  // Transform a scalar conversion of a value from a lane extract into a
  // lane extract of a vector conversion. E.g., from foo1 to foo2:
  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
  //
  // The second form interacts better with instruction selection and the
  // register allocator to avoid cross-class register copies that aren't
  // coalescable due to a lane reference.

  // Check the operand and see if it originates from a lane extract.
  SDValue Op1 = N->getOperand(1);
  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  // Yep, no additional predication needed. Perform the transform.
  SDValue IID = N->getOperand(0);
  SDValue Shift = N->getOperand(2);
  SDValue Vec = Op1.getOperand(0);
  SDValue Lane = Op1.getOperand(1);
  EVT ResTy = N->getValueType(0);
  EVT VecResTy;
  SDLoc DL(N);

  // The vector width should be 128 bits by the time we get here, even
  // if it started as 64 bits (the extract_vector handling will have
  // done so). Bail if it is not.
  if (Vec.getValueSizeInBits() != 128)
    return SDValue();

  if (Vec.getValueType() == MVT::v4i32)
    VecResTy = MVT::v4f32;
  else if (Vec.getValueType() == MVT::v2i64)
    VecResTy = MVT::v2f64;
  else
    return SDValue();

  SDValue Convert =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
}

// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
//   (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
//   (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  MVT VT = N.getSimpleValueType();
  // Look through a zero-index extract_subvector to the underlying DUP-like
  // node.
  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N.getConstantOperandVal(1) == 0)
    N = N.getOperand(0);

  switch (N.getOpcode()) {
  case AArch64ISD::DUP:
  case AArch64ISD::DUPLANE8:
  case AArch64ISD::DUPLANE16:
  case AArch64ISD::DUPLANE32:
  case AArch64ISD::DUPLANE64:
  case AArch64ISD::MOVI:
  case AArch64ISD::MOVIshift:
  case AArch64ISD::MOVIedit:
  case AArch64ISD::MOVImsl:
  case AArch64ISD::MVNIshift:
  case AArch64ISD::MVNImsl:
    break;
  default:
    // FMOV could be supported, but isn't very useful, as it would only occur
    // if you passed a bitcast' floating point immediate to an eligible long
    // integer op (addl, smull, ...).
    return SDValue();
  }

  if (!VT.is64BitVector())
    return SDValue();

  SDLoc DL(N);
  unsigned NumElems = VT.getVectorNumElements();
  if (N.getValueType().is64BitVector()) {
    // Re-create the DUP-like node at double the width; the extract below then
    // selects its high half.
    MVT ElementTy = VT.getVectorElementType();
    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
  }

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
                     DAG.getConstant(NumElems, DL, MVT::i64));
}

// Returns true if N (possibly looking through a bitcast) is an
// extract_subvector that takes exactly the high half of a fixed-width vector.
static bool isEssentiallyExtractHighSubvector(SDValue N) {
  if (N.getOpcode() == ISD::BITCAST)
    N = N.getOperand(0);
  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
    return false;
  if (N.getOperand(0).getValueType().isScalableVector())
    return false;
  return N.getConstantOperandAPInt(1) ==
         N.getOperand(0).getValueType().getVectorNumElements() / 2;
}

/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
  const SDValue *Opnd0;
  const SDValue *Opnd1;
  ISD::CondCode CC;
};

/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
  const SDValue *Cmp;
  AArch64CC::CondCode CC;
};

/// Helper structure to keep track of SetCC information.
union SetCCInfo {
  GenericSetCCInfo Generic;
  AArch64SetCCInfo AArch64;
};

/// Helper structure to be able to read SetCC information. If set to
/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
  SetCCInfo Info;
  bool IsAArch64;
};

/// Check whether or not \p Op is a SET_CC operation, either a generic or
/// an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straight forward.
  if (Op.getOpcode() == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }
  // Otherwise, check if this is a matching csel instruction.
  // In other words:
  // - csel 1, 0, cc
  // - csel 0, 1, !cc
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return false;
  // Set the information about the operands.
  // TODO: we want the operands of the Cmp not the csel
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  SetCCInfo.Info.AArch64.CC =
      static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));

  // Check that the operands matches the constraints:
  // (1) Both operands must be constants.
  // (2) One must be 1 and the other must be 0.
  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

  // Check (1).
  if (!TValue || !FValue)
    return false;

  // Check (2).
  if (!TValue->isOne()) {
    // Update the comparison when we are interested in !cc.
    std::swap(TValue, FValue);
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  }
  return TValue->isOne() && FValue->isZero();
}

// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
  if (isSetCC(Op, Info))
    return true;
  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
          isSetCC(Op->getOperand(0), Info));
}

// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
//   -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  SDValue LHS = Op->getOperand(0);
  SDValue RHS = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;

  // If both operands are a SET_CC, then we don't want to perform this
  // folding and create another csel as this results in more instructions
  // (and higher register usage).
  if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
      isSetCCOrZExtSetCC(RHS, InfoAndKind))
    return SDValue();

  // If neither operand is a SET_CC, give up.
  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
    std::swap(LHS, RHS);
    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
      return SDValue();
  }

  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT = InfoAndKind.IsAArch64
                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
        MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else
    // getAArch64Cmp fills in CCVal (by reference) for us.
    Cmp = getAArch64Cmp(
        *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
        dl);

  EVT VT = Op->getValueType(0);
  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}

// ADD(UADDV a, UADDV b) -->  UADDV(ADD a, b)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  // Only scalar integer and vector types.
  if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
    return SDValue();

  // Both extracts must be from lane 0 (same ConstantSDNode, so same value).
  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
    return SDValue();

  SDValue Op1 = LHS->getOperand(0);
  SDValue Op2 = RHS->getOperand(0);
  EVT OpVT1 = Op1.getValueType();
  EVT OpVT2 = Op2.getValueType();
  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
      Op2.getOpcode() != AArch64ISD::UADDV ||
      OpVT1.getVectorElementType() != VT)
    return SDValue();

  // Add the vectors first, then do a single across-vector reduction.
  SDValue Val1 = Op1.getOperand(0);
  SDValue Val2 = Op2.getOperand(0);
  EVT ValVT = Val1->getValueType(0);
  SDLoc DL(N);
  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
                     DAG.getConstant(0, DL, MVT::i64));
}

/// Perform the scalar expression combine in the form of:
///   CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
///   CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Handle commutativity.
  if (LHS.getOpcode() != AArch64ISD::CSEL &&
      LHS.getOpcode() != AArch64ISD::CSNEG) {
    std::swap(LHS, RHS);
    if (LHS.getOpcode() != AArch64ISD::CSEL &&
        LHS.getOpcode() != AArch64ISD::CSNEG) {
      return SDValue();
    }
  }

  if (!LHS.hasOneUse())
    return SDValue();

  AArch64CC::CondCode AArch64CC =
      static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));

  // The CSEL should include a const one operand, and the CSNEG should include
  // One or NegOne operand.
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
  if (!CTVal || !CFVal)
    return SDValue();

  if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
        (CTVal->isOne() || CFVal->isOne())) &&
      !(LHS.getOpcode() == AArch64ISD::CSNEG &&
        (CTVal->isOne() || CFVal->isAllOnes())))
    return SDValue();

  // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
  if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
      !CFVal->isOne()) {
    std::swap(CTVal, CFVal);
    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  }

  SDLoc DL(N);
  // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
  if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
      !CFVal->isAllOnes()) {
    APInt C = -1 * CFVal->getAPIntValue();
    CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
    CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  }

  // It might be neutral for larger constants, as the immediate needs to be
  // materialized in a register.
  APInt ADDC = CTVal->getAPIntValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
    return SDValue();

  assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
          (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
         "Unexpected constant value");

  SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
  SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
  SDValue Cmp = LHS.getOperand(3);

  return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
}

// ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (N->getOpcode() != ISD::ADD)
    return SDValue();

  SDValue Dot = N->getOperand(0);
  SDValue A = N->getOperand(1);
  // Handle commutativity.
  auto isZeroDot = [](SDValue Dot) {
    return (Dot.getOpcode() == AArch64ISD::UDOT ||
            Dot.getOpcode() == AArch64ISD::SDOT) &&
           isZerosVector(Dot.getOperand(0).getNode());
  };
  if (!isZeroDot(Dot))
    std::swap(Dot, A);
  if (!isZeroDot(Dot))
    return SDValue();

  // Use A as the dot-product accumulator in place of the zero vector.
  return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
                     Dot.getOperand(2));
}

// Returns true if Op is a (0 - x) integer negation.
static bool isNegatedInteger(SDValue Op) {
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
}

// Builds (0 - Op) with Op's type.
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
}

// Try to fold
//
// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
//
// The folding helps csel to be matched with csneg without generating
// redundant neg instruction, which includes negation of the csel expansion
// of abs node lowered by lowerABS.
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
  if (!isNegatedInteger(SDValue(N, 0)))
    return SDValue();

  SDValue CSel = N->getOperand(1);
  if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
    return SDValue();

  SDValue N0 = CSel.getOperand(0);
  SDValue N1 = CSel.getOperand(1);

  // If both of them is not negations, it's not worth the folding as it
  // introduces two additional negations while reducing one negation.
  if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
    return SDValue();

  SDValue N0N = getNegatedInteger(N0, DAG);
  SDValue N1N = getNegatedInteger(N1, DAG);

  SDLoc DL(N);
  EVT VT = CSel.getValueType();
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
                     CSel.getOperand(3));
}

// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
//      (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector()) {
    if (N->getOpcode() == ISD::ADD)
      return performSetccAddFolding(N, DAG);
    return SDValue();
  }

  // Make sure both branches are extended in the same way.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
      LHS.getOpcode() != RHS.getOpcode())
    return SDValue();

  unsigned ExtType = LHS.getOpcode();

  // It's not worth doing if at least one of the inputs isn't already an
  // extract, but we don't know which it'll be so we have to try both.
  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
    if (!RHS.getNode())
      return SDValue();

    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
    if (!LHS.getNode())
      return SDValue();

    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
  }

  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}

// Matches a SUBS node whose integer result is unused, i.e. one that only
// exists to set the condition flags (a compare).
static bool isCMP(SDValue Op) {
  return Op.getOpcode() == AArch64ISD::SUBS &&
         !Op.getNode()->hasAnyUseOfValue(0);
}

// (CSEL 1 0 CC Cond) => CC
// (CSEL 0 1 CC Cond) => !CC
static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return std::nullopt;
  auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
  if (CC == AArch64CC::AL || CC == AArch64CC::NV)
    return std::nullopt;
  SDValue OpLHS = Op.getOperand(0);
  SDValue OpRHS = Op.getOperand(1);
  if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
    return CC;
  if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
    return getInvertedCondCode(CC);

  return std::nullopt;
}

// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
  SDValue CmpOp = Op->getOperand(2);
  if (!isCMP(CmpOp))
    return SDValue();

  if (IsAdd) {
    if (!isOneConstant(CmpOp.getOperand(1)))
      return SDValue();
  } else {
    if (!isNullConstant(CmpOp.getOperand(0)))
      return SDValue();
  }

  SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
  auto CC = getCSETCondCode(CsetOp);
  if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
    return SDValue();

  // Feed the carry flag from the CSET's compare straight into the ADC/SBC.
  return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
                     Op->getOperand(0), Op->getOperand(1),
                     CsetOp.getOperand(3));
}

// (ADC x 0 cond) => (CINC x HS cond)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Cond = N->getOperand(2);

  if (!isNullConstant(RHS))
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // (CINC x cc cond) <=> (CSINC x x !cc cond)
  SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
}

// Transform vector add(zext i8 to i32, zext i8 to i32)
//  into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
// This allows extra uses of saddl/uaddl at the lower vector widths, and less
// extends.
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
      (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
       N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
      (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
       N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
      N->getOperand(0).getOperand(0).getValueType() !=
          N->getOperand(1).getOperand(0).getValueType())
    return SDValue();

  SDValue N0 = N->getOperand(0).getOperand(0);
  SDValue N1 = N->getOperand(1).getOperand(0);
  EVT InVT = N0.getValueType();

  // Only act when the extension skips at least one intermediate width, so the
  // narrower add plus the final sign-extend is profitable.
  EVT S1 = InVT.getScalarType();
  EVT S2 = VT.getScalarType();
  if ((S2 == MVT::i32 && S1 == MVT::i8) ||
      (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
    SDLoc DL(N);
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
                                  S2.getHalfSizedIntegerVT(*DAG.getContext()),
                                  VT.getVectorElementCount());
    SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
    SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
    SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
  }
  return SDValue();
}

static SDValue performBuildVectorCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // A build vector of two extracted elements is equivalent to an
  // extract subvector where the inner vector is any-extended to the
  // extract_vector_elt VT.
  //    (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
  //                  (extract_elt_iXX_to_i32 vec Idx+1))
  // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)

  // For now, only consider the v2i32 case, which arises as a result of
  // legalization.
  if (VT != MVT::v2i32)
    return SDValue();

  SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
  // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
  if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      // Constant index.
      isa<ConstantSDNode>(Elt0->getOperand(1)) &&
      isa<ConstantSDNode>(Elt1->getOperand(1)) &&
      // Both EXTRACT_VECTOR_ELT from same vector...
      Elt0->getOperand(0) == Elt1->getOperand(0) &&
      // ... and contiguous. First element's index +1 == second element's index.
      Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
      // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
      // ResultType's known minimum vector length.
      Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
    SDValue VecToExtend = Elt0->getOperand(0);
    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
      return SDValue();

    SDValue SubvectorIdx =
        DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);

    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
                       SubvectorIdx);
  }

  return SDValue();
}

// Push a truncate through a single-use DUP:
//   truncate(dup(x)) -> dup(truncate x)
static SDValue performTruncateCombine(SDNode *N,
                                      SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
      N0.getOpcode() == AArch64ISD::DUP) {
    SDValue Op = N0.getOperand(0);
    // Narrow the scalar operand when the element type shrinks from i64 to
    // i32.
    if (VT.getScalarType() == MVT::i32 &&
        N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
  }

  return SDValue();
}

// Check whether a node is an extend or shift operand.
static bool isExtendOrShiftOperand(SDValue N) {
  unsigned Opcode = N.getOpcode();
  if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
    EVT SrcVT;
    if (Opcode == ISD::SIGN_EXTEND_INREG)
      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
    else
      SrcVT = N.getOperand(0).getValueType();

    return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
  } else if (Opcode == ISD::AND) {
    // AND by a byte/halfword/word mask is equivalent to a zero-extend.
    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!CSD)
      return false;
    uint64_t AndMask = CSD->getZExtValue();
    return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
  } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
    return isa<ConstantSDNode>(N.getOperand(1));
  }

  return false;
}

// (N - Y) + Z --> (Z - Y) + N
// when N is an extend or shift operand
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
                                         SelectionDAG &DAG) {
  auto IsOneUseExtend = [](SDValue N) {
    return N.hasOneUse() && isExtendOrShiftOperand(N);
  };

  // DAGCombiner will revert the combination when Z is constant, causing an
  // infinite loop, so don't enable the combination when Z is constant.
  // If Z is a one-use shift C, we also can't do the optimization: it would
  // likewise fall into an infinite loop.
  if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
    return SDValue();

  if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
    return SDValue();

  SDValue Shift = SUB.getOperand(0);
  if (!IsOneUseExtend(Shift))
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  SDValue Y = SUB.getOperand(1);
  SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
  return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
}

static SDValue performAddCombineForShiftedOperands(SDNode *N,
                                                   SelectionDAG &DAG) {
  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
  // commutative.
  if (N->getOpcode() != ISD::ADD)
    return SDValue();

  // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
  // shifted register is only available for i32 and i64.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
    return Val;
  if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
    return Val;

  uint64_t LHSImm = 0, RHSImm = 0;
  // If both operand are shifted by imm and shift amount is not greater than 4
  // for one operand, swap LHS and RHS to put operand with smaller shift amount
  // on RHS.
  //
  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
  // with LSL (shift > 4). For the rest of processors, this is no-op for
  // performance or correctness.
  if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
      isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
      RHSImm > 4 && LHS.hasOneUse())
    return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);

  return SDValue();
}

// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
// This reassociates it back to allow the creation of more mls instructions.
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() != ISD::SUB)
    return SDValue();

  SDValue Add = N->getOperand(1);
  SDValue X = N->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  // Only reassociate when the add has a single user, so the mul values remain
  // available for mls formation.
  if (!Add.hasOneUse())
    return SDValue();
  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(X)))
    return SDValue();

  // Both addends must be (possibly widening) multiplies for this to help.
  SDValue M1 = Add.getOperand(0);
  SDValue M2 = Add.getOperand(1);
  if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
      M1.getOpcode() != AArch64ISD::UMULL)
    return SDValue();
  if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
      M2.getOpcode() != AArch64ISD::UMULL)
    return SDValue();

  // sub(x, add(m1, m2)) -> sub(sub(x, m1), m2)
  EVT VT = N->getValueType(0);
  SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
  return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
}

// Combine into mla/mls.
// This works on the patterns of:
// add v1, (mul v2, v3)
// sub v1, (mul v2, v3)
// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
// It will transform the add/sub to a scalable version, so that we can
// make use of SVE's MLA/MLS that will be generated for that pattern
static SDValue
performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  // Make sure that the types are legal
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();
  // Before using SVE's features, check first if it's available.
  if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
    return SDValue();

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
    return SDValue();

  if (!N->getValueType(0).isFixedLengthVector())
    return SDValue();

  // Try to rewrite add/sub(Op0, extract_subvector(MUL_PRED, 0)) into a
  // scalable add/sub so the predicated multiply can fold into MLA/MLS.
  auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
    if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return SDValue();

    // Only the low subvector (index 0) of the scalable multiply is handled.
    if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
      return SDValue();

    SDValue MulValue = Op1->getOperand(0);
    if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
      return SDValue();

    // The extract and the multiply must have no other users, otherwise we
    // would duplicate work rather than fold it away.
    if (!Op1.hasOneUse() || !MulValue.hasOneUse())
      return SDValue();

    EVT ScalableVT = MulValue.getValueType();
    if (!ScalableVT.isScalableVector())
      return SDValue();

    SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
    SDValue NewValue =
        DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
    return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
  };

  // ADD is commutative so try both operand orders; SUB is not.
  if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
    return res;
  else if (N->getOpcode() == ISD::ADD)
    return performOpt(N->getOperand(1), N->getOperand(0));

  return SDValue();
}

// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
// help, for example, to produce ssra from sshr+add.
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // At least one of the operands should be an extract, and the other should be
  // something that is easy to convert to v1i64 type (in this case a load).
  if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
      Op0.getOpcode() != ISD::LOAD)
    return SDValue();
  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
      Op1.getOpcode() != ISD::LOAD)
    return SDValue();

  // Keep the extracted v1i64 operand as-is and wrap the other operand in a
  // scalar_to_vector so the whole add/sub can be performed in the vector unit.
  SDLoc DL(N);
  if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Op0.getOperand(0).getValueType() == MVT::v1i64) {
    Op0 = Op0.getOperand(0);
    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
  } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
             Op1.getOperand(0).getValueType() == MVT::v1i64) {
    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
    Op1 = Op1.getOperand(0);
  } else
    return SDValue();

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
                     DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
                     DAG.getConstant(0, DL, MVT::i64));
}

// Returns true if \p B is a simple load, or a build_vector/concat/shuffle tree
// made entirely of simple loads, appending each leaf load to \p Loads.
// Returns false (with \p Loads possibly partially filled) otherwise.
static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
  SDValue BV = peekThroughOneUseBitcasts(B);
  if (!BV->hasOneUse())
    return false;
  if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
    if (!Ld || !Ld->isSimple())
      return false;
    Loads.push_back(Ld);
    return true;
  } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
             BV.getOpcode() == ISD::CONCAT_VECTORS) {
    // Every operand must itself be a one-use simple load.
    for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
      auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
      if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
        return false;
      Loads.push_back(Ld);
    }
    return true;
  } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
    // Try to find a tree of shuffles and concats from how IR shuffles of loads
    // are lowered. Note that this only comes up because we do not always visit
    // operands before uses. After that is fixed this can be removed and in the
    // meantime this is fairly specific to the lowering we expect from IR.
    // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
    // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
    // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
    // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
    // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
    // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
    // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
    // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
    // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
    if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
        B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
        B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
        B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
        B.getOperand(1).getNumOperands() != 4)
      return false;
    auto SV1 = cast<ShuffleVectorSDNode>(B);
    auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
    int NumElts = B.getValueType().getVectorNumElements();
    int NumSubElts = NumElts / 4;
    // Check both shuffle masks are the identity-concatenation pattern shown in
    // the example above, one quarter (NumSubElts) at a time.
    for (int I = 0; I < NumSubElts; I++) {
      // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
      if (SV1->getMaskElt(I) != I ||
          SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
          SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
          SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
        return false;
      // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
      if (SV2->getMaskElt(I) != I ||
          SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
          SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
        return false;
    }
    auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
    auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
    auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
    auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
    if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
        !Ld2->isSimple() || !Ld3->isSimple())
      return false;
    Loads.push_back(Ld0);
    Loads.push_back(Ld1);
    Loads.push_back(Ld2);
    Loads.push_back(Ld3);
    return true;
  }
  return false;
}

// Returns true if \p Op0 and \p Op1 are identical expression trees whose load
// leaves are consecutive in memory (Op1's loads immediately follow Op0's).
// \p NumSubLoads is in/out: 0 on first call, then fixed to the number of
// leaf loads per operand so all load groups in the tree must match in count.
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
                                            SelectionDAG &DAG,
                                            unsigned &NumSubLoads) {
  if (!Op0.hasOneUse() || !Op1.hasOneUse())
    return false;

  SmallVector<LoadSDNode *> Loads0, Loads1;
  if (isLoadOrMultipleLoads(Op0, Loads0) &&
      isLoadOrMultipleLoads(Op1, Loads1)) {
    if (NumSubLoads && Loads0.size() != NumSubLoads)
      return false;
    NumSubLoads = Loads0.size();
    // Pairwise: same width, and each Loads1[i] is the load directly after
    // Loads0[i] in memory.
    return Loads0.size() == Loads1.size() &&
           all_of(zip(Loads0, Loads1), [&DAG](auto L) {
             unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
             return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
                    DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
                                                       Size / 8, 1);
           });
  }

  if (Op0.getOpcode() != Op1.getOpcode())
    return false;

  // Recurse through a small set of structure-preserving operations.
  switch (Op0.getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
                                           DAG, NumSubLoads) &&
           areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
                                           DAG, NumSubLoads);
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
    EVT XVT = Op0.getOperand(0).getValueType();
    if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
        XVT.getScalarSizeInBits() != 32)
      return false;
    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
                                           DAG, NumSubLoads);
  }

  return false;
}

// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
// into a single load of twice the size, that we extract the bottom part and top
// part so that the shl can use a shll2 instruction. The two loads in that
// example can also be larger trees of instructions, which are identical except
// for the leaves which are all loads offset from the LHS, including
// buildvectors of multiple loads. For example the RHS tree could be
// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
// Whilst it can be common for the larger loads to replace LDP instructions
// (which doesn't gain anything on its own), the larger loads can help create
// more efficient code, and in buildvectors prevent the need for ld1 lane
// inserts which can be slower than normal loads.
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (!VT.isFixedLengthVector() ||
      (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
       VT.getScalarSizeInBits() != 64))
    return SDValue();

  // Identify the shl operand; for commutative nodes it may be on either side.
  SDValue Other = N->getOperand(0);
  SDValue Shift = N->getOperand(1);
  if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
    std::swap(Shift, Other);
  APInt ShiftAmt;
  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
      !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
    return SDValue();

  // Both sides must be one-use extends of values of the same type.
  if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
      !ISD::isExtOpcode(Other.getOpcode()) ||
      Shift.getOperand(0).getOperand(0).getValueType() !=
          Other.getOperand(0).getValueType() ||
      !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
    return SDValue();

  SDValue Op0 = Other.getOperand(0);
  SDValue Op1 = Shift.getOperand(0).getOperand(0);

  unsigned NumSubLoads = 0;
  if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
    return SDValue();

  // Attempt to rule out some unprofitable cases using heuristics (some working
  // around suboptimal code generation), notably if the extends would not be
  // able to use ushll2 instructions as the types are not large enough.
  // Otherwise zip's will need to be created which can increase the
  // instruction count.
  unsigned NumElts = Op0.getValueType().getVectorNumElements();
  unsigned NumSubElts = NumElts / NumSubLoads;
  if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
      (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
       Op0.getValueType().getSizeInBits() < 128 &&
       !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
    return SDValue();

  // Recreate the tree with the new combined loads.
  std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
      [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
        EVT DVT =
            Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());

        SmallVector<LoadSDNode *> Loads0, Loads1;
        if (isLoadOrMultipleLoads(Op0, Loads0) &&
            isLoadOrMultipleLoads(Op1, Loads1)) {
          EVT LoadVT = EVT::getVectorVT(
              *DAG.getContext(), Op0.getValueType().getScalarType(),
              Op0.getValueType().getVectorNumElements() / Loads0.size());
          EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());

          // Each pair of consecutive loads becomes one double-width load from
          // the first load's address; chain users of both originals are
          // redirected to the new load's chain.
          SmallVector<SDValue> NewLoads;
          for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
            SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
                                       L0->getBasePtr(), L0->getPointerInfo(),
                                       L0->getOriginalAlign());
            DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
            DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
            NewLoads.push_back(Load);
          }
          return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
        }

        // Not a load level: rebuild the (identical-shaped) operation on the
        // recursively combined operands at double width.
        SmallVector<SDValue> Ops;
        for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
          Ops.push_back(GenCombinedTree(O0, O1, DAG));
        return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
      };
  SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);

  // Build shuffle masks selecting the low (Op0) and high (Op1) halves of each
  // combined sub-load region.
  SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
  int Hi = NumSubElts, Lo = 0;
  for (unsigned i = 0; i < NumSubLoads; i++) {
    for (unsigned j = 0; j < NumSubElts; j++) {
      LowMask[i * NumSubElts + j] = Lo++;
      HighMask[i * NumSubElts + j] = Hi++;
    }
    Lo += NumSubElts;
    Hi += NumSubElts;
  }
  SDLoc DL(N);
  SDValue Ext0, Ext1;
  // Extract the top and bottom lanes, then extend the result. Possibly extend
  // the result then extract the lanes if the two operands match as it produces
  // slightly smaller code.
  if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
    SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
                               NewOp, DAG.getConstant(0, DL, MVT::i64));
    SDValue SubH =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
                    DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
    SDValue Extr0 =
        DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
    SDValue Extr1 =
        DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
    Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
    Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
  } else {
    EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
    SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
    SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
                               DAG.getConstant(0, DL, MVT::i64));
    SDValue SubH =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
                    DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
    Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
    Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
  }
  // Reassemble the original binop with the shift applied to the high half.
  SDValue NShift =
      DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
  return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
}

// Top-level DAG combine for ISD::ADD/ISD::SUB: tries each specialized combine
// in turn and returns the first that succeeds.
static SDValue performAddSubCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  // Try to change sum of two reductions.
  if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
    return Val;
  if (SDValue Val = performAddDotCombine(N, DCI.DAG))
    return Val;
  if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
    return Val;
  if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
    return Val;
  if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
    return Val;
  if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
    return Val;
  if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
    return Val;
  if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
    return Val;
  if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
    return Val;

  if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
    return Val;

  return performAddSubLongCombine(N, DCI);
}

// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
//   (aarch64_neon_umull (extract_high (v2i64 vec)))
//                       (extract_high (v2i64 (dup128 scalar)))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // For intrinsic nodes operand 0 is the intrinsic ID, so the data operands
  // start at index 1.
  SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
  SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
  assert(LHS.getValueType().is64BitVector() &&
         RHS.getValueType().is64BitVector() &&
         "unexpected shape for long operation");

  // Either node could be a DUP, but it's not worth doing both of them (you'd
  // just as well use the non-high version) so look for a corresponding extract
  // operation on the other "wing".
  if (isEssentiallyExtractHighSubvector(LHS)) {
    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
    if (!RHS.getNode())
      return SDValue();
  } else if (isEssentiallyExtractHighSubvector(RHS)) {
    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
    if (!LHS.getNode())
      return SDValue();
  } else
    return SDValue();

  if (IID == Intrinsic::not_intrinsic)
    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), LHS, RHS);
}

// Lower NEON shift intrinsics with a constant shift amount to the
// corresponding target-specific immediate-shift nodes (or drop a zero shift).
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
  unsigned ElemBits = ElemTy.getSizeInBits();

  // The shift amount (operand 2) must be a constant scalar or constant splat.
  int64_t ShiftAmount;
  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, ElemBits) ||
        SplatBitSize != ElemBits)
      return SDValue();

    ShiftAmount = SplatValue.getSExtValue();
  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    ShiftAmount = CVN->getSExtValue();
  } else
    return SDValue();

  // If the shift amount is zero, remove the shift intrinsic.
  if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
    return N->getOperand(1);

  unsigned Opcode;
  bool IsRightShift;
  switch (IID) {
  default:
    llvm_unreachable("Unknown shift intrinsic");
  case Intrinsic::aarch64_neon_sqshl:
    Opcode = AArch64ISD::SQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_uqshl:
    Opcode = AArch64ISD::UQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_srshl:
    Opcode = AArch64ISD::SRSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_urshl:
    Opcode = AArch64ISD::URSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_sqshlu:
    Opcode = AArch64ISD::SQSHLU_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_sshl:
  case Intrinsic::aarch64_neon_ushl:
    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
    // left shift for positive shift amounts. For negative shifts we can use a
    // VASHR/VLSHR as appropriate.
    if (ShiftAmount < 0) {
      Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
                                                   : AArch64ISD::VLSHR;
      ShiftAmount = -ShiftAmount;
    } else
      Opcode = AArch64ISD::VSHL;
    IsRightShift = false;
    break;
  }

  // Scalar i64 shifts are performed in a v1i64 vector register and the result
  // extracted back out.
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(1);
  SDLoc dl(N);
  if (VT == MVT::i64) {
    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
    VT = MVT::v1i64;
  }

  // Only emit the immediate form when the amount is in the valid encoding
  // range: [1, ElemBits] for right shifts, [0, ElemBits) for left shifts.
  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
    Op = DAG.getNode(Opcode, dl, VT, Op,
                     DAG.getConstant(-ShiftAmount, dl, MVT::i32));
    if (N->getValueType(0) == MVT::i64)
      Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
                       DAG.getConstant(0, dl, MVT::i64));
    return Op;
  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
    Op = DAG.getNode(Opcode, dl, VT, Op,
                     DAG.getConstant(ShiftAmount, dl, MVT::i32));
    if (N->getValueType(0) == MVT::i64)
      Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
                       DAG.getConstant(0, dl, MVT::i64));
    return Op;
  }

  return SDValue();
}

// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
  // Operand 2 is the data operand; look for (and x, Mask) feeding it.
  SDValue AndN = N->getOperand(2);
  if (AndN.getOpcode() != ISD::AND)
    return SDValue();

  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
  if (!CMask || CMask->getZExtValue() != Mask)
    return SDValue();

  // Rebuild the intrinsic with the un-masked data operand.
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}

// Lower an across-vector-lanes reduction intrinsic to the target node \p Opc,
// extracting lane 0 of the vector result as the scalar value.
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
                     DAG.getNode(Opc, dl,
                                 N->getOperand(1).getSimpleValueType(),
                                 N->getOperand(1)),
                     DAG.getConstant(0, dl, MVT::i64));
}

// Lower aarch64_sve_index(base, step) to generic ISD nodes.
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op1 = N->getOperand(1);
  SDValue Op2 = N->getOperand(2);
  EVT ScalarTy = Op2.getValueType();
  // NOTE(review): ScalarTy is computed but not used below — looks like dead
  // code left from an earlier lowering; confirm before removing.
  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
    ScalarTy = MVT::i32;

  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
  SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
  SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
  SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
  SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
}

// Lower aarch64_sve_dup(passthru, pred, scalar) to DUP_MERGE_PASSTHRU.
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  SDValue Scalar = N->getOperand(3);
  EVT ScalarTy = Scalar.getValueType();

  // Sub-i32 scalars are widened; only the low bits are significant for a dup.
  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

  SDValue Passthru = N->getOperand(1);
  SDValue Pred = N->getOperand(2);
  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
                     Pred, Scalar, Passthru);
}

// Lower aarch64_sve_ext(op1, op2, index) to the byte-based AArch64ISD::EXT
// node, bitcasting operands to bytes and scaling the index by element size.
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  LLVMContext &Ctx = *DAG.getContext();
  EVT VT = N->getValueType(0);

  assert(VT.isScalableVector() && "Expected a scalable vector.");

  // Current lowering only supports the SVE-ACLE types.
  if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
    return SDValue();

  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
  unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
  EVT ByteVT =
      EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));

  // Convert everything to the domain of EXT (i.e bytes).
  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
  SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
                            DAG.getConstant(ElemSize, dl, MVT::i32));

  SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
  return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}

// Convert an SVE wide-compare intrinsic whose comparator is a splat of an
// in-range immediate into a SETCC_MERGE_ZERO against a splat of that
// immediate in the (narrower) compared element type.
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDValue Comparator = N->getOperand(3);
  if (Comparator.getOpcode() == AArch64ISD::DUP ||
      Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
    unsigned IID = getIntrinsicID(N);
    EVT VT = N->getValueType(0);
    EVT CmpVT = N->getOperand(2).getValueType();
    SDValue Pred = N->getOperand(1);
    SDValue Imm;
    SDLoc DL(N);

    switch (IID) {
    default:
      llvm_unreachable("Called with wrong intrinsic!");
      break;

    // Signed comparisons
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide: {
      // Signed immediates must fit the 5-bit encoding range [-16, 15].
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        int64_t ImmVal = CN->getSExtValue();
        if (ImmVal >= -16 && ImmVal <= 15)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    // Unsigned comparisons
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide: {
      // Unsigned immediates must fit the 7-bit encoding range [0, 127].
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        uint64_t ImmVal = CN->getZExtValue();
        if (ImmVal <= 127)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    }

    if (!Imm)
      return SDValue();

    SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
                       N->getOperand(2), Splat, DAG.getCondCode(CC));
  }

  return SDValue();
}

// Emit a PTEST of predicate \p Op under governing predicate \p Pg and
// materialize the requested condition \p Cond as a 0/1 value of type \p VT.
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
                        AArch64CC::CondCode Cond) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc DL(Op);
  assert(Op.getValueType().isScalableVector() &&
         TLI.isTypeLegal(Op.getValueType()) &&
         "Expected legal scalable vector type!");
  assert(Op.getValueType() == Pg.getValueType() &&
         "Expected same type for PTEST operands");

  // Ensure target specific opcodes are using legal type.
  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  SDValue TVal = DAG.getConstant(1, DL, OutVT);
  SDValue FVal = DAG.getConstant(0, DL, OutVT);

  // Ensure operands have type nxv16i1.
  if (Op.getValueType() != MVT::nxv16i1) {
    // For ANY/NONE tests on an operand whose inactive lanes are known zero, a
    // plain reinterpret of the governing predicate is sufficient.
    if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
        isZeroingInactiveLanes(Op))
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
    else
      Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
  }

  // Set condition code (CC) flags.
  SDValue Test = DAG.getNode(
      Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
      DL, MVT::Other, Pg, Op);

  // Convert CC to integer based on requested condition.
  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
  SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
  SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
  return DAG.getZExtOrTrunc(Res, DL, VT);
}

// Lower a predicated SVE integer reduction intrinsic to the target node
// \p Opc and extract lane 0 as the scalar result.
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
                                      SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  // NOTE: The integer reduction's result type is not always linked to the
  // operand's element type so we construct it from the intrinsic's result type.
  EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

// Lower a predicated SVE floating-point reduction intrinsic to the target
// node \p Opc and extract lane 0 as the scalar result.
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
                                     SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  EVT ReduceVT = VecToReduce.getValueType();
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

// Lower an ordered (strict sequential) SVE floating-point reduction
// intrinsic to the target node \p Opc, seeding lane 0 with the initial value.
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                                            SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue InitVal = N->getOperand(2);
  SDValue VecToReduce = N->getOperand(3);
  EVT ReduceVT = VecToReduce.getValueType();

  // Ordered reductions use the first lane of the result vector as the
  // reduction's initial value.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
                        DAG.getUNDEF(ReduceVT), InitVal, Zero);

  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
                                       SelectionDAG &DAG, bool UnpredOp = false,
                                       bool SwapOperands = false) {
  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
  assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
  SDValue Pg = N->getOperand(1);
  SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
  SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);

  // ISD way to specify an all active predicate.
19748 if (isAllActivePredicate(DAG, Pg)) { 19749 if (UnpredOp) 19750 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2); 19751 19752 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2); 19753 } 19754 19755 // FUTURE: SplatVector(true) 19756 return SDValue(); 19757 } 19758 19759 static SDValue performIntrinsicCombine(SDNode *N, 19760 TargetLowering::DAGCombinerInfo &DCI, 19761 const AArch64Subtarget *Subtarget) { 19762 SelectionDAG &DAG = DCI.DAG; 19763 unsigned IID = getIntrinsicID(N); 19764 switch (IID) { 19765 default: 19766 break; 19767 case Intrinsic::get_active_lane_mask: { 19768 SDValue Res = SDValue(); 19769 EVT VT = N->getValueType(0); 19770 if (VT.isFixedLengthVector()) { 19771 // We can use the SVE whilelo instruction to lower this intrinsic by 19772 // creating the appropriate sequence of scalable vector operations and 19773 // then extracting a fixed-width subvector from the scalable vector. 19774 19775 SDLoc DL(N); 19776 SDValue ID = 19777 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); 19778 19779 EVT WhileVT = EVT::getVectorVT( 19780 *DAG.getContext(), MVT::i1, 19781 ElementCount::getScalable(VT.getVectorNumElements())); 19782 19783 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32. 19784 EVT PromVT = getPromotedVTForPredicate(WhileVT); 19785 19786 // Get the fixed-width equivalent of PromVT for extraction. 
19787 EVT ExtVT = 19788 EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(), 19789 VT.getVectorElementCount()); 19790 19791 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID, 19792 N->getOperand(1), N->getOperand(2)); 19793 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res); 19794 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res, 19795 DAG.getConstant(0, DL, MVT::i64)); 19796 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res); 19797 } 19798 return Res; 19799 } 19800 case Intrinsic::aarch64_neon_vcvtfxs2fp: 19801 case Intrinsic::aarch64_neon_vcvtfxu2fp: 19802 return tryCombineFixedPointConvert(N, DCI, DAG); 19803 case Intrinsic::aarch64_neon_saddv: 19804 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 19805 case Intrinsic::aarch64_neon_uaddv: 19806 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 19807 case Intrinsic::aarch64_neon_sminv: 19808 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 19809 case Intrinsic::aarch64_neon_uminv: 19810 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 19811 case Intrinsic::aarch64_neon_smaxv: 19812 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 19813 case Intrinsic::aarch64_neon_umaxv: 19814 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 19815 case Intrinsic::aarch64_neon_fmax: 19816 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), 19817 N->getOperand(1), N->getOperand(2)); 19818 case Intrinsic::aarch64_neon_fmin: 19819 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), 19820 N->getOperand(1), N->getOperand(2)); 19821 case Intrinsic::aarch64_neon_fmaxnm: 19822 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 19823 N->getOperand(1), N->getOperand(2)); 19824 case Intrinsic::aarch64_neon_fminnm: 19825 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 19826 N->getOperand(1), N->getOperand(2)); 19827 case Intrinsic::aarch64_neon_smull: 19828 return 
DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0), 19829 N->getOperand(1), N->getOperand(2)); 19830 case Intrinsic::aarch64_neon_umull: 19831 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0), 19832 N->getOperand(1), N->getOperand(2)); 19833 case Intrinsic::aarch64_neon_pmull: 19834 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0), 19835 N->getOperand(1), N->getOperand(2)); 19836 case Intrinsic::aarch64_neon_sqdmull: 19837 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 19838 case Intrinsic::aarch64_neon_sqshl: 19839 case Intrinsic::aarch64_neon_uqshl: 19840 case Intrinsic::aarch64_neon_sqshlu: 19841 case Intrinsic::aarch64_neon_srshl: 19842 case Intrinsic::aarch64_neon_urshl: 19843 case Intrinsic::aarch64_neon_sshl: 19844 case Intrinsic::aarch64_neon_ushl: 19845 return tryCombineShiftImm(IID, N, DAG); 19846 case Intrinsic::aarch64_neon_sabd: 19847 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0), 19848 N->getOperand(1), N->getOperand(2)); 19849 case Intrinsic::aarch64_neon_uabd: 19850 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), 19851 N->getOperand(1), N->getOperand(2)); 19852 case Intrinsic::aarch64_crc32b: 19853 case Intrinsic::aarch64_crc32cb: 19854 return tryCombineCRC32(0xff, N, DAG); 19855 case Intrinsic::aarch64_crc32h: 19856 case Intrinsic::aarch64_crc32ch: 19857 return tryCombineCRC32(0xffff, N, DAG); 19858 case Intrinsic::aarch64_sve_saddv: 19859 // There is no i64 version of SADDV because the sign is irrelevant. 
19860 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) 19861 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); 19862 else 19863 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); 19864 case Intrinsic::aarch64_sve_uaddv: 19865 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); 19866 case Intrinsic::aarch64_sve_smaxv: 19867 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG); 19868 case Intrinsic::aarch64_sve_umaxv: 19869 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG); 19870 case Intrinsic::aarch64_sve_sminv: 19871 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG); 19872 case Intrinsic::aarch64_sve_uminv: 19873 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG); 19874 case Intrinsic::aarch64_sve_orv: 19875 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG); 19876 case Intrinsic::aarch64_sve_eorv: 19877 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG); 19878 case Intrinsic::aarch64_sve_andv: 19879 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG); 19880 case Intrinsic::aarch64_sve_index: 19881 return LowerSVEIntrinsicIndex(N, DAG); 19882 case Intrinsic::aarch64_sve_dup: 19883 return LowerSVEIntrinsicDUP(N, DAG); 19884 case Intrinsic::aarch64_sve_dup_x: 19885 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0), 19886 N->getOperand(1)); 19887 case Intrinsic::aarch64_sve_ext: 19888 return LowerSVEIntrinsicEXT(N, DAG); 19889 case Intrinsic::aarch64_sve_mul_u: 19890 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0), 19891 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19892 case Intrinsic::aarch64_sve_smulh_u: 19893 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0), 19894 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19895 case Intrinsic::aarch64_sve_umulh_u: 19896 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0), 19897 N->getOperand(1), 
N->getOperand(2), N->getOperand(3)); 19898 case Intrinsic::aarch64_sve_smin_u: 19899 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0), 19900 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19901 case Intrinsic::aarch64_sve_umin_u: 19902 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0), 19903 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19904 case Intrinsic::aarch64_sve_smax_u: 19905 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0), 19906 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19907 case Intrinsic::aarch64_sve_umax_u: 19908 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0), 19909 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19910 case Intrinsic::aarch64_sve_lsl_u: 19911 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0), 19912 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19913 case Intrinsic::aarch64_sve_lsr_u: 19914 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0), 19915 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19916 case Intrinsic::aarch64_sve_asr_u: 19917 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0), 19918 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19919 case Intrinsic::aarch64_sve_fadd_u: 19920 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0), 19921 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19922 case Intrinsic::aarch64_sve_fdiv_u: 19923 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0), 19924 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19925 case Intrinsic::aarch64_sve_fmax_u: 19926 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0), 19927 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19928 case Intrinsic::aarch64_sve_fmaxnm_u: 19929 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0), 19930 N->getOperand(1), 
N->getOperand(2), N->getOperand(3)); 19931 case Intrinsic::aarch64_sve_fmla_u: 19932 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0), 19933 N->getOperand(1), N->getOperand(3), N->getOperand(4), 19934 N->getOperand(2)); 19935 case Intrinsic::aarch64_sve_fmin_u: 19936 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0), 19937 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19938 case Intrinsic::aarch64_sve_fminnm_u: 19939 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0), 19940 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19941 case Intrinsic::aarch64_sve_fmul_u: 19942 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0), 19943 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19944 case Intrinsic::aarch64_sve_fsub_u: 19945 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0), 19946 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19947 case Intrinsic::aarch64_sve_add_u: 19948 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2), 19949 N->getOperand(3)); 19950 case Intrinsic::aarch64_sve_sub_u: 19951 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2), 19952 N->getOperand(3)); 19953 case Intrinsic::aarch64_sve_subr: 19954 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true); 19955 case Intrinsic::aarch64_sve_and_u: 19956 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2), 19957 N->getOperand(3)); 19958 case Intrinsic::aarch64_sve_bic_u: 19959 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0), 19960 N->getOperand(2), N->getOperand(3)); 19961 case Intrinsic::aarch64_sve_eor_u: 19962 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2), 19963 N->getOperand(3)); 19964 case Intrinsic::aarch64_sve_orr_u: 19965 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2), 19966 N->getOperand(3)); 19967 case 
Intrinsic::aarch64_sve_sabd_u: 19968 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0), 19969 N->getOperand(2), N->getOperand(3)); 19970 case Intrinsic::aarch64_sve_uabd_u: 19971 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), 19972 N->getOperand(2), N->getOperand(3)); 19973 case Intrinsic::aarch64_sve_sdiv_u: 19974 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0), 19975 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19976 case Intrinsic::aarch64_sve_udiv_u: 19977 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0), 19978 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 19979 case Intrinsic::aarch64_sve_sqadd: 19980 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true); 19981 case Intrinsic::aarch64_sve_sqsub_u: 19982 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), 19983 N->getOperand(2), N->getOperand(3)); 19984 case Intrinsic::aarch64_sve_uqadd: 19985 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true); 19986 case Intrinsic::aarch64_sve_uqsub_u: 19987 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), 19988 N->getOperand(2), N->getOperand(3)); 19989 case Intrinsic::aarch64_sve_sqadd_x: 19990 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0), 19991 N->getOperand(1), N->getOperand(2)); 19992 case Intrinsic::aarch64_sve_sqsub_x: 19993 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0), 19994 N->getOperand(1), N->getOperand(2)); 19995 case Intrinsic::aarch64_sve_uqadd_x: 19996 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0), 19997 N->getOperand(1), N->getOperand(2)); 19998 case Intrinsic::aarch64_sve_uqsub_x: 19999 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0), 20000 N->getOperand(1), N->getOperand(2)); 20001 case Intrinsic::aarch64_sve_asrd: 20002 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0), 20003 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 20004 case 
Intrinsic::aarch64_sve_cmphs: 20005 if (!N->getOperand(2).getValueType().isFloatingPoint()) 20006 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20007 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20008 N->getOperand(3), DAG.getCondCode(ISD::SETUGE)); 20009 break; 20010 case Intrinsic::aarch64_sve_cmphi: 20011 if (!N->getOperand(2).getValueType().isFloatingPoint()) 20012 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20013 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20014 N->getOperand(3), DAG.getCondCode(ISD::SETUGT)); 20015 break; 20016 case Intrinsic::aarch64_sve_fcmpge: 20017 case Intrinsic::aarch64_sve_cmpge: 20018 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20019 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20020 N->getOperand(3), DAG.getCondCode(ISD::SETGE)); 20021 break; 20022 case Intrinsic::aarch64_sve_fcmpgt: 20023 case Intrinsic::aarch64_sve_cmpgt: 20024 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20025 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20026 N->getOperand(3), DAG.getCondCode(ISD::SETGT)); 20027 break; 20028 case Intrinsic::aarch64_sve_fcmpeq: 20029 case Intrinsic::aarch64_sve_cmpeq: 20030 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20031 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20032 N->getOperand(3), DAG.getCondCode(ISD::SETEQ)); 20033 break; 20034 case Intrinsic::aarch64_sve_fcmpne: 20035 case Intrinsic::aarch64_sve_cmpne: 20036 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20037 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20038 N->getOperand(3), DAG.getCondCode(ISD::SETNE)); 20039 break; 20040 case Intrinsic::aarch64_sve_fcmpuo: 20041 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), 20042 N->getValueType(0), N->getOperand(1), N->getOperand(2), 20043 N->getOperand(3), DAG.getCondCode(ISD::SETUO)); 20044 break; 20045 case Intrinsic::aarch64_sve_fadda: 20046 return 
combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); 20047 case Intrinsic::aarch64_sve_faddv: 20048 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); 20049 case Intrinsic::aarch64_sve_fmaxnmv: 20050 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); 20051 case Intrinsic::aarch64_sve_fmaxv: 20052 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); 20053 case Intrinsic::aarch64_sve_fminnmv: 20054 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); 20055 case Intrinsic::aarch64_sve_fminv: 20056 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); 20057 case Intrinsic::aarch64_sve_sel: 20058 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), 20059 N->getOperand(1), N->getOperand(2), N->getOperand(3)); 20060 case Intrinsic::aarch64_sve_cmpeq_wide: 20061 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG); 20062 case Intrinsic::aarch64_sve_cmpne_wide: 20063 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG); 20064 case Intrinsic::aarch64_sve_cmpge_wide: 20065 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG); 20066 case Intrinsic::aarch64_sve_cmpgt_wide: 20067 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG); 20068 case Intrinsic::aarch64_sve_cmplt_wide: 20069 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG); 20070 case Intrinsic::aarch64_sve_cmple_wide: 20071 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG); 20072 case Intrinsic::aarch64_sve_cmphs_wide: 20073 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG); 20074 case Intrinsic::aarch64_sve_cmphi_wide: 20075 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG); 20076 case Intrinsic::aarch64_sve_cmplo_wide: 20077 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG); 20078 case Intrinsic::aarch64_sve_cmpls_wide: 20079 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG); 20080 case Intrinsic::aarch64_sve_ptest_any: 20081 return getPTest(DAG, N->getValueType(0), N->getOperand(1), 
N->getOperand(2), 20082 AArch64CC::ANY_ACTIVE); 20083 case Intrinsic::aarch64_sve_ptest_first: 20084 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 20085 AArch64CC::FIRST_ACTIVE); 20086 case Intrinsic::aarch64_sve_ptest_last: 20087 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), 20088 AArch64CC::LAST_ACTIVE); 20089 } 20090 return SDValue(); 20091 } 20092 20093 static bool isCheapToExtend(const SDValue &N) { 20094 unsigned OC = N->getOpcode(); 20095 return OC == ISD::LOAD || OC == ISD::MLOAD || 20096 ISD::isConstantSplatVectorAllZeros(N.getNode()); 20097 } 20098 20099 static SDValue 20100 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 20101 SelectionDAG &DAG) { 20102 // If we have (sext (setcc A B)) and A and B are cheap to extend, 20103 // we can move the sext into the arguments and have the same result. For 20104 // example, if A and B are both loads, we can make those extending loads and 20105 // avoid an extra instruction. This pattern appears often in VLS code 20106 // generation where the inputs to the setcc have a different size to the 20107 // instruction that wants to use the result of the setcc. 20108 assert(N->getOpcode() == ISD::SIGN_EXTEND && 20109 N->getOperand(0)->getOpcode() == ISD::SETCC); 20110 const SDValue SetCC = N->getOperand(0); 20111 20112 const SDValue CCOp0 = SetCC.getOperand(0); 20113 const SDValue CCOp1 = SetCC.getOperand(1); 20114 if (!CCOp0->getValueType(0).isInteger() || 20115 !CCOp1->getValueType(0).isInteger()) 20116 return SDValue(); 20117 20118 ISD::CondCode Code = 20119 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get(); 20120 20121 ISD::NodeType ExtType = 20122 isSignedIntSetCC(Code) ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 20123 20124 if (isCheapToExtend(SetCC.getOperand(0)) && 20125 isCheapToExtend(SetCC.getOperand(1))) { 20126 const SDValue Ext1 = 20127 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0); 20128 const SDValue Ext2 = 20129 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1); 20130 20131 return DAG.getSetCC( 20132 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2, 20133 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get()); 20134 } 20135 20136 return SDValue(); 20137 } 20138 20139 static SDValue performExtendCombine(SDNode *N, 20140 TargetLowering::DAGCombinerInfo &DCI, 20141 SelectionDAG &DAG) { 20142 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 20143 // we can convert that DUP into another extract_high (of a bigger DUP), which 20144 // helps the backend to decide that an sabdl2 would be useful, saving a real 20145 // extract_high operation. 20146 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 20147 (N->getOperand(0).getOpcode() == ISD::ABDU || 20148 N->getOperand(0).getOpcode() == ISD::ABDS)) { 20149 SDNode *ABDNode = N->getOperand(0).getNode(); 20150 SDValue NewABD = 20151 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG); 20152 if (!NewABD.getNode()) 20153 return SDValue(); 20154 20155 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); 20156 } 20157 20158 if (N->getValueType(0).isFixedLengthVector() && 20159 N->getOpcode() == ISD::SIGN_EXTEND && 20160 N->getOperand(0)->getOpcode() == ISD::SETCC) 20161 return performSignExtendSetCCCombine(N, DCI, DAG); 20162 20163 return SDValue(); 20164 } 20165 20166 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, 20167 SDValue SplatVal, unsigned NumVecElts) { 20168 assert(!St.isTruncatingStore() && "cannot split truncating vector store"); 20169 Align OrigAlignment = St.getAlign(); 20170 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; 20171 
20172 // Create scalar stores. This is at least as good as the code sequence for a 20173 // split unaligned store which is a dup.s, ext.b, and two stores. 20174 // Most of the time the three stores should be replaced by store pair 20175 // instructions (stp). 20176 SDLoc DL(&St); 20177 SDValue BasePtr = St.getBasePtr(); 20178 uint64_t BaseOffset = 0; 20179 20180 const MachinePointerInfo &PtrInfo = St.getPointerInfo(); 20181 SDValue NewST1 = 20182 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, 20183 OrigAlignment, St.getMemOperand()->getFlags()); 20184 20185 // As this in ISel, we will not merge this add which may degrade results. 20186 if (BasePtr->getOpcode() == ISD::ADD && 20187 isa<ConstantSDNode>(BasePtr->getOperand(1))) { 20188 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); 20189 BasePtr = BasePtr->getOperand(0); 20190 } 20191 20192 unsigned Offset = EltOffset; 20193 while (--NumVecElts) { 20194 Align Alignment = commonAlignment(OrigAlignment, Offset); 20195 SDValue OffsetPtr = 20196 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 20197 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); 20198 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 20199 PtrInfo.getWithOffset(Offset), Alignment, 20200 St.getMemOperand()->getFlags()); 20201 Offset += EltOffset; 20202 } 20203 return NewST1; 20204 } 20205 20206 // Returns an SVE type that ContentTy can be trivially sign or zero extended 20207 // into. 
static MVT getSVEContainerType(EVT ContentTy) {
  assert(ContentTy.isSimple() && "No SVE containers for extended types");

  switch (ContentTy.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("No known SVE container for this MVT type");
  case MVT::nxv2i8:
  case MVT::nxv2i16:
  case MVT::nxv2i32:
  case MVT::nxv2i64:
  case MVT::nxv2f32:
  case MVT::nxv2f64:
    return MVT::nxv2i64;
  case MVT::nxv4i8:
  case MVT::nxv4i16:
  case MVT::nxv4i32:
  case MVT::nxv4f32:
    return MVT::nxv4i32;
  case MVT::nxv8i8:
  case MVT::nxv8i16:
  case MVT::nxv8f16:
  case MVT::nxv8bf16:
    return MVT::nxv8i16;
  case MVT::nxv16i8:
    return MVT::nxv16i8;
  }
}

// Lower an SVE ld1 intrinsic to a predicated load node with opcode \p Opc.
// Integer results narrower than their SVE container are loaded in the
// container type and truncated back to the requested type.
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // Bail out for results wider than a single SVE vector register.
  if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
    return SDValue();

  EVT ContainerVT = VT;
  if (ContainerVT.isInteger())
    ContainerVT = getSVEContainerType(ContainerVT);

  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
  SDValue Ops[] = { N->getOperand(0), // Chain
                    N->getOperand(2), // Pg
                    N->getOperand(3), // Base
                    DAG.getValueType(VT) };

  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  // Truncate a widened integer result back to the type the intrinsic asked
  // for.
  if (ContainerVT.isInteger() && (VT != ContainerVT))
    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({ Load, LoadChain }, DL);
}

// Lower an SVE ldnt1 (non-temporal load) intrinsic to a masked load.
// Floating-point values are loaded as integers and bitcast back.
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT PtrTy = N->getOperand(3).getValueType();

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  // Inactive lanes become zero via the zero pass-through value.
  SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
  SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
                                MINode->getOperand(3), DAG.getUNDEF(PtrTy),
                                MINode->getOperand(2), PassThru,
                                MINode->getMemoryVT(), MINode->getMemOperand(),
                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);

  if (VT.isFloatingPoint()) {
    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }

  return L;
}

// Lower an ld1rq/ld1ro replicating-load intrinsic to the matching
// AArch64ISD merge-zero load node; FP types go through an integer load plus
// bitcast.
template <unsigned Opcode>
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
                "Unsupported opcode.");
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (VT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}

// Lower an SVE st1 intrinsic to an ST1_PRED node, widening (any-extend) or
// bitcasting the stored value into its SVE container type first.
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT HwSrcVt = getSVEContainerType(DataVT);
  SDValue InputVT = DAG.getValueType(DataVT);

  // For FP data the memory VT recorded on the node is the integer container
  // type, matching the bitcast below.
  if (DataVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDValue SrcNew;
  if (Data.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);

  SDValue Ops[] = { N->getOperand(0), // Chain
                    SrcNew,
                    N->getOperand(4), // Base
                    N->getOperand(3), // Pg
                    InputVT
                  };

  return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}

// Lower an SVE stnt1 (non-temporal store) intrinsic to a masked store;
// floating-point data is bitcast to the equivalent integer type first.
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT PtrTy = N->getOperand(4).getValueType();

  if (DataVT.isFloatingPoint())
    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
                            DAG.getUNDEF(PtrTy), MINode->getOperand(3),
                            MINode->getMemoryVT(), MINode->getMemOperand(),
                            ISD::UNINDEXED, false, false);
}

/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Avoid scalarizing zero splat stores for scalable vectors.
  if (VT.isScalableVector())
    return SDValue();

  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
  // 2, 3 or 4 i32 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
         VT.getVectorElementType().getSizeInBits() == 64) ||
        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
         VT.getVectorElementType().getSizeInBits() == 32)))
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  // Every element of the build_vector must be a zero (integer or FP).
  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
      return SDValue();
  }

  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  unsigned ZeroReg;
  EVT ZeroVT;
  if (VT.getVectorElementType().getSizeInBits() == 32) {
    ZeroReg = AArch64::WZR;
    ZeroVT = MVT::i32;
  } else {
    ZeroReg = AArch64::XZR;
    ZeroVT = MVT::i64;
  }
  SDValue SplatVal =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}

/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Don't replace floating point stores, they possibly won't be transformed to
  // stp because of the store pair suppress pass.
  if (VT.isFloatingPoint())
    return SDValue();

  // We can express a splat as store pair(s) for 2 or 4 elements.
  unsigned NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // Check that this is a splat.
  // Make sure that each of the relevant vector element locations are inserted
  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
  SDValue SplatVal;
  for (unsigned I = 0; I < NumVecElts; ++I) {
    // Check for insert vector elements.
    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Check that same value is inserted at each vector element.
    if (I == 0)
      SplatVal = StVal.getOperand(1);
    else if (StVal.getOperand(1) != SplatVal)
      return SDValue();

    // Check insert element index.
    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
    if (!CIndex)
      return SDValue();
    uint64_t IndexVal = CIndex->getZExtValue();
    if (IndexVal >= NumVecElts)
      return SDValue();
    IndexNotInserted.reset(IndexVal);

    // Walk down the chain of insert_vector_elt nodes.
    StVal = StVal.getOperand(0);
  }
  // Check that all vector element locations were inserted to.
  if (IndexNotInserted.any())
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}

// Try to break a vector store into cheaper scalar or half-width stores:
// zero/scalar splats become scalar stores, and slow unaligned 128-bit stores
// are split in half.
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG,
                           const AArch64Subtarget *Subtarget) {

  StoreSDNode *S = cast<StoreSDNode>(N);
  if (S->isVolatile() || S->isIndexed())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  if (!VT.isFixedLengthVector())
    return SDValue();

  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
    return ReplacedZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.

  if (!Subtarget->isMisaligned128StoreSlow())
    return SDValue();

  // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Split unaligned 16B stores. They are terrible for performance.
  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
  // extensions can use this to mark that it does not want splitting to happen
  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
  if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
      S->getAlign() <= Align(2))
    return SDValue();

  // If we get a splat of a scalar convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);

  // Split VT into two.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  unsigned NumElts = HalfVT.getVectorNumElements();
  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(0, DL, MVT::i64));
  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(NumElts, DL, MVT::i64));
  SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                   S->getAlign(), S->getMemOperand()->getFlags());
  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                  DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                      S->getPointerInfo(), S->getAlign(),
                      S->getMemOperand()->getFlags());
}

static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");

  // splice(pg, op1, undef) -> op1
  if (N->getOperand(2).isUndef())
    return N->getOperand(1);

  return SDValue();
}

static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
  assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
          N->getOpcode() == AArch64ISD::UUNPKLO) &&
         "Unexpected Opcode!");

  // uunpklo/hi undef -> undef
  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(N->getValueType(0));

  // If this is a masked load followed by an UUNPKLO, fold this into a masked
  // extending load. We can do this even if this is already a masked
  // {z,}extload.
  if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
      N->getOpcode() == AArch64ISD::UUNPKLO) {
    MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
    SDValue Mask = MLD->getMask();
    SDLoc DL(N);

    if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
        SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
        (MLD->getPassThru()->isUndef() ||
         isZerosVector(MLD->getPassThru().getNode()))) {
      unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
      unsigned PgPattern = Mask->getConstantOperandVal(0);
      EVT VT = N->getValueType(0);

      // Ensure we can double the size of the predicate pattern
      unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
      if (NumElts &&
          NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
        Mask =
            getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
        SDValue PassThru = DAG.getConstant(0, DL, VT);
        SDValue NewLoad = DAG.getMaskedLoad(
            VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
            MLD->getAddressingMode(), ISD::ZEXTLOAD);

        // Redirect users of the old load's chain to the new load's chain.
        DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));

        return NewLoad;
      }
    }
  }

  return SDValue();
}

// Try to simplify:
//    t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
//    t2 = nxv8i16 srl(t1, ShiftValue)
// to
//    t1 = nxv8i16 rshrnb(X, shiftvalue).
// rshrnb will zero the top half bits of each element. Therefore, this combine
// should only be performed when a following instruction with the rshrnb
// as an operand does not care about the top half of each element. For example,
// a uzp1 or a truncating store.
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
                                         const AArch64Subtarget *Subtarget) {
  EVT VT = Srl->getValueType(0);

  // RSHRNB_I is an SVE2 operation on scalable vectors only.
  if (!VT.isScalableVector() || !Subtarget->hasSVE2() ||
      Srl->getOpcode() != ISD::SRL)
    return SDValue();

  // The result type has elements of half the width (same total bit count).
  EVT ResVT;
  if (VT == MVT::nxv8i16)
    ResVT = MVT::nxv16i8;
  else if (VT == MVT::nxv4i32)
    ResVT = MVT::nxv8i16;
  else if (VT == MVT::nxv2i64)
    ResVT = MVT::nxv4i32;
  else
    return SDValue();

  // The shift amount must be a constant splat in [1, narrowed element size].
  auto SrlOp1 =
      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Srl->getOperand(1)));
  if (!SrlOp1)
    return SDValue();
  unsigned ShiftValue = SrlOp1->getZExtValue();
  if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
    return SDValue();

  // The shifted value must be a single-use add of the rounding constant
  // 1 << (ShiftValue - 1), which is exactly what RSHRNB adds before shifting.
  SDValue Add = Srl->getOperand(0);
  if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
    return SDValue();
  auto AddOp1 =
      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
  if (!AddOp1)
    return SDValue();
  uint64_t AddValue = AddOp1->getZExtValue();
  if (AddValue != 1ULL << (ShiftValue - 1))
    return SDValue();

  SDLoc DL(Srl);
  SDValue Rshrnb = DAG.getNode(
      AArch64ISD::RSHRNB_I, DL, ResVT,
      {Add->getOperand(0), DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
  // Bitcast back to the original type; per the comment above, callers must
  // only rely on the low half of each element.
  return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
}

// Combine for AArch64ISD::UZP1 nodes.
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
                                 const AArch64Subtarget *Subtarget) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT ResVT = N->getValueType(0);

  // uzp1(x, undef) -> concat(truncate(x), undef)
  if (Op1.getOpcode() == ISD::UNDEF) {
    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
    switch (ResVT.getSimpleVT().SimpleTy) {
    default:
      break;
    case MVT::v16i8:
      BCVT = MVT::v8i16;
      HalfVT = MVT::v8i8;
      break;
    case MVT::v8i16:
      BCVT = MVT::v4i32;
      HalfVT = MVT::v4i16;
      break;
    case MVT::v4i32:
      BCVT = MVT::v2i64;
      HalfVT = MVT::v2i32;
      break;
    }
    if (BCVT != MVT::Other) {
      SDValue BC = DAG.getBitcast(BCVT, Op0);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
                         DAG.getUNDEF(HalfVT));
    }
  }

  // Replace add+srl operands with RSHRNB where the top halves don't matter
  // (uzp1 only reads the low half of each element).
  if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
    return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);

  if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
    return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);

  // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
  if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
    if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
      SDValue X = Op0.getOperand(0).getOperand(0);
      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
    }
  }

  // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
  if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
    if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
      SDValue Z = Op1.getOperand(0).getOperand(1);
      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
    }
  }

  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
  // Only implemented on little-endian subtargets.
  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();

  // This optimization only works on little endian.
  if (!IsLittleEndian)
    return SDValue();

  if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
    return SDValue();

  // Peel a truncate (possibly behind a bitcast) off an operand, returning the
  // pre-truncation value, or SDValue() if the operand is not a truncate.
  auto getSourceOp = [](SDValue Operand) -> SDValue {
    const unsigned Opcode = Operand.getOpcode();
    if (Opcode == ISD::TRUNCATE)
      return Operand->getOperand(0);
    if (Opcode == ISD::BITCAST &&
        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
      return Operand->getOperand(0)->getOperand(0);
    return SDValue();
  };

  SDValue SourceOp0 = getSourceOp(Op0);
  SDValue SourceOp1 = getSourceOp(Op1);

  if (!SourceOp0 || !SourceOp1)
    return SDValue();

  if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
      !SourceOp0.getValueType().isSimple())
    return SDValue();

  EVT ResultTy;

  switch (SourceOp0.getSimpleValueType().SimpleTy) {
  case MVT::v2i64:
    ResultTy = MVT::v4i32;
    break;
  case MVT::v4i32:
    ResultTy = MVT::v8i16;
    break;
  case MVT::v8i16:
    ResultTy = MVT::v16i8;
    break;
  default:
    return SDValue();
  }

  SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
  SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
  SDValue UzpResult =
      DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);

  EVT BitcastResultTy;

  switch (ResVT.getSimpleVT().SimpleTy) {
  case MVT::v2i32:
    BitcastResultTy = MVT::v2i64;
    break;
  case MVT::v4i16:
    BitcastResultTy = MVT::v4i32;
    break;
  case MVT::v8i8:
    BitcastResultTy = MVT::v8i16;
    break;
  default:
    llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
  }

  return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
                     DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
}

// Combine for the GLD1* gather-load nodes: fold extensions of the vector
// offset operand into the gather opcode itself.
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
  unsigned Opc = N->getOpcode();

  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
         "Invalid opcode.");

  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  // Gathers whose opcode already encodes an offset extension (SXTW/UXTW)
  // cannot absorb another one.
  const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;

  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Pg = N->getOperand(1);
  SDValue Base = N->getOperand(2);
  SDValue Offset = N->getOperand(3);
  SDValue Ty = N->getOperand(4);

  EVT ResVT = N->getValueType(0);

  const auto OffsetOpc = Offset.getOpcode();
  const bool OffsetIsZExt =
      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
  const bool OffsetIsSExt =
      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;

  // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
  if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
    SDValue ExtPg = Offset.getOperand(0);
    VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
    EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();

    // If the predicate for the sign- or zero-extended offset is the
    // same as the predicate used for this load and the sign-/zero-extension
    // was from a 32-bits...
    if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
      SDValue UnextendedOffset = Offset.getOperand(1);

      unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
      if (Signed)
        NewOpc = getSignExtendedGatherOpcode(NewOpc);

      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
                         {Chain, Pg, Base, UnextendedOffset, Ty});
    }
  }

  return SDValue();
}

/// Optimize a vector shift instruction and its operand if shifted out
/// bits are not used.
static SDValue performVectorShiftCombine(SDNode *N,
                                         const AArch64TargetLowering &TLI,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  assert(N->getOpcode() == AArch64ISD::VASHR ||
         N->getOpcode() == AArch64ISD::VLSHR);

  SDValue Op = N->getOperand(0);
  unsigned OpScalarSize = Op.getScalarValueSizeInBits();

  unsigned ShiftImm = N->getConstantOperandVal(1);
  assert(OpScalarSize > ShiftImm && "Invalid shift imm");

  // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
  if (N->getOpcode() == AArch64ISD::VASHR &&
      Op.getOpcode() == AArch64ISD::VSHL &&
      N->getOperand(1) == Op.getOperand(1))
    if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
      return Op.getOperand(0);

  // The bits shifted out by this node are never observed, so operands may be
  // simplified as if those bits were not demanded.
  APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
  APInt DemandedMask = ~ShiftedOutBits;

  if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

// Combine for AArch64ISD::SUNPKLO nodes.
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
  // This transform works in partnership with performSetCCPunpkCombine to
  // remove unnecessary transfer of predicates into standard registers and back
  if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
      N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
          MVT::i1) {
    SDValue CC = N->getOperand(0)->getOperand(0);
    auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
    SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
                               DAG.getVectorIdxConstant(0, SDLoc(N)));
    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
  }

  return SDValue();
}

/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     bool IsLaneOp) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (!VT.is128BitVector() && !VT.is64BitVector())
    return SDValue();

  // For the lane op (INSERT_VECTOR_ELT-style), the loaded scalar is operand 1;
  // for the dup op it is operand 0.
  unsigned LoadIdx = IsLaneOp ? 1 : 0;
  SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not LOAD, can not do such combine.
  if (LD->getOpcode() != ISD::LOAD)
    return SDValue();

  // The vector lane must be a constant in the LD1LANE opcode.
  SDValue Lane;
  if (IsLaneOp) {
    Lane = N->getOperand(2);
    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
      return SDValue();
  }

  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
  EVT MemVT = LoadSDN->getMemoryVT();
  // Check if memory operand is the same type as the vector element.
  if (MemVT != VT.getVectorElementType())
    return SDValue();

  // Check if there are other uses. If so, do not combine as it will introduce
  // an extra load.
  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
       ++UI) {
    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
      continue;
    if (*UI != N)
      return SDValue();
  }

  // If there is one use and it can splat the value, prefer that operation.
  // TODO: This could be expanded to more operations if they reliably use the
  // index variants.
  if (N->hasOneUse()) {
    unsigned UseOpc = N->use_begin()->getOpcode();
    if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
      return SDValue();
  }

  SDValue Addr = LD->getOperand(1);
  SDValue Vector = N->getOperand(0);
  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
       Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD
        || UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // If the increment is a constant, it must match the memory ref size.
    // A matching constant increment is replaced by XZR here; presumably this
    // sentinel selects the immediate post-index instruction form — confirm
    // against the LD1LANEpost/LD1DUPpost ISel patterns.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint32_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }

    // To avoid cycle construction make sure that neither the load nor the add
    // are predecessors to each other or the Vector.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(User);
    Worklist.push_back(LD);
    Worklist.push_back(Vector.getNode());
    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    SmallVector<SDValue, 8> Ops;
    Ops.push_back(LD->getOperand(0));  // Chain
    if (IsLaneOp) {
      Ops.push_back(Vector);           // The vector to be inserted
      Ops.push_back(Lane);             // The lane to be inserted in the vector
    }
    Ops.push_back(Addr);
    Ops.push_back(Inc);

    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
    SDVTList SDTys = DAG.getVTList(Tys);
    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                           MemVT,
                                           LoadSDN->getMemOperand());

    // Update the uses.
    SDValue NewResults[] = {
        SDValue(LD, 0),            // The result of load
        SDValue(UpdN.getNode(), 2) // Chain
    };
    DCI.CombineTo(LD, NewResults);
    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));    // Dup/Inserted Result
    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register

    break;
  }
  return SDValue();
}

/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
21020 static bool performTBISimplification(SDValue Addr, 21021 TargetLowering::DAGCombinerInfo &DCI, 21022 SelectionDAG &DAG) { 21023 APInt DemandedMask = APInt::getLowBitsSet(64, 56); 21024 KnownBits Known; 21025 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 21026 !DCI.isBeforeLegalizeOps()); 21027 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 21028 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { 21029 DCI.CommitTargetLoweringOpt(TLO); 21030 return true; 21031 } 21032 return false; 21033 } 21034 21035 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { 21036 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) && 21037 "Expected STORE dag node in input!"); 21038 21039 if (auto Store = dyn_cast<StoreSDNode>(N)) { 21040 if (!Store->isTruncatingStore() || Store->isIndexed()) 21041 return SDValue(); 21042 SDValue Ext = Store->getValue(); 21043 auto ExtOpCode = Ext.getOpcode(); 21044 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND && 21045 ExtOpCode != ISD::ANY_EXTEND) 21046 return SDValue(); 21047 SDValue Orig = Ext->getOperand(0); 21048 if (Store->getMemoryVT() != Orig.getValueType()) 21049 return SDValue(); 21050 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig, 21051 Store->getBasePtr(), Store->getMemOperand()); 21052 } 21053 21054 return SDValue(); 21055 } 21056 21057 // Perform TBI simplification if supported by the target and try to break up 21058 // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit 21059 // load instructions can be selected. 
static SDValue performLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG,
                                  const AArch64Subtarget *Subtarget) {
  if (Subtarget->supportsAddressTopByteIgnored())
    performTBISimplification(N->getOperand(1), DCI, DAG);

  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  // Only non-volatile, nontemporal loads on little-endian targets are split.
  if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
    return SDValue(N, 0);

  // Only "odd" sizes need this: fixed vectors larger than 256 bits whose size
  // is not a multiple of 256 but whose element size divides 256 evenly.
  if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
      MemVT.getSizeInBits() % 256 == 0 ||
      256 % MemVT.getScalarSizeInBits() != 0)
    return SDValue(N, 0);

  SDLoc DL(LD);
  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  SDNodeFlags Flags = LD->getFlags();
  SmallVector<SDValue, 4> LoadOps;
  SmallVector<SDValue, 4> LoadOpsChain;
  // Replace any non temporal load over 256-bit with a series of 256 bit loads
  // and a scalar/vector load less than 256. This way we can utilize 256-bit
  // loads and reduce the amount of load instructions generated.
  MVT NewVT =
      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
                       256 / MemVT.getVectorElementType().getSizeInBits());
  unsigned Num256Loads = MemVT.getSizeInBits() / 256;
  // Create all 256-bit loads starting from offset 0 up to (Num256Loads-1)*32.
  for (unsigned I = 0; I < Num256Loads; I++) {
    unsigned PtrOffset = I * 32; // 32 bytes == 256 bits per chunk.
    SDValue NewPtr = DAG.getMemBasePlusOffset(
        BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
    // The alignment of each piece is the original alignment combined with its
    // byte offset from the start of the load.
    Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
    SDValue NewLoad = DAG.getLoad(
        NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
        NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
    LoadOps.push_back(NewLoad);
    LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
  }

  // Process remaining bits of the load operation.
  // This is done by creating an UNDEF vector to match the size of the
  // 256-bit loads and inserting the remaining load to it. We extract the
  // original load type at the end using EXTRACT_SUBVECTOR instruction.
  unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
  unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
  MVT RemainingVT = MVT::getVectorVT(
      MemVT.getVectorElementType().getSimpleVT(),
      BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
  SDValue NewPtr = DAG.getMemBasePlusOffset(
      BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
  Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
  SDValue RemainingLoad =
      DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
                  LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
                  LD->getMemOperand()->getFlags(), LD->getAAInfo());
  SDValue UndefVector = DAG.getUNDEF(NewVT);
  SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
  // Widen the tail load to a full 256-bit chunk so all concat operands match.
  SDValue ExtendedReminingLoad =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
                  {UndefVector, RemainingLoad, InsertIdx});
  LoadOps.push_back(ExtendedReminingLoad);
  LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
  EVT ConcatVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       LoadOps.size() * NewVT.getVectorNumElements());
  SDValue ConcatVectors =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
  // Extract the original vector type size.
  SDValue ExtractSubVector =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
                  {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
  // Merge all the partial-load chains so ordering is preserved for users of
  // the original load's chain result.
  SDValue TokenFactor =
      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
  return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
}

// Walk backwards (up to Depth 3) from a boolean vector looking for the
// pre-comparison/pre-truncation type its i1 elements came from. Returns
// INVALID_SIMPLE_VALUE_TYPE if the operands disagree or nothing is found.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
  EVT VecVT = Op.getValueType();
  assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
         "Need boolean vector type.");

  if (Depth > 3)
    return MVT::INVALID_SIMPLE_VALUE_TYPE;

  // We can get the base type from a vector compare or truncate.
  if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
    return Op.getOperand(0).getValueType();

  // If an operand is a bool vector, continue looking.
  EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
  for (SDValue Operand : Op->op_values()) {
    if (Operand.getValueType() != VecVT)
      continue;

    // All bool-vector operands must agree on the base type.
    EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
    if (!BaseVT.isSimple())
      BaseVT = OperandVT;
    else if (OperandVT != BaseVT)
      return MVT::INVALID_SIMPLE_VALUE_TYPE;
  }

  return BaseVT;
}

// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
// iN, we can use a trick that extracts the i^th bit from the i^th element and
// then performs a vector add to get a scalar bitmask. This requires that each
// element's bits are either all 1 or all 0.
21172 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { 21173 SDLoc DL(N); 21174 SDValue ComparisonResult(N, 0); 21175 EVT VecVT = ComparisonResult.getValueType(); 21176 assert(VecVT.isVector() && "Must be a vector type"); 21177 21178 unsigned NumElts = VecVT.getVectorNumElements(); 21179 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 21180 return SDValue(); 21181 21182 if (VecVT.getVectorElementType() != MVT::i1 && 21183 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT)) 21184 return SDValue(); 21185 21186 // If we can find the original types to work on instead of a vector of i1, 21187 // we can avoid extend/extract conversion instructions. 21188 if (VecVT.getVectorElementType() == MVT::i1) { 21189 VecVT = tryGetOriginalBoolVectorType(ComparisonResult); 21190 if (!VecVT.isSimple()) { 21191 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector 21192 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts); 21193 } 21194 } 21195 VecVT = VecVT.changeVectorElementTypeToInteger(); 21196 21197 // Large vectors don't map directly to this conversion, so to avoid too many 21198 // edge cases, we don't apply it here. The conversion will likely still be 21199 // applied later via multiple smaller vectors, whose results are concatenated. 21200 if (VecVT.getSizeInBits() > 128) 21201 return SDValue(); 21202 21203 // Ensure that all elements' bits are either 0s or 1s. 21204 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT); 21205 21206 SmallVector<SDValue, 16> MaskConstants; 21207 if (VecVT == MVT::v16i8) { 21208 // v16i8 is a special case, as we have 16 entries but only 8 positional bits 21209 // per entry. We split it into two halves, apply the mask, zip the halves to 21210 // create 8x 16-bit values, and the perform the vector reduce. 
21211 for (unsigned Half = 0; Half < 2; ++Half) { 21212 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) { 21213 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32)); 21214 } 21215 } 21216 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants); 21217 SDValue RepresentativeBits = 21218 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask); 21219 21220 SDValue UpperRepresentativeBits = 21221 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits, 21222 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32)); 21223 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT, 21224 RepresentativeBits, UpperRepresentativeBits); 21225 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped); 21226 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped); 21227 } 21228 21229 // All other vector sizes. 21230 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1); 21231 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) { 21232 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64)); 21233 } 21234 21235 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants); 21236 SDValue RepresentativeBits = 21237 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask); 21238 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>( 21239 NumElts, VecVT.getVectorElementType().getSizeInBits())); 21240 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits); 21241 } 21242 21243 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, 21244 StoreSDNode *Store) { 21245 if (!Store->isTruncatingStore()) 21246 return SDValue(); 21247 21248 SDLoc DL(Store); 21249 SDValue VecOp = Store->getValue(); 21250 EVT VT = VecOp.getValueType(); 21251 EVT MemVT = Store->getMemoryVT(); 21252 21253 if (!MemVT.isVector() || !VT.isVector() || 21254 MemVT.getVectorElementType() != MVT::i1) 21255 return SDValue(); 21256 21257 // If we are storing a vector that we are currently building, let 21258 // 
`scalarizeVectorStore()` handle this more efficiently. 21259 if (VecOp.getOpcode() == ISD::BUILD_VECTOR) 21260 return SDValue(); 21261 21262 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp); 21263 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG); 21264 if (!VectorBits) 21265 return SDValue(); 21266 21267 EVT StoreVT = 21268 EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits()); 21269 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT); 21270 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(), 21271 Store->getMemOperand()); 21272 } 21273 21274 bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { 21275 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || 21276 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || 21277 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32); 21278 } 21279 21280 static SDValue performSTORECombine(SDNode *N, 21281 TargetLowering::DAGCombinerInfo &DCI, 21282 SelectionDAG &DAG, 21283 const AArch64Subtarget *Subtarget) { 21284 StoreSDNode *ST = cast<StoreSDNode>(N); 21285 SDValue Chain = ST->getChain(); 21286 SDValue Value = ST->getValue(); 21287 SDValue Ptr = ST->getBasePtr(); 21288 EVT ValueVT = Value.getValueType(); 21289 21290 auto hasValidElementTypeForFPTruncStore = [](EVT VT) { 21291 EVT EltVT = VT.getVectorElementType(); 21292 return EltVT == MVT::f32 || EltVT == MVT::f64; 21293 }; 21294 21295 // If this is an FP_ROUND followed by a store, fold this into a truncating 21296 // store. We can do this even if this is already a truncstore. 21297 // We purposefully don't care about legality of the nodes here as we know 21298 // they can be split down into something legal. 
  if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
      Subtarget->useSVEForFixedLengthVectors() &&
      ValueVT.isFixedLengthVector() &&
      ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
      hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                             ST->getMemoryVT(), ST->getMemOperand());

  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;

  // For stores the address is operand 2 (chain, value, ptr, ...).
  if (Subtarget->supportsAddressTopByteIgnored() &&
      performTBISimplification(N->getOperand(2), DCI, DAG))
    return SDValue(N, 0);

  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
    return Store;

  if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
    return Store;

  if (ST->isTruncatingStore()) {
    EVT StoreVT = ST->getMemoryVT();
    // Note: returning SDValue() here matches the fall-through result below;
    // this is the last combine attempted in this function.
    if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
      return SDValue();
    // truncstore(add+srl) where the store discards the top halves anyway
    // can use RSHRNB directly (operand 1 is the stored value).
    if (SDValue Rshrnb =
            trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
      return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
                               StoreVT, ST->getMemOperand());
    }
  }

  return SDValue();
}

static SDValue performMSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  SDValue Value = MST->getValue();
  SDValue Mask = MST->getMask();
  SDLoc DL(N);

  // If this is a UZP1 followed by a masked store, fold this into a masked
  // truncating store. We can do this even if this is already a masked
  // truncstore.
  if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
      MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
      Value.getValueType().isInteger()) {
    Value = Value.getOperand(0);
    if (Value.getOpcode() == ISD::BITCAST) {
      EVT HalfVT =
          Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
      EVT InVT = Value.getOperand(0).getValueType();

      // The bitcast must come from the type with double-width elements, i.e.
      // the uzp1 is acting as a truncation of InVT.
      if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
        unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
        unsigned PgPattern = Mask->getConstantOperandVal(0);

        // Ensure we can double the size of the predicate pattern
        unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
        if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
                           MinSVESize) {
          Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
                          PgPattern);
          return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
                                    MST->getBasePtr(), MST->getOffset(), Mask,
                                    MST->getMemoryVT(), MST->getMemOperand(),
                                    MST->getAddressingMode(),
                                    /*IsTruncating=*/true);
        }
      }
    }
  }

  if (MST->isTruncatingStore()) {
    EVT ValueVT = Value->getValueType(0);
    EVT MemVT = MST->getMemoryVT();
    if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
      return SDValue();
    // As in performSTORECombine: the truncating store ignores the top halves,
    // so a rounding add+srl may be replaced by RSHRNB.
    if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
      return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
                                MST->getOffset(), MST->getMask(),
                                MST->getMemoryVT(), MST->getMemOperand(),
                                MST->getAddressingMode(), true);
    }
  }

  return SDValue();
}

/// \return true if part of the index was folded into the Base.
21393 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, 21394 SDLoc DL, SelectionDAG &DAG) { 21395 // This function assumes a vector of i64 indices. 21396 EVT IndexVT = Index.getValueType(); 21397 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64) 21398 return false; 21399 21400 // Simplify: 21401 // BasePtr = Ptr 21402 // Index = X + splat(Offset) 21403 // -> 21404 // BasePtr = Ptr + Offset * scale. 21405 // Index = X 21406 if (Index.getOpcode() == ISD::ADD) { 21407 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) { 21408 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale); 21409 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset); 21410 Index = Index.getOperand(0); 21411 return true; 21412 } 21413 } 21414 21415 // Simplify: 21416 // BasePtr = Ptr 21417 // Index = (X + splat(Offset)) << splat(Shift) 21418 // -> 21419 // BasePtr = Ptr + (Offset << Shift) * scale) 21420 // Index = X << splat(shift) 21421 if (Index.getOpcode() == ISD::SHL && 21422 Index.getOperand(0).getOpcode() == ISD::ADD) { 21423 SDValue Add = Index.getOperand(0); 21424 SDValue ShiftOp = Index.getOperand(1); 21425 SDValue OffsetOp = Add.getOperand(1); 21426 if (auto Shift = DAG.getSplatValue(ShiftOp)) 21427 if (auto Offset = DAG.getSplatValue(OffsetOp)) { 21428 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift); 21429 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale); 21430 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset); 21431 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(), 21432 Add.getOperand(0), ShiftOp); 21433 return true; 21434 } 21435 } 21436 21437 return false; 21438 } 21439 21440 // Analyse the specified address returning true if a more optimal addressing 21441 // mode is available. When returning true all parameters are updated to reflect 21442 // their recommended values. 
21443 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, 21444 SDValue &BasePtr, SDValue &Index, 21445 SelectionDAG &DAG) { 21446 // Try to iteratively fold parts of the index into the base pointer to 21447 // simplify the index as much as possible. 21448 bool Changed = false; 21449 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG)) 21450 Changed = true; 21451 21452 // Only consider element types that are pointer sized as smaller types can 21453 // be easily promoted. 21454 EVT IndexVT = Index.getValueType(); 21455 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64) 21456 return Changed; 21457 21458 // Can indices be trivially shrunk? 21459 EVT DataVT = N->getOperand(1).getValueType(); 21460 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it 21461 // will later be re-extended to 64 bits in legalization 21462 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64) 21463 return Changed; 21464 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) { 21465 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); 21466 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index); 21467 return true; 21468 } 21469 21470 // Match: 21471 // Index = step(const) 21472 int64_t Stride = 0; 21473 if (Index.getOpcode() == ISD::STEP_VECTOR) { 21474 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue(); 21475 } 21476 // Match: 21477 // Index = step(const) << shift(const) 21478 else if (Index.getOpcode() == ISD::SHL && 21479 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) { 21480 SDValue RHS = Index.getOperand(1); 21481 if (auto *Shift = 21482 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) { 21483 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1); 21484 Stride = Step << Shift->getZExtValue(); 21485 } 21486 } 21487 21488 // Return early because no supported pattern is found. 
21489 if (Stride == 0) 21490 return Changed; 21491 21492 if (Stride < std::numeric_limits<int32_t>::min() || 21493 Stride > std::numeric_limits<int32_t>::max()) 21494 return Changed; 21495 21496 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); 21497 unsigned MaxVScale = 21498 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock; 21499 int64_t LastElementOffset = 21500 IndexVT.getVectorMinNumElements() * Stride * MaxVScale; 21501 21502 if (LastElementOffset < std::numeric_limits<int32_t>::min() || 21503 LastElementOffset > std::numeric_limits<int32_t>::max()) 21504 return Changed; 21505 21506 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); 21507 // Stride does not scale explicitly by 'Scale', because it happens in 21508 // the gather/scatter addressing mode. 21509 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride)); 21510 return true; 21511 } 21512 21513 static SDValue performMaskedGatherScatterCombine( 21514 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { 21515 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N); 21516 assert(MGS && "Can only combine gather load or scatter store nodes"); 21517 21518 if (!DCI.isBeforeLegalize()) 21519 return SDValue(); 21520 21521 SDLoc DL(MGS); 21522 SDValue Chain = MGS->getChain(); 21523 SDValue Scale = MGS->getScale(); 21524 SDValue Index = MGS->getIndex(); 21525 SDValue Mask = MGS->getMask(); 21526 SDValue BasePtr = MGS->getBasePtr(); 21527 ISD::MemIndexType IndexType = MGS->getIndexType(); 21528 21529 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG)) 21530 return SDValue(); 21531 21532 // Here we catch such cases early and change MGATHER's IndexType to allow 21533 // the use of an Index that's more legalisation friendly. 
21534 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) { 21535 SDValue PassThru = MGT->getPassThru(); 21536 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; 21537 return DAG.getMaskedGather( 21538 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL, 21539 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType()); 21540 } 21541 auto *MSC = cast<MaskedScatterSDNode>(MGS); 21542 SDValue Data = MSC->getValue(); 21543 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale}; 21544 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL, 21545 Ops, MSC->getMemOperand(), IndexType, 21546 MSC->isTruncatingStore()); 21547 } 21548 21549 /// Target-specific DAG combine function for NEON load/store intrinsics 21550 /// to merge base address updates. 21551 static SDValue performNEONPostLDSTCombine(SDNode *N, 21552 TargetLowering::DAGCombinerInfo &DCI, 21553 SelectionDAG &DAG) { 21554 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 21555 return SDValue(); 21556 21557 unsigned AddrOpIdx = N->getNumOperands() - 1; 21558 SDValue Addr = N->getOperand(AddrOpIdx); 21559 21560 // Search for a use of the address operand that is an increment. 21561 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 21562 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 21563 SDNode *User = *UI; 21564 if (User->getOpcode() != ISD::ADD || 21565 UI.getUse().getResNo() != Addr.getResNo()) 21566 continue; 21567 21568 // Check that the add is independent of the load/store. Otherwise, folding 21569 // it would create a cycle. 21570 SmallPtrSet<const SDNode *, 32> Visited; 21571 SmallVector<const SDNode *, 16> Worklist; 21572 Visited.insert(Addr.getNode()); 21573 Worklist.push_back(N); 21574 Worklist.push_back(User); 21575 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 21576 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 21577 continue; 21578 21579 // Find the new opcode for the updating load/store. 
21580 bool IsStore = false; 21581 bool IsLaneOp = false; 21582 bool IsDupOp = false; 21583 unsigned NewOpc = 0; 21584 unsigned NumVecs = 0; 21585 unsigned IntNo = N->getConstantOperandVal(1); 21586 switch (IntNo) { 21587 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 21588 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 21589 NumVecs = 2; break; 21590 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 21591 NumVecs = 3; break; 21592 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 21593 NumVecs = 4; break; 21594 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 21595 NumVecs = 2; IsStore = true; break; 21596 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 21597 NumVecs = 3; IsStore = true; break; 21598 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 21599 NumVecs = 4; IsStore = true; break; 21600 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 21601 NumVecs = 2; break; 21602 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 21603 NumVecs = 3; break; 21604 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 21605 NumVecs = 4; break; 21606 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 21607 NumVecs = 2; IsStore = true; break; 21608 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 21609 NumVecs = 3; IsStore = true; break; 21610 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 21611 NumVecs = 4; IsStore = true; break; 21612 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 21613 NumVecs = 2; IsDupOp = true; break; 21614 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 21615 NumVecs = 3; IsDupOp = true; break; 21616 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 21617 NumVecs = 4; IsDupOp = true; break; 21618 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 21619 
NumVecs = 2; IsLaneOp = true; break; 21620 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 21621 NumVecs = 3; IsLaneOp = true; break; 21622 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 21623 NumVecs = 4; IsLaneOp = true; break; 21624 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 21625 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 21626 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 21627 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 21628 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 21629 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 21630 } 21631 21632 EVT VecTy; 21633 if (IsStore) 21634 VecTy = N->getOperand(2).getValueType(); 21635 else 21636 VecTy = N->getValueType(0); 21637 21638 // If the increment is a constant, it must match the memory ref size. 21639 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 21640 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 21641 uint32_t IncVal = CInc->getZExtValue(); 21642 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 21643 if (IsLaneOp || IsDupOp) 21644 NumBytes /= VecTy.getVectorNumElements(); 21645 if (IncVal != NumBytes) 21646 continue; 21647 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 21648 } 21649 SmallVector<SDValue, 8> Ops; 21650 Ops.push_back(N->getOperand(0)); // Incoming chain 21651 // Load lane and store have vector list as input. 21652 if (IsLaneOp || IsStore) 21653 for (unsigned i = 2; i < AddrOpIdx; ++i) 21654 Ops.push_back(N->getOperand(i)); 21655 Ops.push_back(Addr); // Base register 21656 Ops.push_back(Inc); 21657 21658 // Return Types. 21659 EVT Tys[6]; 21660 unsigned NumResultVecs = (IsStore ? 
0 : NumVecs); 21661 unsigned n; 21662 for (n = 0; n < NumResultVecs; ++n) 21663 Tys[n] = VecTy; 21664 Tys[n++] = MVT::i64; // Type of write back register 21665 Tys[n] = MVT::Other; // Type of the chain 21666 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); 21667 21668 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 21669 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 21670 MemInt->getMemoryVT(), 21671 MemInt->getMemOperand()); 21672 21673 // Update the uses. 21674 std::vector<SDValue> NewResults; 21675 for (unsigned i = 0; i < NumResultVecs; ++i) { 21676 NewResults.push_back(SDValue(UpdN.getNode(), i)); 21677 } 21678 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 21679 DCI.CombineTo(N, NewResults); 21680 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 21681 21682 break; 21683 } 21684 return SDValue(); 21685 } 21686 21687 // Checks to see if the value is the prescribed width and returns information 21688 // about its extension mode. 
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
  ExtType = ISD::NON_EXTLOAD;
  switch(V.getNode()->getOpcode()) {
  default:
    return false;
  case ISD::LOAD: {
    // A narrow load carries its extension kind directly.
    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
      ExtType = LoadNode->getExtensionType();
      return true;
    }
    return false;
  }
  case ISD::AssertSext: {
    // AssertSext guarantees the value was sign-extended from the given VT.
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if ((TypeNode->getVT() == MVT::i8 && width == 8)
        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
      ExtType = ISD::SEXTLOAD;
      return true;
    }
    return false;
  }
  case ISD::AssertZext: {
    // AssertZext guarantees the value was zero-extended from the given VT.
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if ((TypeNode->getVT() == MVT::i8 && width == 8)
        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
      ExtType = ISD::ZEXTLOAD;
      return true;
    }
    return false;
  }
  case ISD::Constant:
  case ISD::TargetConstant: {
    // A constant qualifies when its magnitude fits in width-1 bits, i.e. it
    // is representable at the prescribed width regardless of extension.
    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
           1LL << (width - 1);
  }
  }

  return true;
}

// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
//  +-------------+ +-------------+ +-------------+ +-------------+
//  |    Input    | | AddConstant | | CompConstant| |     CC      |
//  +-------------+ +-------------+ +-------------+ +-------------+
//         |            |               |              |
//         V            V               |    +----------+
//  +-------------+  +----+             |    |
//  |     ADD     |  |0xff|             |    |
//  +-------------+  +----+             |    |
//         |            |               |    |
//         V            V               |    |
//  +-------------+                     |    |
//  |     AND     |                     |    |
//  +-------------+                     |    |
//              |                       |    |
//              +-----+                 |    |
//                    |                 |    |
//                    V                 V    V
//                   +-------------+
//                   |     CMP     |
//                   +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width inputs, the above graph is
// specific to 8 bits.
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstants bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns of for the current extension type of Input (w0).
//
//   sub      w8, w0, w1
//   and      w10, w8, #0x0f
//   cmp      w8, w2
//   cset     w9, AArch64CC
//   cmp      w10, w2
//   cset     w11, AArch64CC
//   cmp      w9, w11
//   cset     w0, eq
//   ret
//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave equivalent outputs to the above
// for all inputs, so they can be used to determine if the removal is
// legal instead.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed
// factored out of the DAG recognition as the DAG can take several forms.

static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing the in term
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range we can just adjust the input and avoid writing equations
  // for sign extended inputs.
  if (ExtType == ISD::SEXTLOAD)
    AddConstant -= (1 << (width-1));

  switch(CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    if ((AddConstant == 0) ||
        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
        (AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::LT:
  case AArch64CC::GE:
    if ((AddConstant == 0) ||
        (AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::HI:
  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
       (AddConstant <= 0 && CompConstant >= -1 &&
        CompConstant < AddConstant + MaxUInt))
      return true;
   break;
  case AArch64CC::PL:
  case AArch64CC::MI:
    if ((AddConstant == 0) ||
        (AddConstant > 0 && CompConstant <= 0) ||
        (AddConstant < 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::LO:
  case AArch64CC::HS:
    if ((AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant >= 0 &&
         CompConstant <= AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::EQ:
  case AArch64CC::NE:
    if ((AddConstant > 0 && CompConstant < 0) ||
        (AddConstant < 0 && CompConstant >= 0 &&
         CompConstant < AddConstant + MaxUInt) ||
        (AddConstant >= 0 && CompConstant >= 0 &&
         CompConstant >= AddConstant) ||
        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    return true;
  case AArch64CC::Invalid:
    break;
  }

  return false;
}

// (X & C) >u Mask --> (X & (C & (~Mask)) != 0
// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1)) == 0
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
                                        SDNode *AndNode, SelectionDAG &DAG,
                                        unsigned CCIndex, unsigned CmpIndex,
                                        unsigned CC) {
  ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
  if (!SubsC)
    return SDValue();

  APInt SubsAP = SubsC->getAPIntValue();
  // HI needs an all-ones mask constant, LO needs a power of two.
  if (CC == AArch64CC::HI) {
    if (!SubsAP.isMask())
      return SDValue();
  } else if (CC == AArch64CC::LO) {
    if (!SubsAP.isPowerOf2())
      return SDValue();
  } else
    return SDValue();

  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
  if (!AndC)
    return SDValue();

  APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);

  SDLoc DL(N);
  APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
  SDValue ANDS = DAG.getNode(
      AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
      DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
  SDValue AArch64_CC =
      DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
                      N->getOperand(CCIndex)->getValueType(0));

  // For now, only performCSELCombine and performBRCONDCombine call this
  // function. And both of them pass 2 for CCIndex, 3 for CmpIndex with 4
  // operands. So just init the ops direct to simplify the code. If we have some
  // other case with different CCIndex, CmpIndex, we need to use for loop to
  // rewrite the code here.
  // TODO: Do we need to assert number of operand is 4 here?
  assert((CCIndex == 2 && CmpIndex == 3) &&
         "Expected CCIndex to be 2 and CmpIndex to be 3.");
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
                   ANDS.getValue(1)};
  return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
}

// Attempt to remove a superfluous AND feeding a SUBS whose flags drive the
// condition at operand CCIndex of N (a CSEL or BRCOND-style node).
static
SDValue performCONDCombine(SDNode *N,
                           TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG, unsigned CCIndex,
                           unsigned CmpIndex) {
  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
  unsigned CondOpcode = SubsNode->getOpcode();

  // Only the flag result of the SUBS may be used, otherwise removing the AND
  // would change an observable value.
  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
    return SDValue();

  // There is a SUBS feeding this condition. Is it fed by a mask we can
  // use?

  SDNode *AndNode = SubsNode->getOperand(0).getNode();
  unsigned MaskBits = 0;

  if (AndNode->getOpcode() != ISD::AND)
    return SDValue();

  if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
                                             CmpIndex, CC))
    return Val;

  // Only byte and halfword masks are handled below.
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
    uint32_t CNV = CN->getZExtValue();
    if (CNV == 255)
      MaskBits = 8;
    else if (CNV == 65535)
      MaskBits = 16;
  }

  if (!MaskBits)
    return SDValue();

  SDValue AddValue = AndNode->getOperand(0);

  if (AddValue.getOpcode() != ISD::ADD)
    return SDValue();

  // The basic dag structure is correct, grab the inputs and validate them.

  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
  SDValue SubsInputValue = SubsNode->getOperand(1);

  // The mask is present and the provenance of all the values is a smaller type,
  // lets see if the mask is superfluous.

  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
      !isa<ConstantSDNode>(SubsInputValue.getNode()))
    return SDValue();

  ISD::LoadExtType ExtType;

  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
    return SDValue();

  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
    return SDValue();

  // The AND is not necessary, remove it.

  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
                               SubsNode->getValueType(1));
  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

  return SDValue(N, 0);
}

// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return SDValue();

  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    N = NV.getNode();
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  unsigned CC = CCVal->getAsZExtVal();
  // Only EQ/NE against zero can become CBZ/CBNZ.
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  unsigned CmpOpc = Cmp.getOpcode();
  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
    return SDValue();

  // Only attempt folding if there is only one use of the flag and no use of the
  // value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  // Canonicalize so that the zero ends up on the RHS.
  if (isNullConstant(LHS))
    std::swap(LHS, RHS);

  if (!isNullConstant(RHS))
    return SDValue();

  // Shifted operands would otherwise be folded into the compare; leave them.
  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      LHS.getOpcode() == ISD::SRL)
    return SDValue();

  // Fold the compare into the branch instruction.
  SDValue BR;
  if (CC == AArch64CC::EQ)
    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  else
    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}

// Fold CSEL(0, cttz(X), eq(X, 0)) / CSEL(cttz(X), 0, ne(X, 0)) into
// AND(cttz(X), bitwidth-1), exploiting that cttz of zero is the bit width.
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
  unsigned CC = N->getConstantOperandVal(2);
  SDValue SUBS = N->getOperand(3);
  SDValue Zero, CTTZ;

  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
    Zero = N->getOperand(0);
    CTTZ = N->getOperand(1);
  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
    Zero = N->getOperand(1);
    CTTZ = N->getOperand(0);
  } else
    return SDValue();

  // The CTTZ may be hidden behind a truncate.
  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
      (CTTZ.getOpcode() == ISD::TRUNCATE &&
       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
    return SDValue();

  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
         "Illegal type in CTTZ folding");

  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
    return SDValue();

  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
                  ? CTTZ.getOperand(0).getOperand(0)
                  : CTTZ.getOperand(0);

  // The compared value must be the CTTZ input.
  if (X != SUBS.getOperand(0))
    return SDValue();

  unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
                          ? CTTZ.getOperand(0).getValueSizeInBits()
                          : CTTZ.getValueSizeInBits();
  SDValue BitWidthMinusOne =
      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
                     BitWidthMinusOne);
}

// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
// Where x and y are constants and x != y

// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
// Where x and y are constants and x != y
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
  SDValue L = Op->getOperand(0);
  SDValue R = Op->getOperand(1);
  AArch64CC::CondCode OpCC =
      static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));

  SDValue OpCmp = Op->getOperand(3);
  if (!isCMP(OpCmp))
    return SDValue();

  SDValue CmpLHS = OpCmp.getOperand(0);
  SDValue CmpRHS = OpCmp.getOperand(1);

  // Normalise so the inner CSEL is on the LHS of the compare.
  if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
    std::swap(CmpLHS, CmpRHS);
  else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
    return SDValue();

  SDValue X = CmpLHS->getOperand(0);
  SDValue Y = CmpLHS->getOperand(1);
  if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
    return SDValue();
  }

  // If one of the constant is opaque constant, x,y sdnode is still different
  // but the real value maybe the same. So check APInt here to make sure the
  // code is correct.
  ConstantSDNode *CX = cast<ConstantSDNode>(X);
  ConstantSDNode *CY = cast<ConstantSDNode>(Y);
  if (CX->getAPIntValue() == CY->getAPIntValue())
    return SDValue();

  AArch64CC::CondCode CC =
      static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
  SDValue Cond = CmpLHS->getOperand(3);

  // Comparing against y selects the opposite arm, so invert the condition.
  if (CmpRHS == Y)
    CC = AArch64CC::getInvertedCondCode(CC);
  else if (CmpRHS != X)
    return SDValue();

  if (OpCC == AArch64CC::NE)
    CC = AArch64CC::getInvertedCondCode(CC);
  else if (OpCC != AArch64CC::EQ)
    return SDValue();

  SDLoc DL(Op);
  EVT VT = Op->getValueType(0);

  SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
}

// Optimize CSEL instructions
static SDValue performCSELCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG) {
  // CSEL x, x, cc -> x
  if (N->getOperand(0) == N->getOperand(1))
    return N->getOperand(0);

  if (SDValue R = foldCSELOfCSEL(N, DAG))
    return R;

  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
    return Folded;

  return performCONDCombine(N, DCI, DAG, 2, 3);
}

// Try to re-use an already extended operand of a vector SetCC feeding a
// extended select. Doing so avoids requiring another full extension of the
// SET_CC result when lowering the select.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
  EVT Op0MVT = Op->getOperand(0).getValueType();
  if (!Op0MVT.isVector() || Op->use_empty())
    return SDValue();

  // Make sure that all uses of Op are VSELECTs with result matching types where
  // the result type has a larger element type than the SetCC operand.
  SDNode *FirstUse = *Op->use_begin();
  if (FirstUse->getOpcode() != ISD::VSELECT)
    return SDValue();
  EVT UseMVT = FirstUse->getValueType(0);
  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
    return SDValue();
  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
      }))
    return SDValue();

  // The second operand must be a splat constant so an extended version can be
  // materialised directly below.
  APInt V;
  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
    return SDValue();

  SDLoc DL(Op);
  SDValue Op0ExtV;
  SDValue Op1ExtV;
  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
  // Check if the first operand of the SET_CC is already extended. If it is,
  // split the SET_CC and re-use the extended version of the operand.
  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
                                        Op->getOperand(0));
  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
                                        Op->getOperand(0));
  // The extension kind must agree with the signedness of the comparison;
  // equality comparisons accept either.
  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
    Op0ExtV = SDValue(Op0SExt, 0);
    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
    Op0ExtV = SDValue(Op0ZExt, 0);
    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
  } else
    return SDValue();

  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
                     Op0ExtV, Op1ExtV, Op->getOperand(2));
}

// Lower a bitwise (and/or/xor) reduction of an i1 vector early, before
// legalization widens the i1 elements.
static SDValue
performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               SelectionDAG &DAG) {
  SDValue Vec = N->getOperand(0);
  if (DCI.isBeforeLegalize() &&
      Vec.getValueType().getVectorElementType() == MVT::i1 &&
      Vec.getValueType().isFixedLengthVector() &&
      Vec.getValueType().isPow2VectorType()) {
    SDLoc DL(N);
    return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
                                  DAG);
  }

  return SDValue();
}

static SDValue performSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
    return V;

  // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
  if (Cond == ISD::SETNE && isOneConstant(RHS) &&
      LHS->getOpcode() == AArch64ISD::CSEL &&
      isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
      LHS->hasOneUse()) {
    // Invert CSEL's condition.
    auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
    auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
    auto NewCond = getInvertedCondCode(OldCond);

    // csel 0, 1, !cond, X
    SDValue CSEL =
        DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
                    LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
                    LHS.getOperand(3));
    return DAG.getZExtOrTrunc(CSEL, DL, VT);
  }

  // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
  if (Cond == ISD::SETNE && isNullConstant(RHS) &&
      LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
      LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
      LHS->hasOneUse()) {
    EVT TstVT = LHS->getValueType(0);
    if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
      // this pattern will get better opt in emitComparison
      uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
      SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
                                DAG.getConstant(TstImm, DL, TstVT));
      return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
    }
  }

  // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
  //   ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
  // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
  //   ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
  if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
      (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
      (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
      LHS->getOpcode() == ISD::BITCAST) {
    EVT ToVT = LHS->getValueType(0);
    EVT FromVT = LHS->getOperand(0).getValueType();
    if (FromVT.isFixedLengthVector() &&
        FromVT.getVectorElementType() ==
MVT::i1) { 22299 bool IsNull = isNullConstant(RHS); 22300 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND, 22301 DL, MVT::i1, LHS->getOperand(0)); 22302 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT, 22303 LHS); 22304 return DAG.getSetCC(DL, VT, LHS, RHS, Cond); 22305 } 22306 } 22307 22308 // Try to perform the memcmp when the result is tested for [in]equality with 0 22309 if (SDValue V = performOrXorChainCombine(N, DAG)) 22310 return V; 22311 22312 return SDValue(); 22313 } 22314 22315 // Replace a flag-setting operator (eg ANDS) with the generic version 22316 // (eg AND) if the flag is unused. 22317 static SDValue performFlagSettingCombine(SDNode *N, 22318 TargetLowering::DAGCombinerInfo &DCI, 22319 unsigned GenericOpcode) { 22320 SDLoc DL(N); 22321 SDValue LHS = N->getOperand(0); 22322 SDValue RHS = N->getOperand(1); 22323 EVT VT = N->getValueType(0); 22324 22325 // If the flag result isn't used, convert back to a generic opcode. 22326 if (!N->hasAnyUseOfValue(1)) { 22327 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops()); 22328 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)}, 22329 DL); 22330 } 22331 22332 // Combine identical generic nodes into this node, re-using the result. 22333 if (SDNode *Generic = DCI.DAG.getNodeIfExists( 22334 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS})) 22335 DCI.CombineTo(Generic, SDValue(N, 0)); 22336 22337 return SDValue(); 22338 } 22339 22340 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { 22341 // setcc_merge_zero pred 22342 // (sign_extend (extract_subvector (setcc_merge_zero ... 
pred ...))), 0, ne 22343 // => extract_subvector (inner setcc_merge_zero) 22344 SDValue Pred = N->getOperand(0); 22345 SDValue LHS = N->getOperand(1); 22346 SDValue RHS = N->getOperand(2); 22347 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); 22348 22349 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) || 22350 LHS->getOpcode() != ISD::SIGN_EXTEND) 22351 return SDValue(); 22352 22353 SDValue Extract = LHS->getOperand(0); 22354 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || 22355 Extract->getValueType(0) != N->getValueType(0) || 22356 Extract->getConstantOperandVal(1) != 0) 22357 return SDValue(); 22358 22359 SDValue InnerSetCC = Extract->getOperand(0); 22360 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) 22361 return SDValue(); 22362 22363 // By this point we've effectively got 22364 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive 22365 // lanes are already zero then the trunc(sext()) sequence is redundant and we 22366 // can operate on A directly. 
22367 SDValue InnerPred = InnerSetCC.getOperand(0); 22368 if (Pred.getOpcode() == AArch64ISD::PTRUE && 22369 InnerPred.getOpcode() == AArch64ISD::PTRUE && 22370 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) && 22371 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 && 22372 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256) 22373 return Extract; 22374 22375 return SDValue(); 22376 } 22377 22378 static SDValue 22379 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 22380 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && 22381 "Unexpected opcode!"); 22382 22383 SelectionDAG &DAG = DCI.DAG; 22384 SDValue Pred = N->getOperand(0); 22385 SDValue LHS = N->getOperand(1); 22386 SDValue RHS = N->getOperand(2); 22387 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); 22388 22389 if (SDValue V = performSetCCPunpkCombine(N, DAG)) 22390 return V; 22391 22392 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) && 22393 LHS->getOpcode() == ISD::SIGN_EXTEND && 22394 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) { 22395 // setcc_merge_zero( 22396 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0)) 22397 // => setcc_merge_zero(pred, ...) 22398 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && 22399 LHS->getOperand(0)->getOperand(0) == Pred) 22400 return LHS->getOperand(0); 22401 22402 // setcc_merge_zero( 22403 // all_active, extend(nxvNi1 ...), != splat(0)) 22404 // -> nxvNi1 ... 22405 if (isAllActivePredicate(DAG, Pred)) 22406 return LHS->getOperand(0); 22407 22408 // setcc_merge_zero( 22409 // pred, extend(nxvNi1 ...), != splat(0)) 22410 // -> nxvNi1 and(pred, ...) 22411 if (DCI.isAfterLegalizeDAG()) 22412 // Do this after legalization to allow more folds on setcc_merge_zero 22413 // to be recognized. 
22414 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), 22415 LHS->getOperand(0), Pred); 22416 } 22417 22418 return SDValue(); 22419 } 22420 22421 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test 22422 // as well as whether the test should be inverted. This code is required to 22423 // catch these cases (as opposed to standard dag combines) because 22424 // AArch64ISD::TBZ is matched during legalization. 22425 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, 22426 SelectionDAG &DAG) { 22427 22428 if (!Op->hasOneUse()) 22429 return Op; 22430 22431 // We don't handle undef/constant-fold cases below, as they should have 22432 // already been taken care of (e.g. and of 0, test of undefined shifted bits, 22433 // etc.) 22434 22435 // (tbz (trunc x), b) -> (tbz x, b) 22436 // This case is just here to enable more of the below cases to be caught. 22437 if (Op->getOpcode() == ISD::TRUNCATE && 22438 Bit < Op->getValueType(0).getSizeInBits()) { 22439 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22440 } 22441 22442 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 
22443 if (Op->getOpcode() == ISD::ANY_EXTEND && 22444 Bit < Op->getOperand(0).getValueSizeInBits()) { 22445 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22446 } 22447 22448 if (Op->getNumOperands() != 2) 22449 return Op; 22450 22451 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 22452 if (!C) 22453 return Op; 22454 22455 switch (Op->getOpcode()) { 22456 default: 22457 return Op; 22458 22459 // (tbz (and x, m), b) -> (tbz x, b) 22460 case ISD::AND: 22461 if ((C->getZExtValue() >> Bit) & 1) 22462 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22463 return Op; 22464 22465 // (tbz (shl x, c), b) -> (tbz x, b-c) 22466 case ISD::SHL: 22467 if (C->getZExtValue() <= Bit && 22468 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 22469 Bit = Bit - C->getZExtValue(); 22470 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22471 } 22472 return Op; 22473 22474 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x 22475 case ISD::SRA: 22476 Bit = Bit + C->getZExtValue(); 22477 if (Bit >= Op->getValueType(0).getSizeInBits()) 22478 Bit = Op->getValueType(0).getSizeInBits() - 1; 22479 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22480 22481 // (tbz (srl x, c), b) -> (tbz x, b+c) 22482 case ISD::SRL: 22483 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 22484 Bit = Bit + C->getZExtValue(); 22485 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22486 } 22487 return Op; 22488 22489 // (tbz (xor x, -1), b) -> (tbnz x, b) 22490 case ISD::XOR: 22491 if ((C->getZExtValue() >> Bit) & 1) 22492 Invert = !Invert; 22493 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 22494 } 22495 } 22496 22497 // Optimize test single bit zero/non-zero and branch. 
22498 static SDValue performTBZCombine(SDNode *N, 22499 TargetLowering::DAGCombinerInfo &DCI, 22500 SelectionDAG &DAG) { 22501 unsigned Bit = N->getConstantOperandVal(2); 22502 bool Invert = false; 22503 SDValue TestSrc = N->getOperand(1); 22504 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); 22505 22506 if (TestSrc == NewTestSrc) 22507 return SDValue(); 22508 22509 unsigned NewOpc = N->getOpcode(); 22510 if (Invert) { 22511 if (NewOpc == AArch64ISD::TBZ) 22512 NewOpc = AArch64ISD::TBNZ; 22513 else { 22514 assert(NewOpc == AArch64ISD::TBNZ); 22515 NewOpc = AArch64ISD::TBZ; 22516 } 22517 } 22518 22519 SDLoc DL(N); 22520 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, 22521 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); 22522 } 22523 22524 // Swap vselect operands where it may allow a predicated operation to achieve 22525 // the `sel`. 22526 // 22527 // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) 22528 // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) 22529 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) { 22530 auto SelectA = N->getOperand(1); 22531 auto SelectB = N->getOperand(2); 22532 auto NTy = N->getValueType(0); 22533 22534 if (!NTy.isScalableVector()) 22535 return SDValue(); 22536 SDValue SetCC = N->getOperand(0); 22537 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse()) 22538 return SDValue(); 22539 22540 switch (SelectB.getOpcode()) { 22541 default: 22542 return SDValue(); 22543 case ISD::FMUL: 22544 case ISD::FSUB: 22545 case ISD::FADD: 22546 break; 22547 } 22548 if (SelectA != SelectB.getOperand(0)) 22549 return SDValue(); 22550 22551 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); 22552 ISD::CondCode InverseCC = 22553 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType()); 22554 auto InverseSetCC = 22555 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0), 22556 SetCC.getOperand(1), InverseCC); 22557 22558 
return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy, 22559 {InverseSetCC, SelectB, SelectA}); 22560 } 22561 22562 // vselect (v1i1 setcc) -> 22563 // vselect (v1iXX setcc) (XX is the size of the compared operand type) 22564 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 22565 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 22566 // such VSELECT. 22567 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 22568 if (auto SwapResult = trySwapVSelectOperands(N, DAG)) 22569 return SwapResult; 22570 22571 SDValue N0 = N->getOperand(0); 22572 EVT CCVT = N0.getValueType(); 22573 22574 if (isAllActivePredicate(DAG, N0)) 22575 return N->getOperand(1); 22576 22577 if (isAllInactivePredicate(N0)) 22578 return N->getOperand(2); 22579 22580 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform 22581 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the 22582 // supported types. 22583 SDValue SetCC = N->getOperand(0); 22584 if (SetCC.getOpcode() == ISD::SETCC && 22585 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) { 22586 SDValue CmpLHS = SetCC.getOperand(0); 22587 EVT VT = CmpLHS.getValueType(); 22588 SDNode *CmpRHS = SetCC.getOperand(1).getNode(); 22589 SDNode *SplatLHS = N->getOperand(1).getNode(); 22590 SDNode *SplatRHS = N->getOperand(2).getNode(); 22591 APInt SplatLHSVal; 22592 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() && 22593 VT.isSimple() && 22594 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, 22595 MVT::v2i32, MVT::v4i32, MVT::v2i64}), 22596 VT.getSimpleVT().SimpleTy) && 22597 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) && 22598 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) && 22599 ISD::isConstantSplatVectorAllOnes(SplatRHS)) { 22600 unsigned NumElts = VT.getVectorNumElements(); 22601 SmallVector<SDValue, 8> Ops( 22602 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N), 22603 
VT.getScalarType())); 22604 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops); 22605 22606 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val); 22607 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1)); 22608 return Or; 22609 } 22610 } 22611 22612 EVT CmpVT = N0.getOperand(0).getValueType(); 22613 if (N0.getOpcode() != ISD::SETCC || 22614 CCVT.getVectorElementCount() != ElementCount::getFixed(1) || 22615 CCVT.getVectorElementType() != MVT::i1 || 22616 CmpVT.getVectorElementType().isFloatingPoint()) 22617 return SDValue(); 22618 22619 EVT ResVT = N->getValueType(0); 22620 // Only combine when the result type is of the same size as the compared 22621 // operands. 22622 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 22623 return SDValue(); 22624 22625 SDValue IfTrue = N->getOperand(1); 22626 SDValue IfFalse = N->getOperand(2); 22627 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 22628 N0.getOperand(0), N0.getOperand(1), 22629 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 22630 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 22631 IfTrue, IfFalse); 22632 } 22633 22634 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 22635 /// the compare-mask instructions rather than going via NZCV, even if LHS and 22636 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 22637 /// with a vector one followed by a DUP shuffle on the result. 22638 static SDValue performSelectCombine(SDNode *N, 22639 TargetLowering::DAGCombinerInfo &DCI) { 22640 SelectionDAG &DAG = DCI.DAG; 22641 SDValue N0 = N->getOperand(0); 22642 EVT ResVT = N->getValueType(0); 22643 22644 if (N0.getOpcode() != ISD::SETCC) 22645 return SDValue(); 22646 22647 if (ResVT.isScalableVT()) 22648 return SDValue(); 22649 22650 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 22651 // scalar SetCCResultType. 
We also don't expect vectors, because we assume 22652 // that selects fed by vector SETCCs are canonicalized to VSELECT. 22653 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 22654 "Scalar-SETCC feeding SELECT has unexpected result type!"); 22655 22656 // If NumMaskElts == 0, the comparison is larger than select result. The 22657 // largest real NEON comparison is 64-bits per lane, which means the result is 22658 // at most 32-bits and an illegal vector. Just bail out for now. 22659 EVT SrcVT = N0.getOperand(0).getValueType(); 22660 22661 // Don't try to do this optimization when the setcc itself has i1 operands. 22662 // There are no legal vectors of i1, so this would be pointless. v1f16 is 22663 // ruled out to prevent the creation of setcc that need to be scalarized. 22664 if (SrcVT == MVT::i1 || 22665 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16)) 22666 return SDValue(); 22667 22668 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 22669 if (!ResVT.isVector() || NumMaskElts == 0) 22670 return SDValue(); 22671 22672 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 22673 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 22674 22675 // Also bail out if the vector CCVT isn't the same size as ResVT. 22676 // This can happen if the SETCC operand size doesn't divide the ResVT size 22677 // (e.g., f64 vs v3f32). 22678 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 22679 return SDValue(); 22680 22681 // Make sure we didn't create illegal types, if we're not supposed to. 22682 assert(DCI.isBeforeLegalize() || 22683 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 22684 22685 // First perform a vector comparison, where lane 0 is the one we're interested 22686 // in. 
22687 SDLoc DL(N0); 22688 SDValue LHS = 22689 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 22690 SDValue RHS = 22691 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 22692 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 22693 22694 // Now duplicate the comparison mask we want across all other lanes. 22695 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 22696 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); 22697 Mask = DAG.getNode(ISD::BITCAST, DL, 22698 ResVT.changeVectorElementTypeToInteger(), Mask); 22699 22700 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 22701 } 22702 22703 static SDValue performDUPCombine(SDNode *N, 22704 TargetLowering::DAGCombinerInfo &DCI) { 22705 EVT VT = N->getValueType(0); 22706 SDLoc DL(N); 22707 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the 22708 // 128bit vector version. 22709 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { 22710 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); 22711 SmallVector<SDValue> Ops(N->ops()); 22712 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(), 22713 DCI.DAG.getVTList(LVT), Ops)) { 22714 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), 22715 DCI.DAG.getConstant(0, DL, MVT::i64)); 22716 } 22717 } 22718 22719 if (N->getOpcode() == AArch64ISD::DUP) { 22720 if (DCI.isAfterLegalizeDAG()) { 22721 // If scalar dup's operand is extract_vector_elt, try to combine them into 22722 // duplane. 
For example, 22723 // 22724 // t21: i32 = extract_vector_elt t19, Constant:i64<0> 22725 // t18: v4i32 = AArch64ISD::DUP t21 22726 // ==> 22727 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0> 22728 SDValue EXTRACT_VEC_ELT = N->getOperand(0); 22729 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 22730 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) { 22731 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 22732 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0), 22733 EXTRACT_VEC_ELT.getOperand(1)); 22734 } 22735 } 22736 } 22737 22738 return performPostLD1Combine(N, DCI, false); 22739 } 22740 22741 return SDValue(); 22742 } 22743 22744 /// Get rid of unnecessary NVCASTs (that don't change the type). 22745 static SDValue performNVCASTCombine(SDNode *N) { 22746 if (N->getValueType(0) == N->getOperand(0).getValueType()) 22747 return N->getOperand(0); 22748 22749 return SDValue(); 22750 } 22751 22752 // If all users of the globaladdr are of the form (globaladdr + constant), find 22753 // the smallest constant, fold it into the globaladdr's offset and rewrite the 22754 // globaladdr as (globaladdr + constant) - constant. 
22755 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, 22756 const AArch64Subtarget *Subtarget, 22757 const TargetMachine &TM) { 22758 auto *GN = cast<GlobalAddressSDNode>(N); 22759 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != 22760 AArch64II::MO_NO_FLAG) 22761 return SDValue(); 22762 22763 uint64_t MinOffset = -1ull; 22764 for (SDNode *N : GN->uses()) { 22765 if (N->getOpcode() != ISD::ADD) 22766 return SDValue(); 22767 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0)); 22768 if (!C) 22769 C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 22770 if (!C) 22771 return SDValue(); 22772 MinOffset = std::min(MinOffset, C->getZExtValue()); 22773 } 22774 uint64_t Offset = MinOffset + GN->getOffset(); 22775 22776 // Require that the new offset is larger than the existing one. Otherwise, we 22777 // can end up oscillating between two possible DAGs, for example, 22778 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). 22779 if (Offset <= uint64_t(GN->getOffset())) 22780 return SDValue(); 22781 22782 // Check whether folding this offset is legal. It must not go out of bounds of 22783 // the referenced object to avoid violating the code model, and must be 22784 // smaller than 2^20 because this is the largest offset expressible in all 22785 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF 22786 // stores an immediate signed 21 bit offset.) 22787 // 22788 // This check also prevents us from folding negative offsets, which will end 22789 // up being treated in the same way as large positive ones. They could also 22790 // cause code model violations, and aren't really common enough to matter. 
22791 if (Offset >= (1 << 20)) 22792 return SDValue(); 22793 22794 const GlobalValue *GV = GN->getGlobal(); 22795 Type *T = GV->getValueType(); 22796 if (!T->isSized() || 22797 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T)) 22798 return SDValue(); 22799 22800 SDLoc DL(GN); 22801 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset); 22802 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result, 22803 DAG.getConstant(MinOffset, DL, MVT::i64)); 22804 } 22805 22806 static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, 22807 const AArch64Subtarget *Subtarget) { 22808 SDValue BR = N->getOperand(0); 22809 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE || 22810 !BR.getValueType().isScalarInteger()) 22811 return SDValue(); 22812 22813 SDLoc DL(N); 22814 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0)); 22815 } 22816 22817 // Turns the vector of indices into a vector of byte offstes by scaling Offset 22818 // by (BitWidth / 8). 22819 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, 22820 SDLoc DL, unsigned BitWidth) { 22821 assert(Offset.getValueType().isScalableVector() && 22822 "This method is only for scalable vectors of offsets"); 22823 22824 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64); 22825 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift); 22826 22827 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift); 22828 } 22829 22830 /// Check if the value of \p OffsetInBytes can be used as an immediate for 22831 /// the gather load/prefetch and scatter store instructions with vector base and 22832 /// immediate offset addressing mode: 22833 /// 22834 /// [<Zn>.[S|D]{, #<imm>}] 22835 /// 22836 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. 
22837 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, 22838 unsigned ScalarSizeInBytes) { 22839 // The immediate is not a multiple of the scalar size. 22840 if (OffsetInBytes % ScalarSizeInBytes) 22841 return false; 22842 22843 // The immediate is out of range. 22844 if (OffsetInBytes / ScalarSizeInBytes > 31) 22845 return false; 22846 22847 return true; 22848 } 22849 22850 /// Check if the value of \p Offset represents a valid immediate for the SVE 22851 /// gather load/prefetch and scatter store instructiona with vector base and 22852 /// immediate offset addressing mode: 22853 /// 22854 /// [<Zn>.[S|D]{, #<imm>}] 22855 /// 22856 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. 22857 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset, 22858 unsigned ScalarSizeInBytes) { 22859 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode()); 22860 return OffsetConst && isValidImmForSVEVecImmAddrMode( 22861 OffsetConst->getZExtValue(), ScalarSizeInBytes); 22862 } 22863 22864 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, 22865 unsigned Opcode, 22866 bool OnlyPackedOffsets = true) { 22867 const SDValue Src = N->getOperand(2); 22868 const EVT SrcVT = Src->getValueType(0); 22869 assert(SrcVT.isScalableVector() && 22870 "Scatter stores are only possible for SVE vectors"); 22871 22872 SDLoc DL(N); 22873 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT(); 22874 22875 // Make sure that source data will fit into an SVE register 22876 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) 22877 return SDValue(); 22878 22879 // For FPs, ACLE only supports _packed_ single and double precision types. 22880 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes. 
22881 if (SrcElVT.isFloatingPoint()) 22882 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) && 22883 ((Opcode != AArch64ISD::SST1Q_PRED && 22884 Opcode != AArch64ISD::SST1Q_INDEX_PRED) || 22885 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16)))) 22886 return SDValue(); 22887 22888 // Depending on the addressing mode, this is either a pointer or a vector of 22889 // pointers (that fits into one register) 22890 SDValue Base = N->getOperand(4); 22891 // Depending on the addressing mode, this is either a single offset or a 22892 // vector of offsets (that fits into one register) 22893 SDValue Offset = N->getOperand(5); 22894 22895 // For "scalar + vector of indices", just scale the indices. This only 22896 // applies to non-temporal scatters because there's no instruction that takes 22897 // indicies. 22898 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { 22899 Offset = 22900 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); 22901 Opcode = AArch64ISD::SSTNT1_PRED; 22902 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) { 22903 Offset = 22904 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); 22905 Opcode = AArch64ISD::SST1Q_PRED; 22906 } 22907 22908 // In the case of non-temporal gather loads there's only one SVE instruction 22909 // per data-size: "scalar + vector", i.e. 22910 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] 22911 // Since we do have intrinsics that allow the arguments to be in a different 22912 // order, we may need to swap them to match the spec. 22913 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) && 22914 Offset.getValueType().isVector()) 22915 std::swap(Base, Offset); 22916 22917 // SST1_IMM requires that the offset is an immediate that is: 22918 // * a multiple of #SizeInBytes, 22919 // * in the range [0, 31 x #SizeInBytes], 22920 // where #SizeInBytes is the size in bytes of the stored items. 
For 22921 // immediates outside that range and non-immediate scalar offsets use SST1 or 22922 // SST1_UXTW instead. 22923 if (Opcode == AArch64ISD::SST1_IMM_PRED) { 22924 if (!isValidImmForSVEVecImmAddrMode(Offset, 22925 SrcVT.getScalarSizeInBits() / 8)) { 22926 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) 22927 Opcode = AArch64ISD::SST1_UXTW_PRED; 22928 else 22929 Opcode = AArch64ISD::SST1_PRED; 22930 22931 std::swap(Base, Offset); 22932 } 22933 } 22934 22935 auto &TLI = DAG.getTargetLoweringInfo(); 22936 if (!TLI.isTypeLegal(Base.getValueType())) 22937 return SDValue(); 22938 22939 // Some scatter store variants allow unpacked offsets, but only as nxv2i32 22940 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to 22941 // nxv2i64. Legalize accordingly. 22942 if (!OnlyPackedOffsets && 22943 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) 22944 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); 22945 22946 if (!TLI.isTypeLegal(Offset.getValueType())) 22947 return SDValue(); 22948 22949 // Source value type that is representable in hardware 22950 EVT HwSrcVt = getSVEContainerType(SrcVT); 22951 22952 // Keep the original type of the input data to store - this is needed to be 22953 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For 22954 // FP values we want the integer equivalent, so just use HwSrcVt. 
  SDValue InputVT = DAG.getValueType(SrcVT);
  if (SrcVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue SrcNew;

  // FP data is stored as its integer container type (bits reinterpreted);
  // narrow integer data is any-extended up to the hardware element type.
  if (Src.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);

  SDValue Ops[] = {N->getOperand(0), // Chain
                   SrcNew,
                   N->getOperand(3), // Pg
                   Base,
                   Offset,
                   InputVT};

  return DAG.getNode(Opcode, DL, VTs, Ops);
}

/// Combine an SVE gather-load intrinsic node into the corresponding
/// AArch64ISD gather node \p Opcode, legalizing the addressing mode
/// (base/offset operands) along the way. \p OnlyPackedOffsets is false for
/// the sxtw/uxtw variants, whose offset vector may arrive as an unpacked
/// nxv2i32 that must be widened to nxv2i64 here. Returns an empty SDValue
/// when the node cannot be combined.
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
                                        unsigned Opcode,
                                        bool OnlyPackedOffsets = true) {
  const EVT RetVT = N->getValueType(0);
  assert(RetVT.isScalableVector() &&
         "Gather loads are only possible for SVE vectors");

  SDLoc DL(N);

  // Make sure that the loaded data will fit into an SVE register
  if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  SDValue Base = N->getOperand(3);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets (that fits into one register)
  SDValue Offset = N->getOperand(4);

  // For "scalar + vector of indices", scale the indices to obtain unscaled
  // offsets. This applies to non-temporal and quadword gathers, which do not
  // have an addressing mode with scaled offset.
  if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
                                        RetVT.getScalarSizeInBits());
    Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
  } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
                                        RetVT.getScalarSizeInBits());
    Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
  }

  // In the case of non-temporal gather loads and quadword gather loads there's
  // only one addressing mode : "vector + scalar", e.g.
  //   ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
  // Since we do have intrinsics that allow the arguments to be in a different
  // order, we may need to swap them to match the spec.
  if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
       Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
      Offset.getValueType().isVector())
    std::swap(Base, Offset);

  // GLD{FF}1_IMM requires that the offset is an immediate that is:
  //    * a multiple of #SizeInBytes,
  //    * in the range [0, 31 x #SizeInBytes],
  // where #SizeInBytes is the size in bytes of the loaded items. For
  // immediates outside that range and non-immediate scalar offsets use
  // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
  if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
      Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
    if (!isValidImmForSVEVecImmAddrMode(Offset,
                                        RetVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
                     ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
      else
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
                     ? AArch64ISD::GLD1_MERGE_ZERO
                     : AArch64ISD::GLDFF1_MERGE_ZERO;

      // The "vector + immediate" form is rewritten to "scalar + vector", so
      // base and offset trade places.
      std::swap(Base, Offset);
    }
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  // Return value type that is representable in hardware
  EVT HwRetVt = getSVEContainerType(RetVT);

  // Keep the original output value type around - this is needed to be able to
  // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
  // values we want the integer equivalent, so just use HwRetVT.
  SDValue OutVT = DAG.getValueType(RetVT);
  if (RetVT.isFloatingPoint())
    OutVT = DAG.getValueType(HwRetVt);

  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
  SDValue Ops[] = {N->getOperand(0), // Chain
                   N->getOperand(2), // Pg
                   Base, Offset, OutVT};

  // Result 0 is the loaded data (in the hardware container type); result 1
  // is the chain.
  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  // Narrow integer results back down from the container type.
  if (RetVT.isInteger() && (RetVT != HwRetVt))
    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));

  // If the original return value was FP, bitcast accordingly. Doing it here
  // means that we can avoid adding TableGen patterns for FPs.
  if (RetVT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}

/// Combine SIGN_EXTEND_INREG: fold it into an unsigned unpack (turning it
/// into a signed unpack), or into an SVE (gather) load (turning it into the
/// sign-extending load variant).
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  unsigned Opc = Src->getOpcode();

  // Sign extend of an unsigned unpack -> signed unpack
  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {

    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
                                               : AArch64ISD::SUNPKLO;

    // Push the sign extend to the operand of the unpack
    // This is necessary where, for example, the operand of the unpack
    // is another unpack:
    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
    // ->
    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
    // ->
    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
    SDValue ExtOp = Src->getOperand(0);
    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
    EVT EltTy = VT.getVectorElementType();
    (void)EltTy;

    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
           "Sign extending from an invalid type");

    // The unpack's operand has twice as many (half-width) elements.
    EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());

    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
                              ExtOp, DAG.getValueType(ExtVT));

    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!EnableCombineMGatherIntrinsics)
    return SDValue();

  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
  unsigned NewOpc;
  // Operand index of the VTSDNode holding the memory VT: 3 for contiguous
  // loads (LD1/LDNF1/LDFF1), 4 for the gather variants.
  unsigned MemVTOpNum = 4;
  // Map each zero-extending SVE load opcode to its sign-extending twin.
  switch (Opc) {
  case AArch64ISD::LD1_MERGE_ZERO:
    NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::LDNF1_MERGE_ZERO:
    NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::LDFF1_MERGE_ZERO:
    NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::GLD1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
    break;
  case AArch64ISD::GLDNT1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
    break;
  default:
    return SDValue();
  }

  EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();

  // Only fold when the in-register extension width matches the memory type
  // and the load has no other users (the sign-extending load replaces it).
  if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);

  // The sign-extending load takes exactly the same operands as the original.
  SmallVector<SDValue, 5> Ops;
  for (unsigned I = 0; I < Src->getNumOperands(); ++I)
    Ops.push_back(Src->getOperand(I));

  SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
  DCI.CombineTo(N, ExtLoad);
  DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));

  // Return N so it doesn't get rechecked
  return SDValue(N, 0);
}

/// Legalize the gather prefetch (scalar + vector addressing mode) when the
/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
/// != nxv2i32) do not need legalization.
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
  const unsigned OffsetPos = 4;
  SDValue Offset = N->getOperand(OffsetPos);

  // Not an unpacked vector, bail out.
  if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
    return SDValue();

  // Extend the unpacked offset vector to 64-bit lanes.
  SDLoc DL(N);
  Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
  // Replace the offset operand with the 64-bit one.
  Ops[OffsetPos] = Offset;

  // Rebuild the node with the same opcode and chain result, only the offset
  // operand differs.
  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}

/// Combines a node carrying the intrinsic
/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// sve gather prefetch instruction with vector plus immediate addressing mode.
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
                                               unsigned ScalarSizeInBytes) {
  const unsigned ImmPos = 4, OffsetPos = 3;
  // No need to combine the node if the immediate is valid...
  if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
    return SDValue();

  // ...otherwise swap the offset base with the offset...
  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
  std::swap(Ops[ImmPos], Ops[OffsetPos]);
  // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
  // `aarch64_sve_prfb_gather_uxtw_index`. Operand 1 holds the intrinsic ID.
  SDLoc DL(N);
  Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
                           MVT::i64);

  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}

// Return true if the vector operation can guarantee only the first lane of its
// result contains data, with all bits in other lanes set to zero.
static bool isLanes1toNKnownZero(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    return false;
  // Predicated SVE reductions produce a scalar result in lane 0 with the
  // remaining lanes zeroed.
  case AArch64ISD::ANDV_PRED:
  case AArch64ISD::EORV_PRED:
  case AArch64ISD::FADDA_PRED:
  case AArch64ISD::FADDV_PRED:
  case AArch64ISD::FMAXNMV_PRED:
  case AArch64ISD::FMAXV_PRED:
  case AArch64ISD::FMINNMV_PRED:
  case AArch64ISD::FMINV_PRED:
  case AArch64ISD::ORV_PRED:
  case AArch64ISD::SADDV_PRED:
  case AArch64ISD::SMAXV_PRED:
  case AArch64ISD::SMINV_PRED:
  case AArch64ISD::UADDV_PRED:
  case AArch64ISD::UMAXV_PRED:
  case AArch64ISD::UMINV_PRED:
    return true;
  }
}

/// Fold insert_vector_elt(zero_vector, extract_vector_elt(X, 0), 0) -> X when
/// lanes 1..N of X are already known to be zero; the explicit re-zeroing is
/// then redundant.
static SDValue removeRedundantInsertVectorElt(SDNode *N) {
  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
  SDValue InsertVec = N->getOperand(0);
  SDValue InsertElt = N->getOperand(1);
  SDValue InsertIdx = N->getOperand(2);

  // We only care about inserts into the first element...
  if (!isNullConstant(InsertIdx))
    return SDValue();
  // ...of a zero'd vector...
  if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
    return SDValue();
  // ...where the inserted data was previously extracted...
  if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  SDValue ExtractVec = InsertElt.getOperand(0);
  SDValue ExtractIdx = InsertElt.getOperand(1);

  // ...from the first element of a vector.
  if (!isNullConstant(ExtractIdx))
    return SDValue();

  // If we get here we are effectively trying to zero lanes 1-N of a vector.

  // Ensure there's no type conversion going on.
  if (N->getValueType(0) != ExtractVec.getValueType())
    return SDValue();

  if (!isLanes1toNKnownZero(ExtractVec))
    return SDValue();

  // The explicit zeroing is redundant.
  return ExtractVec;
}

/// Combine INSERT_VECTOR_ELT: first try to drop redundant zeroing of upper
/// lanes, then fall back to the post-increment LD1 lane-load combine.
static SDValue
performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  if (SDValue Res = removeRedundantInsertVectorElt(N))
    return Res;

  return performPostLD1Combine(N, DCI, true);
}

/// Combine a non-integer VECTOR_SPLICE by performing the splice on the packed
/// integer equivalent type and bitcasting the result back.
static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
  EVT Ty = N->getValueType(0);
  if (Ty.isInteger())
    return SDValue();

  EVT IntTy = Ty.changeVectorElementTypeToInteger();
  EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
  // Bail out if the packed type would have narrower elements than the
  // original integer equivalent.
  if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
      IntTy.getVectorElementType().getScalarSizeInBits())
    return SDValue();

  SDLoc DL(N);
  // Bitcast each operand to the integer type, widen to the packed type,
  // splice there, then narrow and bitcast back.
  SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
                                     DL, ExtIntTy);
  SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
                                     DL, ExtIntTy);
  SDValue Idx = N->getOperand(2);
  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
  SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
  return DAG.getBitcast(Ty, Trunc);
}

/// Combine FP_EXTEND of a normal load into an extending load when SVE is used
/// for fixed-length vectors and the vector is wide enough to benefit.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
    return SDValue();

  auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
    EVT EltVT = VT.getVectorElementType();
    return EltVT == MVT::f32 || EltVT == MVT::f64;
  };

  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  // We purposefully don't care about legality of the nodes here as we know
  // they can be split down into something legal.
  if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
      VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
      VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(), LN0->getBasePtr(),
                                     N0.getValueType(), LN0->getMemOperand());
    DCI.CombineTo(N, ExtLoad);
    // Other users of the original load see a fp_round of the extending load.
    DCI.CombineTo(
        N0.getNode(),
        DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
                    DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
        ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  return SDValue();
}

/// Expand AArch64ISD::BSP as (Mask & In1) | (~Mask & In2) for scalable
/// vectors on targets without SVE2/SME.
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
                                      const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  // Don't expand for NEON, SVE2 or SME
  if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
    return SDValue();

  SDLoc DL(N);

  SDValue Mask = N->getOperand(0);
  SDValue In1 = N->getOperand(1);
  SDValue In2 = N->getOperand(2);

  SDValue InvMask = DAG.getNOT(DL, Mask, VT);
  SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
  SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
  return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
}

/// Combine
///   duplane128(insert_subvector(undef, bitcast(x), 0), 0)
/// into
///   bitcast(duplane128(insert_subvector(undef, x, 0), 0))
/// so the duplication happens on the subvector's own element type.
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue Insert = N->getOperand(0);
  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
    return SDValue();

  if (!Insert.getOperand(0).isUndef())
    return SDValue();

  // Both the insert position and the duplicated lane must be 0.
  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
  uint64_t IdxDupLane = N->getConstantOperandVal(1);
  if (IdxInsert != 0 || IdxDupLane != 0)
    return SDValue();

  SDValue Bitcast = Insert.getOperand(1);
  if (Bitcast.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue Subvec = Bitcast.getOperand(0);
  EVT SubvecVT = Subvec.getValueType();
  if (!SubvecVT.is128BitVector())
    return SDValue();
  EVT NewSubvecVT =
      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());

  SDLoc DL(N);
  SDValue NewInsert =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
                                      NewInsert, N->getOperand(1));
  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
23440 } 23441 23442 // Try to combine mull with uzp1. 23443 static SDValue tryCombineMULLWithUZP1(SDNode *N, 23444 TargetLowering::DAGCombinerInfo &DCI, 23445 SelectionDAG &DAG) { 23446 if (DCI.isBeforeLegalizeOps()) 23447 return SDValue(); 23448 23449 SDValue LHS = N->getOperand(0); 23450 SDValue RHS = N->getOperand(1); 23451 23452 SDValue ExtractHigh; 23453 SDValue ExtractLow; 23454 SDValue TruncHigh; 23455 SDValue TruncLow; 23456 SDLoc DL(N); 23457 23458 // Check the operands are trunc and extract_high. 23459 if (isEssentiallyExtractHighSubvector(LHS) && 23460 RHS.getOpcode() == ISD::TRUNCATE) { 23461 TruncHigh = RHS; 23462 if (LHS.getOpcode() == ISD::BITCAST) 23463 ExtractHigh = LHS.getOperand(0); 23464 else 23465 ExtractHigh = LHS; 23466 } else if (isEssentiallyExtractHighSubvector(RHS) && 23467 LHS.getOpcode() == ISD::TRUNCATE) { 23468 TruncHigh = LHS; 23469 if (LHS.getOpcode() == ISD::BITCAST) 23470 ExtractHigh = RHS.getOperand(0); 23471 else 23472 ExtractHigh = RHS; 23473 } else 23474 return SDValue(); 23475 23476 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op 23477 // with uzp1. 23478 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll 23479 SDValue TruncHighOp = TruncHigh.getOperand(0); 23480 EVT TruncHighOpVT = TruncHighOp.getValueType(); 23481 if (TruncHighOp.getOpcode() == AArch64ISD::DUP || 23482 DAG.isSplatValue(TruncHighOp, false)) 23483 return SDValue(); 23484 23485 // Check there is other extract_high with same source vector. 23486 // For example, 23487 // 23488 // t18: v4i16 = extract_subvector t2, Constant:i64<0> 23489 // t12: v4i16 = truncate t11 23490 // t31: v4i32 = AArch64ISD::SMULL t18, t12 23491 // t23: v4i16 = extract_subvector t2, Constant:i64<4> 23492 // t16: v4i16 = truncate t15 23493 // t30: v4i32 = AArch64ISD::SMULL t23, t1 23494 // 23495 // This dagcombine assumes the two extract_high uses same source vector in 23496 // order to detect the pair of the mull. 
If they have different source vector, 23497 // this code will not work. 23498 bool HasFoundMULLow = true; 23499 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0); 23500 if (ExtractHighSrcVec->use_size() != 2) 23501 HasFoundMULLow = false; 23502 23503 // Find ExtractLow. 23504 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) { 23505 if (User == ExtractHigh.getNode()) 23506 continue; 23507 23508 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || 23509 !isNullConstant(User->getOperand(1))) { 23510 HasFoundMULLow = false; 23511 break; 23512 } 23513 23514 ExtractLow.setNode(User); 23515 } 23516 23517 if (!ExtractLow || !ExtractLow->hasOneUse()) 23518 HasFoundMULLow = false; 23519 23520 // Check ExtractLow's user. 23521 if (HasFoundMULLow) { 23522 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin(); 23523 if (ExtractLowUser->getOpcode() != N->getOpcode()) { 23524 HasFoundMULLow = false; 23525 } else { 23526 if (ExtractLowUser->getOperand(0) == ExtractLow) { 23527 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE) 23528 TruncLow = ExtractLowUser->getOperand(1); 23529 else 23530 HasFoundMULLow = false; 23531 } else { 23532 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE) 23533 TruncLow = ExtractLowUser->getOperand(0); 23534 else 23535 HasFoundMULLow = false; 23536 } 23537 } 23538 } 23539 23540 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op 23541 // with uzp1. 23542 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll 23543 EVT TruncHighVT = TruncHigh.getValueType(); 23544 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext()); 23545 SDValue TruncLowOp = 23546 HasFoundMULLow ? 
TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT); 23547 EVT TruncLowOpVT = TruncLowOp.getValueType(); 23548 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP || 23549 DAG.isSplatValue(TruncLowOp, false))) 23550 return SDValue(); 23551 23552 // Create uzp1, extract_high and extract_low. 23553 if (TruncHighOpVT != UZP1VT) 23554 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp); 23555 if (TruncLowOpVT != UZP1VT) 23556 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp); 23557 23558 SDValue UZP1 = 23559 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp); 23560 SDValue HighIdxCst = 23561 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64); 23562 SDValue NewTruncHigh = 23563 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst); 23564 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh); 23565 23566 if (HasFoundMULLow) { 23567 EVT TruncLowVT = TruncLow.getValueType(); 23568 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT, 23569 UZP1, ExtractLow.getOperand(1)); 23570 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow); 23571 } 23572 23573 return SDValue(N, 0); 23574 } 23575 23576 static SDValue performMULLCombine(SDNode *N, 23577 TargetLowering::DAGCombinerInfo &DCI, 23578 SelectionDAG &DAG) { 23579 if (SDValue Val = 23580 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG)) 23581 return Val; 23582 23583 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG)) 23584 return Val; 23585 23586 return SDValue(); 23587 } 23588 23589 static SDValue 23590 performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 23591 SelectionDAG &DAG) { 23592 // Let's do below transform. 
  //
  //   t34: v4i32 = AArch64ISD::UADDLV t2
  //   t35: i32 = extract_vector_elt t34, Constant:i64<0>
  //   t7: i64 = zero_extend t35
  //   t20: v1i64 = scalar_to_vector t7
  // ==>
  //   t34: v4i32 = AArch64ISD::UADDLV t2
  //   t39: v2i32 = extract_subvector t34, Constant:i64<0>
  //   t40: v1i64 = AArch64ISD::NVCAST t39
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Match the pattern bottom-up, bailing out as soon as any node deviates:
  // scalar_to_vector(zero_extend(extract_vector_elt(UADDLV(v8i8), 0))).
  EVT VT = N->getValueType(0);
  if (VT != MVT::v1i64)
    return SDValue();

  SDValue ZEXT = N->getOperand(0);
  if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
    return SDValue();

  SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
  if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      EXTRACT_VEC_ELT.getValueType() != MVT::i32)
    return SDValue();

  if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
    return SDValue();

  // Only the UADDLV(v8i8) -> v4i32 shape is handled here.
  SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
  if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
      UADDLV.getValueType() != MVT::v4i32 ||
      UADDLV.getOperand(0).getValueType() != MVT::v8i8)
    return SDValue();

  // Let's generate new sequence with AArch64ISD::NVCAST.
  SDLoc DL(N);
  SDValue EXTRACT_SUBVEC =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
                  DAG.getConstant(0, DL, MVT::i64));
  SDValue NVCAST =
      DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);

  return NVCAST;
}

// Central DAG-combine dispatch: routes each opcode to the matching
// target-specific combine helper defined above.
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default:
    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
    break;
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
    return performVecReduceBitwiseCombine(N, DCI, DAG);
  case ISD::ADD:
  case ISD::SUB:
    return performAddSubCombine(N, DCI);
  case ISD::BUILD_VECTOR:
    return performBuildVectorCombine(N, DCI, DAG);
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DAG);
  case AArch64ISD::ANDS:
    return performFlagSettingCombine(N, DCI, ISD::AND);
  case AArch64ISD::ADC:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
      return R;
    return foldADCToCINC(N, DAG);
  case AArch64ISD::SBC:
    return foldOverflowCheck(N, DAG, /* IsAdd */ false);
  case AArch64ISD::ADCS:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
      return R;
    return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
  case AArch64ISD::SBCS:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
      return R;
    return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
  case ISD::XOR:
    return performXorCombine(N, DAG, DCI, Subtarget);
  case ISD::MUL:
    return performMulCombine(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performIntToFpCombine(N, DAG, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
return performFpToIntCombine(N, DAG, DCI, Subtarget); 23684 case ISD::FDIV: 23685 return performFDivCombine(N, DAG, DCI, Subtarget); 23686 case ISD::OR: 23687 return performORCombine(N, DCI, Subtarget, *this); 23688 case ISD::AND: 23689 return performANDCombine(N, DCI); 23690 case ISD::FADD: 23691 return performFADDCombine(N, DCI); 23692 case ISD::INTRINSIC_WO_CHAIN: 23693 return performIntrinsicCombine(N, DCI, Subtarget); 23694 case ISD::ANY_EXTEND: 23695 case ISD::ZERO_EXTEND: 23696 case ISD::SIGN_EXTEND: 23697 return performExtendCombine(N, DCI, DAG); 23698 case ISD::SIGN_EXTEND_INREG: 23699 return performSignExtendInRegCombine(N, DCI, DAG); 23700 case ISD::CONCAT_VECTORS: 23701 return performConcatVectorsCombine(N, DCI, DAG); 23702 case ISD::EXTRACT_SUBVECTOR: 23703 return performExtractSubvectorCombine(N, DCI, DAG); 23704 case ISD::INSERT_SUBVECTOR: 23705 return performInsertSubvectorCombine(N, DCI, DAG); 23706 case ISD::SELECT: 23707 return performSelectCombine(N, DCI); 23708 case ISD::VSELECT: 23709 return performVSelectCombine(N, DCI.DAG); 23710 case ISD::SETCC: 23711 return performSETCCCombine(N, DCI, DAG); 23712 case ISD::LOAD: 23713 return performLOADCombine(N, DCI, DAG, Subtarget); 23714 case ISD::STORE: 23715 return performSTORECombine(N, DCI, DAG, Subtarget); 23716 case ISD::MSTORE: 23717 return performMSTORECombine(N, DCI, DAG, Subtarget); 23718 case ISD::MGATHER: 23719 case ISD::MSCATTER: 23720 return performMaskedGatherScatterCombine(N, DCI, DAG); 23721 case ISD::VECTOR_SPLICE: 23722 return performSVESpliceCombine(N, DAG); 23723 case ISD::FP_EXTEND: 23724 return performFPExtendCombine(N, DAG, DCI, Subtarget); 23725 case AArch64ISD::BRCOND: 23726 return performBRCONDCombine(N, DCI, DAG); 23727 case AArch64ISD::TBNZ: 23728 case AArch64ISD::TBZ: 23729 return performTBZCombine(N, DCI, DAG); 23730 case AArch64ISD::CSEL: 23731 return performCSELCombine(N, DCI, DAG); 23732 case AArch64ISD::DUP: 23733 case AArch64ISD::DUPLANE8: 23734 case 
AArch64ISD::DUPLANE16: 23735 case AArch64ISD::DUPLANE32: 23736 case AArch64ISD::DUPLANE64: 23737 return performDUPCombine(N, DCI); 23738 case AArch64ISD::DUPLANE128: 23739 return performDupLane128Combine(N, DAG); 23740 case AArch64ISD::NVCAST: 23741 return performNVCASTCombine(N); 23742 case AArch64ISD::SPLICE: 23743 return performSpliceCombine(N, DAG); 23744 case AArch64ISD::UUNPKLO: 23745 case AArch64ISD::UUNPKHI: 23746 return performUnpackCombine(N, DAG, Subtarget); 23747 case AArch64ISD::UZP1: 23748 return performUzpCombine(N, DAG, Subtarget); 23749 case AArch64ISD::SETCC_MERGE_ZERO: 23750 return performSetccMergeZeroCombine(N, DCI); 23751 case AArch64ISD::REINTERPRET_CAST: 23752 return performReinterpretCastCombine(N); 23753 case AArch64ISD::GLD1_MERGE_ZERO: 23754 case AArch64ISD::GLD1_SCALED_MERGE_ZERO: 23755 case AArch64ISD::GLD1_UXTW_MERGE_ZERO: 23756 case AArch64ISD::GLD1_SXTW_MERGE_ZERO: 23757 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: 23758 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: 23759 case AArch64ISD::GLD1_IMM_MERGE_ZERO: 23760 case AArch64ISD::GLD1S_MERGE_ZERO: 23761 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO: 23762 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO: 23763 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO: 23764 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO: 23765 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO: 23766 case AArch64ISD::GLD1S_IMM_MERGE_ZERO: 23767 return performGLD1Combine(N, DAG); 23768 case AArch64ISD::VASHR: 23769 case AArch64ISD::VLSHR: 23770 return performVectorShiftCombine(N, *this, DCI); 23771 case AArch64ISD::SUNPKLO: 23772 return performSunpkloCombine(N, DAG); 23773 case AArch64ISD::BSP: 23774 return performBSPExpandForSVE(N, DAG, Subtarget); 23775 case ISD::INSERT_VECTOR_ELT: 23776 return performInsertVectorEltCombine(N, DCI); 23777 case ISD::EXTRACT_VECTOR_ELT: 23778 return performExtractVectorEltCombine(N, DCI, Subtarget); 23779 case ISD::VECREDUCE_ADD: 23780 return performVecReduceAddCombine(N, DCI.DAG, Subtarget); 23781 case 
AArch64ISD::UADDV: 23782 return performUADDVCombine(N, DAG); 23783 case AArch64ISD::SMULL: 23784 case AArch64ISD::UMULL: 23785 case AArch64ISD::PMULL: 23786 return performMULLCombine(N, DCI, DAG); 23787 case ISD::INTRINSIC_VOID: 23788 case ISD::INTRINSIC_W_CHAIN: 23789 switch (N->getConstantOperandVal(1)) { 23790 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: 23791 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/); 23792 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: 23793 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/); 23794 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: 23795 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/); 23796 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: 23797 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/); 23798 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: 23799 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: 23800 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: 23801 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: 23802 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: 23803 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: 23804 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: 23805 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: 23806 return legalizeSVEGatherPrefetchOffsVec(N, DAG); 23807 case Intrinsic::aarch64_neon_ld2: 23808 case Intrinsic::aarch64_neon_ld3: 23809 case Intrinsic::aarch64_neon_ld4: 23810 case Intrinsic::aarch64_neon_ld1x2: 23811 case Intrinsic::aarch64_neon_ld1x3: 23812 case Intrinsic::aarch64_neon_ld1x4: 23813 case Intrinsic::aarch64_neon_ld2lane: 23814 case Intrinsic::aarch64_neon_ld3lane: 23815 case Intrinsic::aarch64_neon_ld4lane: 23816 case Intrinsic::aarch64_neon_ld2r: 23817 case Intrinsic::aarch64_neon_ld3r: 23818 case Intrinsic::aarch64_neon_ld4r: 23819 case Intrinsic::aarch64_neon_st2: 23820 case Intrinsic::aarch64_neon_st3: 23821 case 
Intrinsic::aarch64_neon_st4: 23822 case Intrinsic::aarch64_neon_st1x2: 23823 case Intrinsic::aarch64_neon_st1x3: 23824 case Intrinsic::aarch64_neon_st1x4: 23825 case Intrinsic::aarch64_neon_st2lane: 23826 case Intrinsic::aarch64_neon_st3lane: 23827 case Intrinsic::aarch64_neon_st4lane: 23828 return performNEONPostLDSTCombine(N, DCI, DAG); 23829 case Intrinsic::aarch64_sve_ldnt1: 23830 return performLDNT1Combine(N, DAG); 23831 case Intrinsic::aarch64_sve_ld1rq: 23832 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); 23833 case Intrinsic::aarch64_sve_ld1ro: 23834 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); 23835 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: 23836 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 23837 case Intrinsic::aarch64_sve_ldnt1_gather: 23838 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 23839 case Intrinsic::aarch64_sve_ldnt1_gather_index: 23840 return performGatherLoadCombine(N, DAG, 23841 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); 23842 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: 23843 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO); 23844 case Intrinsic::aarch64_sve_ld1: 23845 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO); 23846 case Intrinsic::aarch64_sve_ldnf1: 23847 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO); 23848 case Intrinsic::aarch64_sve_ldff1: 23849 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO); 23850 case Intrinsic::aarch64_sve_st1: 23851 return performST1Combine(N, DAG); 23852 case Intrinsic::aarch64_sve_stnt1: 23853 return performSTNT1Combine(N, DAG); 23854 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: 23855 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 23856 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 23857 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 23858 case 
Intrinsic::aarch64_sve_stnt1_scatter: 23859 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED); 23860 case Intrinsic::aarch64_sve_stnt1_scatter_index: 23861 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED); 23862 case Intrinsic::aarch64_sve_ld1_gather: 23863 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO); 23864 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: 23865 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: 23866 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO); 23867 case Intrinsic::aarch64_sve_ld1q_gather_index: 23868 return performGatherLoadCombine(N, DAG, 23869 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO); 23870 case Intrinsic::aarch64_sve_ld1_gather_index: 23871 return performGatherLoadCombine(N, DAG, 23872 AArch64ISD::GLD1_SCALED_MERGE_ZERO); 23873 case Intrinsic::aarch64_sve_ld1_gather_sxtw: 23874 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO, 23875 /*OnlyPackedOffsets=*/false); 23876 case Intrinsic::aarch64_sve_ld1_gather_uxtw: 23877 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO, 23878 /*OnlyPackedOffsets=*/false); 23879 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: 23880 return performGatherLoadCombine(N, DAG, 23881 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, 23882 /*OnlyPackedOffsets=*/false); 23883 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: 23884 return performGatherLoadCombine(N, DAG, 23885 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, 23886 /*OnlyPackedOffsets=*/false); 23887 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: 23888 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO); 23889 case Intrinsic::aarch64_sve_ldff1_gather: 23890 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO); 23891 case Intrinsic::aarch64_sve_ldff1_gather_index: 23892 return performGatherLoadCombine(N, DAG, 23893 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); 23894 case 
Intrinsic::aarch64_sve_ldff1_gather_sxtw: 23895 return performGatherLoadCombine(N, DAG, 23896 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, 23897 /*OnlyPackedOffsets=*/false); 23898 case Intrinsic::aarch64_sve_ldff1_gather_uxtw: 23899 return performGatherLoadCombine(N, DAG, 23900 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, 23901 /*OnlyPackedOffsets=*/false); 23902 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: 23903 return performGatherLoadCombine(N, DAG, 23904 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, 23905 /*OnlyPackedOffsets=*/false); 23906 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: 23907 return performGatherLoadCombine(N, DAG, 23908 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, 23909 /*OnlyPackedOffsets=*/false); 23910 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: 23911 return performGatherLoadCombine(N, DAG, 23912 AArch64ISD::GLDFF1_IMM_MERGE_ZERO); 23913 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: 23914 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: 23915 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED); 23916 case Intrinsic::aarch64_sve_st1q_scatter_index: 23917 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED); 23918 case Intrinsic::aarch64_sve_st1_scatter: 23919 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED); 23920 case Intrinsic::aarch64_sve_st1_scatter_index: 23921 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED); 23922 case Intrinsic::aarch64_sve_st1_scatter_sxtw: 23923 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED, 23924 /*OnlyPackedOffsets=*/false); 23925 case Intrinsic::aarch64_sve_st1_scatter_uxtw: 23926 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED, 23927 /*OnlyPackedOffsets=*/false); 23928 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: 23929 return performScatterStoreCombine(N, DAG, 23930 AArch64ISD::SST1_SXTW_SCALED_PRED, 23931 /*OnlyPackedOffsets=*/false); 23932 case 
Intrinsic::aarch64_sve_st1_scatter_uxtw_index: 23933 return performScatterStoreCombine(N, DAG, 23934 AArch64ISD::SST1_UXTW_SCALED_PRED, 23935 /*OnlyPackedOffsets=*/false); 23936 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: 23937 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED); 23938 case Intrinsic::aarch64_rndr: 23939 case Intrinsic::aarch64_rndrrs: { 23940 unsigned IntrinsicID = N->getConstantOperandVal(1); 23941 auto Register = 23942 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR 23943 : AArch64SysReg::RNDRRS); 23944 SDLoc DL(N); 23945 SDValue A = DAG.getNode( 23946 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other), 23947 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64)); 23948 SDValue B = DAG.getNode( 23949 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32), 23950 DAG.getConstant(0, DL, MVT::i32), 23951 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1)); 23952 return DAG.getMergeValues( 23953 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); 23954 } 23955 default: 23956 break; 23957 } 23958 break; 23959 case ISD::GlobalAddress: 23960 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); 23961 case ISD::CTLZ: 23962 return performCTLZCombine(N, DAG, Subtarget); 23963 case ISD::SCALAR_TO_VECTOR: 23964 return performScalarToVectorCombine(N, DCI, DAG); 23965 } 23966 return SDValue(); 23967 } 23968 23969 // Check if the return value is used as only a return value, as otherwise 23970 // we can't perform a tail-call. In particular, we need to check for 23971 // target ISD nodes that are returns and any other "odd" constructs 23972 // that the generic analysis code won't necessarily catch. 
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
                                               SDValue &Chain) const {
  // Only a single-result node with exactly one user can feed a return.
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
        MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  // Every user of the copy must be a return; if any other user exists the
  // value escapes and we cannot tail-call.
  bool HasRet = false;
  for (SDNode *Node : Copy->uses()) {
    if (Node->getOpcode() != AArch64ISD::RET_GLUE)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

// Return whether the an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  return CI->isTailCall();
}

// GlobalISel hook: decide whether a pre/post-indexed access with the given
// constant offset register is encodable.
bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
                                            Register Offset, bool IsPre,
                                            MachineRegisterInfo &MRI) const {
  // Only constant, non-zero offsets can be folded into an indexed access.
  auto CstOffset = getIConstantVRegVal(Offset, MRI);
  if (!CstOffset || CstOffset->isZero())
    return false;

  // All of the indexed addressing mode instructions take a signed 9 bit
  // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
  // encodes the sign/indexing direction.
  return isInt<9>(CstOffset->getSExtValue());
}

// Shared helper for the pre/post-indexed hooks below: Op must be an ADD/SUB
// of the pointer with an in-range constant, and the loaded value must not be
// better served by a replicating load (ld1r*).
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDValue &Base,
                                                   SDValue &Offset,
                                                   SelectionDAG &DAG) const {
  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
    return false;

  // Non-null if there is exactly one user of the loaded value (ignoring chain).
  SDNode *ValOnlyUser = nullptr;
  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
       ++UI) {
    if (UI.getUse().getResNo() == 1)
      continue; // Ignore chain.
    if (ValOnlyUser == nullptr)
      ValOnlyUser = *UI;
    else {
      ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
      break;
    }
  }

  auto IsUndefOrZero = [](SDValue V) {
    return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
  };

  // If the only user of the value is a scalable vector splat, it is
  // preferable to do a replicating load (ld1r*).
  if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
      (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
       (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
        IsUndefOrZero(ValOnlyUser->getOperand(2)))))
    return false;

  Base = Op->getOperand(0);
  // All of the indexed addressing mode instructions take a signed
  // 9 bit immediate offset.
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
    int64_t RHSC = RHS->getSExtValue();
    if (Op->getOpcode() == ISD::SUB)
      RHSC = -(uint64_t)RHSC;
    if (!isInt<9>(RHSC))
      return false;
    // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
    // when dealing with subtraction.
    Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
    return true;
  }
  return false;
}

bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
    return false;
  AM = ISD::PRE_INC;
  return true;
}

bool AArch64TargetLowering::getPostIndexedAddressParts(
    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = ISD::POST_INC;
  return true;
}

// Replace a bitcast of an <N x i1> vector with an equivalent scalar bitmask
// computation (see vectorToScalarBitmask), zero-extended/truncated to the
// result type.
static void replaceBoolVectorBitcast(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);
  [[maybe_unused]] EVT SrcVT = Op.getValueType();
  assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
         "Must be bool vector.");

  // Special handling for Clang's __builtin_convertvector. For vectors with <8
  // elements, it adds a vector concatenation with undef(s). If we encounter
  // this here, we can skip the concat.
  if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
    bool AllUndef = true;
    for (unsigned I = 1; I < Op.getNumOperands(); ++I)
      AllUndef &= Op.getOperand(I).isUndef();

    if (AllUndef)
      Op = Op.getOperand(0);
  }

  SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
  if (VectorBits)
    Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
}

// Expand an illegal scalar->small-vector bitcast by going through wider legal
// vector types: insert the scalar into lane zero of ExtendVT, bitcast to
// CastVT, then extract the low subvector of the requested result type.
static void CustomNonLegalBITCASTResults(SDNode *N,
                                         SmallVectorImpl<SDValue> &Results,
                                         SelectionDAG &DAG, EVT ExtendVT,
                                         EVT CastVT) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Use SCALAR_TO_VECTOR for lane zero
  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
  SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
  SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
  Results.push_back(
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
}

// Custom result-type legalization for BITCAST nodes whose result type is not
// legal (small fixed vectors, unpacked scalable vectors, i16 <- f16/bf16).
void AArch64TargetLowering::ReplaceBITCASTResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Op.getValueType();

  if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
    return;
  }

  if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
    return;
  }

  if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
    CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
    return;
  }

  if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
    assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
           "Expected fp->int bitcast!");

    // Bitcasting between unpacked vector types of different element counts is
    // not a NOP because the live elements are laid out differently.
    //                01234567
    // e.g. nxv2i32 = XX??XX??
    //      nxv4f16 = X?X?X?X?
    if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
      return;

    SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
    return;
  }

  if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1)
    return replaceBoolVectorBitcast(N, Results, DAG);

  if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
    return;

  // i16 <- f16/bf16: move the half into the low 16 bits of an f32 register,
  // bitcast to i32 and truncate.
  Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
                                 DAG.getUNDEF(MVT::i32), Op);
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}

// Recognize a 256-bit ADD/FADD of X with SHUFFLE(X, <1,0,3,2,5,4,...>) and
// replace it with an ADDP of the two 128-bit halves, then shuffle the pairwise
// sums back into the original lane order.
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
                               SelectionDAG &DAG,
                               const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  // FP pairwise reassociation requires the reassoc flag; f16 needs FullFP16.
  if (!VT.is256BitVector() ||
      (VT.getScalarType().isFloatingPoint() &&
       !N->getFlags().hasAllowReassociation()) ||
      (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
    return;

  // The shuffle may be on either side of the add.
  SDValue X = N->getOperand(0);
  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
  if (!Shuf) {
    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
    X = N->getOperand(1);
    if (!Shuf)
      return;
  }

  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
    return;

  // Check the mask is 1,0,3,2,5,4,...
  ArrayRef<int> Mask = Shuf->getMask();
  for (int I = 0, E = Mask.size(); I < E; I++)
    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
      return;

  SDLoc DL(N);
  auto LoHi = DAG.SplitVector(X, DL);
  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
                             LoHi.first, LoHi.second);

  // Shuffle the elements back into order.
  SmallVector<int> NMask;
  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
    NMask.push_back(I);
    NMask.push_back(I);
  }
  Results.push_back(
      DAG.getVectorShuffle(VT, DL,
                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
                                       DAG.getUNDEF(LoHi.first.getValueType())),
                           DAG.getUNDEF(VT), NMask));
}

// Legalize a wide across-vector reduction by splitting the operand in half,
// combining the halves with InterOp, then applying the across-vector
// reduction AcrossOp to the combined half-width vector.
static void ReplaceReductionResults(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG, unsigned InterOp,
                                    unsigned AcrossOp) {
  EVT LoVT, HiVT;
  SDValue Lo, Hi;
  SDLoc dl(N);
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
  Results.push_back(SplitVal);
}

// Legalize EXTRACT_SUBVECTOR of exactly half of a scalable integer vector by
// lowering to UUNPKLO/UUNPKHI (which widen the elements) followed by a
// truncate back to the requested element type.
void AArch64TargetLowering::ReplaceExtractSubVectorResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();

  // Common code will handle these just fine.
  if (!InVT.isScalableVector() || !InVT.isInteger())
    return;

  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // The following checks bail if this is not a halving operation.

  ElementCount ResEC = VT.getVectorElementCount();

  if (InVT.getVectorElementCount() != (ResEC * 2))
    return;

  auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!CIndex)
    return;

  // Only the low half (index 0) or the high half (index == half element
  // count) can map onto an unpack instruction.
  unsigned Index = CIndex->getZExtValue();
  if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
    return;

  unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
  EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());

  SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
}

// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  // Split the i128 into two i64 halves; on big-endian the high half occupies
  // the even (sube64) position.
  auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
  if (DAG.getDataLayout().isBigEndian())
    std::swap (VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}

// Legalize a 128-bit ATOMIC_CMP_SWAP: use CASP when LSE (or outlined atomics)
// is available, otherwise fall back to the CMP_SWAP_128* pseudos.
static void ReplaceCMP_SWAP_128Results(SDNode *N,
                                       SmallVectorImpl<SDValue> &Results,
                                       SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget) {
  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicCmpSwap on types less than 128 should be legal");

  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
    SDValue Ops[] = {
        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
        createGPRPairNode(DAG, N->getOperand(3)), // Store value
        N->getOperand(1), // Ptr
        N->getOperand(0), // Chain in
    };

    // Pick the CASP variant matching the merged memory ordering.
    unsigned Opcode;
    switch (MemOp->getMergedOrdering()) {
    case AtomicOrdering::Monotonic:
      Opcode = AArch64::CASPX;
      break;
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }

    MachineSDNode *CmpSwap = DAG.getMachineNode(
        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
    DAG.setNodeMemRefs(CmpSwap, {MemOp});

    // Extract the two halves of the result pair; swap for big-endian so the
    // BUILD_PAIR below always receives (Lo, Hi).
    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
    if (DAG.getDataLayout().isBigEndian())
      std::swap(SubReg1, SubReg2);
    SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
                                            SDValue(CmpSwap, 0));
    SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
                                            SDValue(CmpSwap, 0));
    Results.push_back(
        DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
    return;
  }

  // No LSE: use the CMP_SWAP_128* pseudo matching the merged ordering.
  unsigned Opcode;
  switch (MemOp->getMergedOrdering()) {
  case AtomicOrdering::Monotonic:
    Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
    break;
  case AtomicOrdering::Acquire:
    Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
    break;
  case AtomicOrdering::Release:
    Opcode = AArch64::CMP_SWAP_128_RELEASE;
    break;
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    Opcode = AArch64::CMP_SWAP_128;
    break;
  default:
    llvm_unreachable("Unexpected ordering!");
  }

  SDLoc DL(N);
  auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
  auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
                   New.first, New.second, N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
      Ops);
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                                SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
  Results.push_back(SDValue(CmpSwap, 3));
}

// Map a 128-bit atomicrmw ISD opcode plus memory ordering to the matching
// LSE128 machine instruction (LDCLRP*/LDSETP*/SWPP*).
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
                                       AtomicOrdering Ordering) {
  // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
  // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
  // the type is not legal. Therefore we shouldn't expect to see a 128-bit
  // ATOMIC_LOAD_CLR at any point.
  assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
         "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
  assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
  assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");

  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
    // The operand will need to be XORed in a separate step.
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::LDCLRP;
      break;
    case AtomicOrdering::Acquire:
      return AArch64::LDCLRPA;
      break;
    case AtomicOrdering::Release:
      return AArch64::LDCLRPL;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::LDCLRPAL;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
  }

  if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::LDSETP;
      break;
    case AtomicOrdering::Acquire:
      return AArch64::LDSETPA;
      break;
    case AtomicOrdering::Release:
      return AArch64::LDSETPL;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::LDSETPAL;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
  }

  if (ISDOpcode == ISD::ATOMIC_SWAP) {
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::SWPP;
      break;
    case AtomicOrdering::Acquire:
      return AArch64::SWPPA;
      break;
    case AtomicOrdering::Release:
      return AArch64::SWPPL;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::SWPPAL;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
  }

  llvm_unreachable("Unexpected ISDOpcode!");
}

static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG,
                                          const AArch64Subtarget *Subtarget) {
  // LSE128 has a 128-bit RMW ops, but i128 is not a legal type, so lower it
  // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
  // rather than the CASP instructions, because CASP has register classes for
  // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
  // to present them as single operands. LSE128 instructions use the GPR64
  // register class (because the pair does not have to be sequential), like
  // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.

  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicLoadXXX on types less than 128 should be legal");

  if (!Subtarget->hasLSE128())
    return;

  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  const SDValue &Chain = N->getOperand(0);
  const SDValue &Ptr = N->getOperand(1);
  const SDValue &Val128 = N->getOperand(2);
  std::pair<SDValue, SDValue> Val2x64 =
      DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);

  const unsigned ISDOpcode = N->getOpcode();
  const unsigned MachineOpcode =
      getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());

  if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
    // LDCLRP performs AND-NOT, so invert both halves of the operand first.
    SDLoc dl(Val128);
    Val2x64.first =
        DAG.getNode(ISD::XOR, dl, MVT::i64,
                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
    Val2x64.second =
        DAG.getNode(ISD::XOR, dl, MVT::i64,
                    DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
  }

  SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
  // Operand and result halves are in reverse order on big-endian targets.
  if (DAG.getDataLayout().isBigEndian())
    std::swap(Ops[0], Ops[1]);

  MachineSDNode *AtomicInst =
      DAG.getMachineNode(MachineOpcode, SDLoc(N),
                         DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);

  DAG.setNodeMemRefs(AtomicInst, {MemOp});

  SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
  if (DAG.getDataLayout().isBigEndian())
    std::swap(Lo, Hi);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
  Results.push_back(SDValue(AtomicInst, 2)); // Chain out
}

// Custom result-type legalization entry point: replace nodes whose result
// type is illegal with legal-typed equivalents appended to Results. Leaving
// Results untouched delegates to common legalization code.
void AArch64TargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this");
  case ISD::BITCAST:
    ReplaceBITCASTResults(N, Results, DAG);
    return;
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
    return;
  case ISD::ADD:
  case ISD::FADD:
    ReplaceAddWithADDP(N, Results, DAG, Subtarget);
    return;

  case ISD::CTPOP:
  case ISD::PARITY:
    if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
      Results.push_back(Result);
    return;
  case AArch64ISD::SADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
    return;
  case AArch64ISD::UADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
    return;
  case AArch64ISD::SMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
    return;
  case AArch64ISD::UMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
    return;
  case AArch64ISD::SMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
    return;
  case AArch64ISD::UMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
    return;
  case ISD::MULHS:
    if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
      Results.push_back(
          LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
    return;
  case ISD::MULHU:
    if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
      Results.push_back(
          LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
    return;
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    // Let normal code take care of it by not adding anything to Results.
    return;
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
    return;
  case ISD::ATOMIC_LOAD_CLR:
    assert(N->getValueType(0) != MVT::i128 &&
           "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
    break;
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_SWAP: {
    assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
           "Expected 128-bit atomicrmw.");
    // These need custom type legalisation so we go directly to instruction.
    ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
    return;
  }
  case ISD::ATOMIC_LOAD:
  case ISD::LOAD: {
    MemSDNode *LoadNode = cast<MemSDNode>(N);
    EVT MemVT = LoadNode->getMemoryVT();
    // Handle lowering 256 bit non temporal loads into LDNP for little-endian
    // targets.
    if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
        MemVT.getSizeInBits() == 256u &&
        (MemVT.getScalarSizeInBits() == 8u ||
         MemVT.getScalarSizeInBits() == 16u ||
         MemVT.getScalarSizeInBits() == 32u ||
         MemVT.getScalarSizeInBits() == 64u)) {

      SDValue Result = DAG.getMemIntrinsicNode(
          AArch64ISD::LDNP, SDLoc(N),
          DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                         MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                         MVT::Other}),
          {LoadNode->getChain(), LoadNode->getBasePtr()},
          LoadNode->getMemoryVT(), LoadNode->getMemOperand());

      SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
                                 Result.getValue(0), Result.getValue(1));
      Results.append({Pair, Result.getValue(2) /* Chain */});
      return;
    }

    if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
        LoadNode->getMemoryVT() != MVT::i128) {
      // Non-volatile or atomic loads are optimized later in AArch64's load/store
      // optimizer.
      return;
    }

    if (SDValue(N, 0).getValueType() == MVT::i128) {
      auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
      bool isLoadAcquire =
          AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
      // Acquire i128 loads use LDIAPP (RCpc3); otherwise plain LDP.
      unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;

      if (isLoadAcquire)
        assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));

      SDValue Result = DAG.getMemIntrinsicNode(
          Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
          {LoadNode->getChain(), LoadNode->getBasePtr()},
          LoadNode->getMemoryVT(), LoadNode->getMemOperand());

      unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;

      SDValue Pair =
          DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
                      Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
      Results.append({Pair, Result.getValue(2) /* Chain */});
    }
    return;
  }
  case ISD::EXTRACT_SUBVECTOR:
    ReplaceExtractSubVectorResults(N, Results, DAG);
    return;
  case ISD::INSERT_SUBVECTOR:
  case ISD::CONCAT_VECTORS:
    // Custom lowering has been requested for INSERT_SUBVECTOR and
    // CONCAT_VECTORS -- but delegate to common code for result type
    // legalisation
    return;
  case ISD::INTRINSIC_WO_CHAIN: {
    EVT VT = N->getValueType(0);
    assert((VT == MVT::i8 || VT == MVT::i16) &&
           "custom lowering for unexpected type");

    ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default:
      return;
    case Intrinsic::aarch64_sve_clasta_n: {
      SDLoc DL(N);
      // Widen the i8/i16 scalar operand to i32, compute in i32, truncate back.
      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
      auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
                           N->getOperand(1), Op2, N->getOperand(3));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    case Intrinsic::aarch64_sve_clastb_n: {
      SDLoc DL(N);
      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
      auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
                           N->getOperand(1), Op2, N->getOperand(3));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    case Intrinsic::aarch64_sve_lasta: {
      SDLoc DL(N);
      auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
                           N->getOperand(1), N->getOperand(2));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    case Intrinsic::aarch64_sve_lastb: {
      SDLoc DL(N);
      auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
                           N->getOperand(1), N->getOperand(2));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    }
  }
  case ISD::READ_REGISTER: {
    SDLoc DL(N);
    assert(N->getValueType(0) == MVT::i128 &&
           "READ_REGISTER custom lowering is only for 128-bit sysregs");
    SDValue Chain = N->getOperand(0);
    SDValue SysRegName = N->getOperand(1);

    SDValue Result = DAG.getNode(
        AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
        Chain, SysRegName);

    // Sysregs are not endian. Result.getValue(0) always contains the lower half
    // of the 128-bit System Register value.
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                               Result.getValue(0), Result.getValue(1));
    Results.push_back(Pair);
    Results.push_back(Result.getValue(2)); // Chain
    return;
  }
  }
}

// Prefer the generic LOAD_STACK_GUARD node except on targets with their own
// stack-guard ABI (Android/Fuchsia defer to the base implementation).
bool AArch64TargetLowering::useLoadStackGuardNode() const {
  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
    return TargetLowering::useLoadStackGuardNode();
  return true;
}

unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are three or more FDIVs.
  return 3;
}

TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
  // v4i16, v2i32 instead of to promote.
24768 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || 24769 VT == MVT::v1f32) 24770 return TypeWidenVector; 24771 24772 return TargetLoweringBase::getPreferredVectorAction(VT); 24773 } 24774 24775 // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic 24776 // provided the address is 16-byte aligned. 24777 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { 24778 if (!Subtarget->hasLSE2()) 24779 return false; 24780 24781 if (auto LI = dyn_cast<LoadInst>(I)) 24782 return LI->getType()->getPrimitiveSizeInBits() == 128 && 24783 LI->getAlign() >= Align(16); 24784 24785 if (auto SI = dyn_cast<StoreInst>(I)) 24786 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && 24787 SI->getAlign() >= Align(16); 24788 24789 return false; 24790 } 24791 24792 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const { 24793 if (!Subtarget->hasLSE128()) 24794 return false; 24795 24796 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP 24797 // will clobber the two registers. 
24798 if (const auto *SI = dyn_cast<StoreInst>(I)) 24799 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && 24800 SI->getAlign() >= Align(16) && 24801 (SI->getOrdering() == AtomicOrdering::Release || 24802 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent); 24803 24804 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I)) 24805 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 && 24806 RMW->getAlign() >= Align(16) && 24807 (RMW->getOperation() == AtomicRMWInst::Xchg || 24808 RMW->getOperation() == AtomicRMWInst::And || 24809 RMW->getOperation() == AtomicRMWInst::Or); 24810 24811 return false; 24812 } 24813 24814 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const { 24815 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3()) 24816 return false; 24817 24818 if (auto LI = dyn_cast<LoadInst>(I)) 24819 return LI->getType()->getPrimitiveSizeInBits() == 128 && 24820 LI->getAlign() >= Align(16) && 24821 LI->getOrdering() == AtomicOrdering::Acquire; 24822 24823 if (auto SI = dyn_cast<StoreInst>(I)) 24824 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && 24825 SI->getAlign() >= Align(16) && 24826 SI->getOrdering() == AtomicOrdering::Release; 24827 24828 return false; 24829 } 24830 24831 bool AArch64TargetLowering::shouldInsertFencesForAtomic( 24832 const Instruction *I) const { 24833 if (isOpSuitableForRCPC3(I)) 24834 return false; 24835 if (isOpSuitableForLSE128(I)) 24836 return false; 24837 if (isOpSuitableForLDPSTP(I)) 24838 return true; 24839 return false; 24840 } 24841 24842 bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore( 24843 const Instruction *I) const { 24844 // Store-Release instructions only provide seq_cst guarantees when paired with 24845 // Load-Acquire instructions. MSVC CRT does not use these instructions to 24846 // implement seq_cst loads and stores, so we need additional explicit fences 24847 // after memory writes. 
24848 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 24849 return false; 24850 24851 switch (I->getOpcode()) { 24852 default: 24853 return false; 24854 case Instruction::AtomicCmpXchg: 24855 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() == 24856 AtomicOrdering::SequentiallyConsistent; 24857 case Instruction::AtomicRMW: 24858 return cast<AtomicRMWInst>(I)->getOrdering() == 24859 AtomicOrdering::SequentiallyConsistent; 24860 case Instruction::Store: 24861 return cast<StoreInst>(I)->getOrdering() == 24862 AtomicOrdering::SequentiallyConsistent; 24863 } 24864 } 24865 24866 // Loads and stores less than 128-bits are already atomic; ones above that 24867 // are doomed anyway, so defer to the default libcall and blame the OS when 24868 // things go wrong. 24869 TargetLoweringBase::AtomicExpansionKind 24870 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 24871 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 24872 if (Size != 128) 24873 return AtomicExpansionKind::None; 24874 if (isOpSuitableForRCPC3(SI)) 24875 return AtomicExpansionKind::None; 24876 if (isOpSuitableForLSE128(SI)) 24877 return AtomicExpansionKind::Expand; 24878 if (isOpSuitableForLDPSTP(SI)) 24879 return AtomicExpansionKind::None; 24880 return AtomicExpansionKind::Expand; 24881 } 24882 24883 // Loads and stores less than 128-bits are already atomic; ones above that 24884 // are doomed anyway, so defer to the default libcall and blame the OS when 24885 // things go wrong. 
24886 TargetLowering::AtomicExpansionKind 24887 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 24888 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 24889 24890 if (Size != 128) 24891 return AtomicExpansionKind::None; 24892 if (isOpSuitableForRCPC3(LI)) 24893 return AtomicExpansionKind::None; 24894 // No LSE128 loads 24895 if (isOpSuitableForLDPSTP(LI)) 24896 return AtomicExpansionKind::None; 24897 24898 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 24899 // implement atomicrmw without spilling. If the target address is also on the 24900 // stack and close enough to the spill slot, this can lead to a situation 24901 // where the monitor always gets cleared and the atomic operation can never 24902 // succeed. So at -O0 lower this operation to a CAS loop. 24903 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) 24904 return AtomicExpansionKind::CmpXChg; 24905 24906 // Using CAS for an atomic load has a better chance of succeeding under high 24907 // contention situations. So use it if available. 24908 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg 24909 : AtomicExpansionKind::LLSC; 24910 } 24911 24912 // The "default" for integer RMW operations is to expand to an LL/SC loop. 24913 // However, with the LSE instructions (or outline-atomics mode, which provides 24914 // library routines in place of the LSE-instructions), we can directly emit many 24915 // operations instead. 24916 // 24917 // Floating-point operations are always emitted to a cmpxchg loop, because they 24918 // may trigger a trap which aborts an LLSC sequence. 
24919 TargetLowering::AtomicExpansionKind 24920 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 24921 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 24922 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes."); 24923 24924 if (AI->isFloatingPointOperation()) 24925 return AtomicExpansionKind::CmpXChg; 24926 24927 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && 24928 (AI->getOperation() == AtomicRMWInst::Xchg || 24929 AI->getOperation() == AtomicRMWInst::Or || 24930 AI->getOperation() == AtomicRMWInst::And); 24931 if (CanUseLSE128) 24932 return AtomicExpansionKind::None; 24933 24934 // Nand is not supported in LSE. 24935 // Leave 128 bits to LLSC or CmpXChg. 24936 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { 24937 if (Subtarget->hasLSE()) 24938 return AtomicExpansionKind::None; 24939 if (Subtarget->outlineAtomics()) { 24940 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far. 24941 // Don't outline them unless 24942 // (1) high level <atomic> support approved: 24943 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf 24944 // (2) low level libgcc and compiler-rt support implemented by: 24945 // min/max outline atomics helpers 24946 if (AI->getOperation() != AtomicRMWInst::Min && 24947 AI->getOperation() != AtomicRMWInst::Max && 24948 AI->getOperation() != AtomicRMWInst::UMin && 24949 AI->getOperation() != AtomicRMWInst::UMax) { 24950 return AtomicExpansionKind::None; 24951 } 24952 } 24953 } 24954 24955 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 24956 // implement atomicrmw without spilling. If the target address is also on the 24957 // stack and close enough to the spill slot, this can lead to a situation 24958 // where the monitor always gets cleared and the atomic operation can never 24959 // succeed. So at -O0 lower this operation to a CAS loop. 
Also worthwhile if 24960 // we have a single CAS instruction that can replace the loop. 24961 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None || 24962 Subtarget->hasLSE()) 24963 return AtomicExpansionKind::CmpXChg; 24964 24965 return AtomicExpansionKind::LLSC; 24966 } 24967 24968 TargetLowering::AtomicExpansionKind 24969 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( 24970 AtomicCmpXchgInst *AI) const { 24971 // If subtarget has LSE, leave cmpxchg intact for codegen. 24972 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) 24973 return AtomicExpansionKind::None; 24974 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 24975 // implement cmpxchg without spilling. If the address being exchanged is also 24976 // on the stack and close enough to the spill slot, this can lead to a 24977 // situation where the monitor always gets cleared and the atomic operation 24978 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 24979 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) 24980 return AtomicExpansionKind::None; 24981 24982 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand 24983 // it. 24984 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); 24985 if (Size > 64) 24986 return AtomicExpansionKind::None; 24987 24988 return AtomicExpansionKind::LLSC; 24989 } 24990 24991 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, 24992 Type *ValueTy, Value *Addr, 24993 AtomicOrdering Ord) const { 24994 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 24995 bool IsAcquire = isAcquireOrStronger(Ord); 24996 24997 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 24998 // intrinsic must return {i64, i64} and we have to recombine them into a 24999 // single i128 here. 25000 if (ValueTy->getPrimitiveSizeInBits() == 128) { 25001 Intrinsic::ID Int = 25002 IsAcquire ? 
Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 25003 Function *Ldxr = Intrinsic::getDeclaration(M, Int); 25004 25005 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 25006 25007 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 25008 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 25009 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); 25010 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); 25011 return Builder.CreateOr( 25012 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); 25013 } 25014 25015 Type *Tys[] = { Addr->getType() }; 25016 Intrinsic::ID Int = 25017 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 25018 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); 25019 25020 const DataLayout &DL = M->getDataLayout(); 25021 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); 25022 CallInst *CI = Builder.CreateCall(Ldxr, Addr); 25023 CI->addParamAttr( 25024 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); 25025 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); 25026 25027 return Builder.CreateBitCast(Trunc, ValueTy); 25028 } 25029 25030 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 25031 IRBuilderBase &Builder) const { 25032 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 25033 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 25034 } 25035 25036 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, 25037 Value *Val, Value *Addr, 25038 AtomicOrdering Ord) const { 25039 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 25040 bool IsRelease = isReleaseOrStronger(Ord); 25041 25042 // Since the intrinsics must have legal type, the i128 intrinsics take two 25043 // parameters: "i64, i64". We must marshal Val into the appropriate form 25044 // before the call. 25045 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 25046 Intrinsic::ID Int = 25047 IsRelease ? 
Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 25048 Function *Stxr = Intrinsic::getDeclaration(M, Int); 25049 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 25050 25051 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 25052 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 25053 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 25054 } 25055 25056 Intrinsic::ID Int = 25057 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 25058 Type *Tys[] = { Addr->getType() }; 25059 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 25060 25061 const DataLayout &DL = M->getDataLayout(); 25062 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); 25063 Val = Builder.CreateBitCast(Val, IntValTy); 25064 25065 CallInst *CI = Builder.CreateCall( 25066 Stxr, {Builder.CreateZExtOrBitCast( 25067 Val, Stxr->getFunctionType()->getParamType(0)), 25068 Addr}); 25069 CI->addParamAttr(1, Attribute::get(Builder.getContext(), 25070 Attribute::ElementType, Val->getType())); 25071 return CI; 25072 } 25073 25074 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 25075 Type *Ty, CallingConv::ID CallConv, bool isVarArg, 25076 const DataLayout &DL) const { 25077 if (!Ty->isArrayTy()) { 25078 const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); 25079 return TySize.isScalable() && TySize.getKnownMinValue() > 128; 25080 } 25081 25082 // All non aggregate members of the type must have the same type 25083 SmallVector<EVT> ValueVTs; 25084 ComputeValueVTs(*this, DL, Ty, ValueVTs); 25085 return all_equal(ValueVTs); 25086 } 25087 25088 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, 25089 EVT) const { 25090 return false; 25091 } 25092 25093 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { 25094 Module *M = IRB.GetInsertBlock()->getParent()->getParent(); 25095 Function *ThreadPointerFunc = 25096 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); 25097 
return IRB.CreatePointerCast( 25098 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 25099 Offset), 25100 IRB.getPtrTy(0)); 25101 } 25102 25103 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { 25104 // Android provides a fixed TLS slot for the stack cookie. See the definition 25105 // of TLS_SLOT_STACK_GUARD in 25106 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 25107 if (Subtarget->isTargetAndroid()) 25108 return UseTlsOffset(IRB, 0x28); 25109 25110 // Fuchsia is similar. 25111 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 25112 if (Subtarget->isTargetFuchsia()) 25113 return UseTlsOffset(IRB, -0x10); 25114 25115 return TargetLowering::getIRStackGuard(IRB); 25116 } 25117 25118 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { 25119 // MSVC CRT provides functionalities for stack protection. 25120 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { 25121 // MSVC CRT has a global variable holding security cookie. 25122 M.getOrInsertGlobal("__security_cookie", 25123 PointerType::getUnqual(M.getContext())); 25124 25125 // MSVC CRT has a function to validate security cookie. 25126 FunctionCallee SecurityCheckCookie = 25127 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(), 25128 Type::getVoidTy(M.getContext()), 25129 PointerType::getUnqual(M.getContext())); 25130 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { 25131 F->setCallingConv(CallingConv::Win64); 25132 F->addParamAttr(0, Attribute::AttrKind::InReg); 25133 } 25134 return; 25135 } 25136 TargetLowering::insertSSPDeclarations(M); 25137 } 25138 25139 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { 25140 // MSVC CRT has a global variable holding security cookie. 
25141 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 25142 return M.getGlobalVariable("__security_cookie"); 25143 return TargetLowering::getSDagStackGuard(M); 25144 } 25145 25146 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { 25147 // MSVC CRT has a function to validate security cookie. 25148 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 25149 return M.getFunction(Subtarget->getSecurityCheckCookieName()); 25150 return TargetLowering::getSSPStackGuardCheck(M); 25151 } 25152 25153 Value * 25154 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { 25155 // Android provides a fixed TLS slot for the SafeStack pointer. See the 25156 // definition of TLS_SLOT_SAFESTACK in 25157 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 25158 if (Subtarget->isTargetAndroid()) 25159 return UseTlsOffset(IRB, 0x48); 25160 25161 // Fuchsia is similar. 25162 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. 25163 if (Subtarget->isTargetFuchsia()) 25164 return UseTlsOffset(IRB, -0x8); 25165 25166 return TargetLowering::getSafeStackPointerLocation(IRB); 25167 } 25168 25169 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( 25170 const Instruction &AndI) const { 25171 // Only sink 'and' mask to cmp use block if it is masking a single bit, since 25172 // this is likely to be fold the and/cmp/br into a single tbz instruction. It 25173 // may be beneficial to sink in other cases, but we would have to check that 25174 // the cmp would not get folded into the br to form a cbz for these to be 25175 // beneficial. 
25176 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); 25177 if (!Mask) 25178 return false; 25179 return Mask->getValue().isPowerOf2(); 25180 } 25181 25182 bool AArch64TargetLowering:: 25183 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( 25184 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, 25185 unsigned OldShiftOpcode, unsigned NewShiftOpcode, 25186 SelectionDAG &DAG) const { 25187 // Does baseline recommend not to perform the fold by default? 25188 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( 25189 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) 25190 return false; 25191 // Else, if this is a vector shift, prefer 'shl'. 25192 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL; 25193 } 25194 25195 TargetLowering::ShiftLegalizationStrategy 25196 AArch64TargetLowering::preferredShiftLegalizationStrategy( 25197 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const { 25198 if (DAG.getMachineFunction().getFunction().hasMinSize() && 25199 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin()) 25200 return ShiftLegalizationStrategy::LowerToLibcall; 25201 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, 25202 ExpansionFactor); 25203 } 25204 25205 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 25206 // Update IsSplitCSR in AArch64unctionInfo. 
25207 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); 25208 AFI->setIsSplitCSR(true); 25209 } 25210 25211 void AArch64TargetLowering::insertCopiesSplitCSR( 25212 MachineBasicBlock *Entry, 25213 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 25214 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 25215 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 25216 if (!IStart) 25217 return; 25218 25219 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 25220 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 25221 MachineBasicBlock::iterator MBBI = Entry->begin(); 25222 for (const MCPhysReg *I = IStart; *I; ++I) { 25223 const TargetRegisterClass *RC = nullptr; 25224 if (AArch64::GPR64RegClass.contains(*I)) 25225 RC = &AArch64::GPR64RegClass; 25226 else if (AArch64::FPR64RegClass.contains(*I)) 25227 RC = &AArch64::FPR64RegClass; 25228 else 25229 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 25230 25231 Register NewVR = MRI->createVirtualRegister(RC); 25232 // Create copy from CSR to a virtual register. 25233 // FIXME: this currently does not emit CFI pseudo-instructions, it works 25234 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 25235 // nounwind. If we want to generalize this later, we may need to emit 25236 // CFI pseudo-instructions. 25237 assert(Entry->getParent()->getFunction().hasFnAttribute( 25238 Attribute::NoUnwind) && 25239 "Function should be nounwind in insertCopiesSplitCSR!"); 25240 Entry->addLiveIn(*I); 25241 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 25242 .addReg(*I); 25243 25244 // Insert the copy-back instructions right before the terminator. 
25245 for (auto *Exit : Exits) 25246 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 25247 TII->get(TargetOpcode::COPY), *I) 25248 .addReg(NewVR); 25249 } 25250 } 25251 25252 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { 25253 // Integer division on AArch64 is expensive. However, when aggressively 25254 // optimizing for code size, we prefer to use a div instruction, as it is 25255 // usually smaller than the alternative sequence. 25256 // The exception to this is vector division. Since AArch64 doesn't have vector 25257 // integer division, leaving the division as-is is a loss even in terms of 25258 // size, because it will have to be scalarized, while the alternative code 25259 // sequence can be performed in vector form. 25260 bool OptSize = Attr.hasFnAttr(Attribute::MinSize); 25261 return OptSize && !VT.isVector(); 25262 } 25263 25264 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 25265 // We want inc-of-add for scalars and sub-of-not for vectors. 25266 return VT.isScalarInteger(); 25267 } 25268 25269 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, 25270 EVT VT) const { 25271 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to 25272 // legalize. 
25273 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16()) 25274 return false; 25275 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT); 25276 } 25277 25278 MachineInstr * 25279 AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, 25280 MachineBasicBlock::instr_iterator &MBBI, 25281 const TargetInstrInfo *TII) const { 25282 assert(MBBI->isCall() && MBBI->getCFIType() && 25283 "Invalid call instruction for a KCFI check"); 25284 25285 switch (MBBI->getOpcode()) { 25286 case AArch64::BLR: 25287 case AArch64::BLRNoIP: 25288 case AArch64::TCRETURNri: 25289 case AArch64::TCRETURNriBTI: 25290 break; 25291 default: 25292 llvm_unreachable("Unexpected CFI call opcode"); 25293 } 25294 25295 MachineOperand &Target = MBBI->getOperand(0); 25296 assert(Target.isReg() && "Invalid target operand for an indirect call"); 25297 Target.setIsRenamable(false); 25298 25299 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK)) 25300 .addReg(Target.getReg()) 25301 .addImm(MBBI->getCFIType()) 25302 .getInstr(); 25303 } 25304 25305 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { 25306 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); 25307 } 25308 25309 unsigned 25310 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { 25311 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) 25312 return getPointerTy(DL).getSizeInBits(); 25313 25314 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; 25315 } 25316 25317 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { 25318 MachineFrameInfo &MFI = MF.getFrameInfo(); 25319 // If we have any vulnerable SVE stack objects then the stack protector 25320 // needs to be placed at the top of the SVE stack area, as the SVE locals 25321 // are placed above the other locals, so we allocate it as if it were a 25322 // scalable vector. 
25323 // FIXME: It may be worthwhile having a specific interface for this rather 25324 // than doing it here in finalizeLowering. 25325 if (MFI.hasStackProtectorIndex()) { 25326 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { 25327 if (MFI.getStackID(i) == TargetStackID::ScalableVector && 25328 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { 25329 MFI.setStackID(MFI.getStackProtectorIndex(), 25330 TargetStackID::ScalableVector); 25331 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16)); 25332 break; 25333 } 25334 } 25335 } 25336 MFI.computeMaxCallFrameSize(MF); 25337 TargetLoweringBase::finalizeLowering(MF); 25338 } 25339 25340 // Unlike X86, we let frame lowering assign offsets to all catch objects. 25341 bool AArch64TargetLowering::needsFixedCatchObjects() const { 25342 return false; 25343 } 25344 25345 bool AArch64TargetLowering::shouldLocalize( 25346 const MachineInstr &MI, const TargetTransformInfo *TTI) const { 25347 auto &MF = *MI.getMF(); 25348 auto &MRI = MF.getRegInfo(); 25349 auto maxUses = [](unsigned RematCost) { 25350 // A cost of 1 means remats are basically free. 25351 if (RematCost == 1) 25352 return std::numeric_limits<unsigned>::max(); 25353 if (RematCost == 2) 25354 return 2U; 25355 25356 // Remat is too expensive, only sink if there's one user. 25357 if (RematCost > 2) 25358 return 1U; 25359 llvm_unreachable("Unexpected remat cost"); 25360 }; 25361 25362 unsigned Opc = MI.getOpcode(); 25363 switch (Opc) { 25364 case TargetOpcode::G_GLOBAL_VALUE: { 25365 // On Darwin, TLS global vars get selected into function calls, which 25366 // we don't want localized, as they can get moved into the middle of a 25367 // another call sequence. 25368 const GlobalValue &GV = *MI.getOperand(1).getGlobal(); 25369 if (GV.isThreadLocal() && Subtarget->isTargetMachO()) 25370 return false; 25371 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure. 
25372 } 25373 case TargetOpcode::G_FCONSTANT: 25374 case TargetOpcode::G_CONSTANT: { 25375 const ConstantInt *CI; 25376 unsigned AdditionalCost = 0; 25377 25378 if (Opc == TargetOpcode::G_CONSTANT) 25379 CI = MI.getOperand(1).getCImm(); 25380 else { 25381 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 25382 // We try to estimate cost of 32/64b fpimms, as they'll likely be 25383 // materialized as integers. 25384 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64) 25385 break; 25386 auto APF = MI.getOperand(1).getFPImm()->getValueAPF(); 25387 bool OptForSize = 25388 MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize(); 25389 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()), 25390 OptForSize)) 25391 return true; // Constant should be cheap. 25392 CI = 25393 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt()); 25394 // FP materialization also costs an extra move, from gpr to fpr. 25395 AdditionalCost = 1; 25396 } 25397 APInt Imm = CI->getValue(); 25398 InstructionCost Cost = TTI->getIntImmCost( 25399 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize); 25400 assert(Cost.isValid() && "Expected a valid imm cost"); 25401 25402 unsigned RematCost = *Cost.getValue(); 25403 RematCost += AdditionalCost; 25404 Register Reg = MI.getOperand(0).getReg(); 25405 unsigned MaxUses = maxUses(RematCost); 25406 // Don't pass UINT_MAX sentinal value to hasAtMostUserInstrs(). 25407 if (MaxUses == std::numeric_limits<unsigned>::max()) 25408 --MaxUses; 25409 return MRI.hasAtMostUserInstrs(Reg, MaxUses); 25410 } 25411 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being 25412 // localizable. 25413 case AArch64::ADRP: 25414 case AArch64::G_ADD_LOW: 25415 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too. 
25416 case TargetOpcode::G_PTR_ADD: 25417 return true; 25418 default: 25419 break; 25420 } 25421 return TargetLoweringBase::shouldLocalize(MI, TTI); 25422 } 25423 25424 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { 25425 if (Inst.getType()->isScalableTy()) 25426 return true; 25427 25428 for (unsigned i = 0; i < Inst.getNumOperands(); ++i) 25429 if (Inst.getOperand(i)->getType()->isScalableTy()) 25430 return true; 25431 25432 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) { 25433 if (AI->getAllocatedType()->isScalableTy()) 25434 return true; 25435 } 25436 25437 // Checks to allow the use of SME instructions 25438 if (auto *Base = dyn_cast<CallBase>(&Inst)) { 25439 auto CallerAttrs = SMEAttrs(*Inst.getFunction()); 25440 auto CalleeAttrs = SMEAttrs(*Base); 25441 if (CallerAttrs.requiresSMChange(CalleeAttrs, 25442 /*BodyOverridesInterface=*/false) || 25443 CallerAttrs.requiresLazySave(CalleeAttrs)) 25444 return true; 25445 } 25446 return false; 25447 } 25448 25449 // Return the largest legal scalable vector type that matches VT's element type. 25450 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { 25451 assert(VT.isFixedLengthVector() && 25452 DAG.getTargetLoweringInfo().isTypeLegal(VT) && 25453 "Expected legal fixed length vector!"); 25454 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { 25455 default: 25456 llvm_unreachable("unexpected element type for SVE container"); 25457 case MVT::i8: 25458 return EVT(MVT::nxv16i8); 25459 case MVT::i16: 25460 return EVT(MVT::nxv8i16); 25461 case MVT::i32: 25462 return EVT(MVT::nxv4i32); 25463 case MVT::i64: 25464 return EVT(MVT::nxv2i64); 25465 case MVT::f16: 25466 return EVT(MVT::nxv8f16); 25467 case MVT::f32: 25468 return EVT(MVT::nxv4f32); 25469 case MVT::f64: 25470 return EVT(MVT::nxv2f64); 25471 } 25472 } 25473 25474 // Return a PTRUE with active lanes corresponding to the extent of VT. 
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
                                                EVT VT) {
  assert(VT.isFixedLengthVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal fixed length vector!");

  // Pick a VL pattern whose lane count equals VT's element count.
  std::optional<unsigned> PgPattern =
      getSVEPredPatternFromNumElements(VT.getVectorNumElements());
  assert(PgPattern && "Unexpected element count for SVE predicate");

  // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
  // AArch64SVEPredPattern::all, which can enable the use of unpredicated
  // variants of instructions when available.
  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
  if (MaxSVESize && MinSVESize == MaxSVESize &&
      MaxSVESize == VT.getSizeInBits())
    PgPattern = AArch64SVEPredPattern::all;

  // Choose the predicate type matching the container used for VT's elements.
  MVT MaskVT;
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for SVE predicate");
  case MVT::i8:
    MaskVT = MVT::nxv16i1;
    break;
  case MVT::i16:
  case MVT::f16:
    MaskVT = MVT::nxv8i1;
    break;
  case MVT::i32:
  case MVT::f32:
    MaskVT = MVT::nxv4i1;
    break;
  case MVT::i64:
  case MVT::f64:
    MaskVT = MVT::nxv2i1;
    break;
  }

  return getPTrue(DAG, DL, MaskVT, *PgPattern);
}

// Return an all-active predicate for the given legal scalable vector type.
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
                                             EVT VT) {
  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal scalable vector!");
  auto PredTy = VT.changeVectorElementType(MVT::i1);
  return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
}

// Return a predicate covering VT, dispatching on fixed vs scalable vectors.
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
  if (VT.isFixedLengthVector())
    return getPredicateForFixedLengthVector(DAG, DL, VT);

  return getPredicateForScalableVector(DAG, DL, VT);
}

// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isScalableVector() &&
         "Expected to convert into a scalable vector!");
  assert(V.getValueType().isFixedLengthVector() &&
         "Expected a fixed length vector operand!");
  SDLoc DL(V);
  // Insert the fixed-length value at element 0 of an undef scalable vector.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}

// Shrink V so it's just big enough to maintain a VT's worth of data.
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isFixedLengthVector() &&
         "Expected to convert into a fixed length vector!");
  assert(V.getValueType().isScalableVector() &&
         "Expected a scalable vector operand!");
  SDLoc DL(V);
  // Extract the low VT-sized subvector starting at element 0.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
}

// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Load = cast<LoadSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT LoadVT = ContainerVT;
  EVT MemVT = Load->getMemoryVT();

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

  // Perform FP loads as integer loads and bitcast back afterwards.
  if (VT.isFloatingPoint()) {
    LoadVT = ContainerVT.changeTypeToInteger();
    MemVT = MemVT.changeTypeToInteger();
  }

  // Replace the plain load with a predicated masked load of the container
  // type, preserving the original memory operand and addressing mode.
  SDValue NewLoad = DAG.getMaskedLoad(
      LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
      DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
      Load->getAddressingMode(), Load->getExtensionType());

  SDValue Result = NewLoad;
  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
    // FP extending load: reinterpret as the narrower FP type, then extend
    // with a merging FCVT.
    EVT ExtendVT = ContainerVT.changeVectorElementType(
        Load->getMemoryVT().getVectorElementType());

    Result = getSVESafeBitCast(ExtendVT, Result, DAG);
    Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                         Pg, Result, DAG.getUNDEF(ContainerVT));
  } else if (VT.isFloatingPoint()) {
    Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
  }

  Result = convertFromScalableVector(DAG, VT, Result);
  // Return both the value and the new chain.
  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
  return DAG.getMergeValues(MergedValues, DL);
}

// Turn a fixed-length vector mask into an SVE predicate by comparing the
// widened mask against zero.
static SDValue convertFixedMaskToScalableVector(SDValue Mask,
                                                SelectionDAG &DAG) {
  SDLoc DL(Mask);
  EVT InVT = Mask.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

  // An all-ones mask is exactly the fixed-length predicate itself.
  if (ISD::isBuildVectorAllOnes(Mask.getNode()))
    return Pg;

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
  auto Op2 = DAG.getConstant(0, DL, ContainerVT);

  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
                     {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
}

// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Load = cast<MaskedLoadSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  SDValue Mask = Load->getMask();
  // If this is an extending load and the mask type is not the same as
  // load's type then we have to extend the mask type.
  if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
    assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
           "Incorrect mask type");
    Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
  }
  Mask = convertFixedMaskToScalableVector(Mask, DAG);

  SDValue PassThru;
  bool IsPassThruZeroOrUndef = false;

  // A zero/undef pass-through can be handled by the masked load directly;
  // anything else needs an explicit select afterwards.
  if (Load->getPassThru()->isUndef()) {
    PassThru = DAG.getUNDEF(ContainerVT);
    IsPassThruZeroOrUndef = true;
  } else {
    if (ContainerVT.isInteger())
      PassThru = DAG.getConstant(0, DL, ContainerVT);
    else
      PassThru = DAG.getConstantFP(0, DL, ContainerVT);
    if (isZerosVector(Load->getPassThru().getNode()))
      IsPassThruZeroOrUndef = true;
  }

  SDValue NewLoad = DAG.getMaskedLoad(
      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
      Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
      Load->getAddressingMode(), Load->getExtensionType());

  SDValue Result = NewLoad;
  if (!IsPassThruZeroOrUndef) {
    // Blend the original pass-through back into the inactive lanes.
    SDValue OldPassThru =
        convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
    Result = DAG.getSelect(DL, ContainerVT, Mask, Result,
                           OldPassThru);
  }

  Result = convertFromScalableVector(DAG, VT, Result);
  // Return both the value and the new chain.
  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
  return DAG.getMergeValues(MergedValues, DL);
}

// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Store = cast<StoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT MemVT = Store->getMemoryVT();

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());

  if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
    // FP truncating store: round to the narrower FP type first, then store
    // the result via an integer-typed masked store.
    EVT TruncVT = ContainerVT.changeVectorElementType(
        Store->getMemoryVT().getVectorElementType());
    MemVT = MemVT.changeTypeToInteger();
    NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
                           NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
                           DAG.getUNDEF(TruncVT));
    NewValue =
        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  } else if (VT.isFloatingPoint()) {
    // Perform plain FP stores as integer stores of the bitcast value.
    MemVT = MemVT.changeTypeToInteger();
    NewValue =
        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  }

  return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
                            Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
                            Store->getMemOperand(), Store->getAddressingMode(),
                            Store->isTruncatingStore());
}

// Lower a fixed-length masked store by widening value and mask to scalable
// types and emitting the equivalent scalable masked store.
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto *Store = cast<MaskedStoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);

  return DAG.getMaskedStore(
      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
      Mask, Store->getMemoryVT(), Store->getMemOperand(),
      Store->getAddressingMode(), Store->isTruncatingStore());
}

// Lower fixed-length SDIV/UDIV using SVE's predicated divide, handling
// power-of-two splats and narrow element types specially.
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  bool Signed = Op.getOpcode() == ISD::SDIV;
  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

  bool Negated;
  uint64_t SplatVal;
  // Signed divide by (+/-) power-of-two becomes a rounding shift (ASRD),
  // negated afterwards when the divisor was negative.
  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
    SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
    SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);

    SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
    SDValue Res =
        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
    if (Negated)
      Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
                        DAG.getConstant(0, dl, ContainerVT), Res);

    return convertFromScalableVector(DAG, VT, Res);
  }

  // Scalable vector i32/i64 DIV is supported.
  if (EltVT == MVT::i32 || EltVT == MVT::i64)
    return LowerToPredicatedOp(Op, DAG, PredOpcode);

  // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
  unsigned ExtendOpcode = Signed ?
      ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  // If the wider type is legal: extend, op, and truncate.
  EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
  if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
  }

  // Helper: split a vector into low/high halves and extend each to PromVT.
  auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
                               &ExtendOpcode](SDValue Op) {
    SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
    SDValue IdxHalf =
        DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
    SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
    return std::pair<SDValue, SDValue>(
        {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
         DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
  };

  // If wider type is not legal: split, extend, op, trunc and concat.
  auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
  auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
  SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
  SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
}

// Lower fixed-length SIGN_EXTEND/ZERO_EXTEND using SVE's UNPK instructions.
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  // SUNPKLO sign-extends the low half; UUNPKLO zero-extends it.
  bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
  unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;

  // Repeatedly unpack Val until the result is of the desired element type.
  // Each case widens one element-size step; fallthrough continues widening
  // until VT's element type is reached.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv16i8:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    [[fallthrough]];
  case MVT::nxv8i16:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    [[fallthrough]];
  case MVT::nxv4i32:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
    assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}

// Lower fixed-length TRUNCATE using SVE's UZP1 to repeatedly halve the
// element size.
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  // Repeatedly truncate Val until the result is of the desired element type.
  // UZP1 on (Val, Val) keeps the even-indexed narrower elements, i.e. the
  // low halves of each wide element.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv2i64:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    [[fallthrough]];
  case MVT::nxv4i32:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    [[fallthrough]];
  case MVT::nxv8i16:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
    assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}

// Lower EXTRACT_VECTOR_ELT on a fixed-length vector by widening the source
// to a scalable container first.
SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT InVT = Op.getOperand(0).getValueType();
  assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
}

// Lower INSERT_VECTOR_ELT on a fixed-length vector via a scalable container.
SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  auto ScalableRes =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
                  Op.getOperand(1), Op.getOperand(2));

  return convertFromScalableVector(DAG, VT, ScalableRes);
}

// Convert vector operation 'Op' to an equivalent predicated operation whereby
// the original operation's type is used to construct a suitable predicate.
// NOTE: The results for inactive lanes are undefined.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                   SelectionDAG &DAG,
                                                   unsigned NewOp) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  auto Pg = getPredicateForVector(DAG, DL, VT);

  if (VT.isFixedLengthVector()) {
    assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    // Create list of operands by converting existing ones to scalable types.
    SmallVector<SDValue, 4> Operands = {Pg};
    for (const SDValue &V : Op->op_values()) {
      // Condition codes pass through unchanged.
      if (isa<CondCodeSDNode>(V)) {
        Operands.push_back(V);
        continue;
      }

      // VT operands are retargeted at the scalable container's element type.
      if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
        EVT VTArg = VTNode->getVT().getVectorElementType();
        EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
        Operands.push_back(DAG.getValueType(NewVTArg));
        continue;
      }

      assert(isTypeLegal(V.getValueType()) &&
             "Expected only legal fixed-width types");
      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
    }

    // Merge-passthru opcodes take an extra passthru operand (undef here
    // since inactive lanes are unspecified).
    if (isMergePassthruOpcode(NewOp))
      Operands.push_back(DAG.getUNDEF(ContainerVT));

    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
    return convertFromScalableVector(DAG, VT, ScalableRes);
  }

  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

  // Scalable path: operands are used directly, with the predicate prepended.
  SmallVector<SDValue, 4> Operands = {Pg};
  for (const SDValue &V : Op->op_values()) {
    assert((!V.getValueType().isVector() ||
            V.getValueType().isScalableVector()) &&
           "Only scalable vectors are supported!");
    Operands.push_back(V);
  }

  if (isMergePassthruOpcode(NewOp))
    Operands.push_back(DAG.getUNDEF(VT));

  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
}

// If a fixed length vector operation has no side effects when applied to
// undefined elements, we can safely use scalable vectors to perform the same
// operation without needing to worry about predication.
SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
         "Only expected to lower fixed length vector operation!");
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  // Create list of operands by converting existing ones to scalable types.
  SmallVector<SDValue, 4> Ops;
  for (const SDValue &V : Op->op_values()) {
    assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");

    // Pass through non-vector operands.
    if (!V.getValueType().isVector()) {
      Ops.push_back(V);
      continue;
    }

    // "cast" fixed length vector to a scalable vector.
    assert(V.getValueType().isFixedLengthVector() &&
           isTypeLegal(V.getValueType()) &&
           "Only fixed length vectors are supported!");
    Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
  }

  // Re-emit the same opcode on the container type and shrink the result.
  auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
  return convertFromScalableVector(DAG, VT, ScalableRes);
}

// Lower a strict (in-order) FADD reduction using SVE's FADDA, which folds an
// initial accumulator value into the reduction.
SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue AccOp = ScalarOp.getOperand(0);
  SDValue VecOp = ScalarOp.getOperand(1);
  EVT SrcVT = VecOp.getValueType();
  EVT ResVT = SrcVT.getVectorElementType();

  EVT ContainerVT = SrcVT;
  if (SrcVT.isFixedLengthVector()) {
    ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);

  // Convert operands to Scalable.
  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
                      DAG.getUNDEF(ContainerVT), AccOp, Zero);

  // Perform reduction.
  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
                            Pg, AccOp, VecOp);

  // FADDA produces a vector; the scalar result lives in element 0.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}

// Lower i1-vector reductions (OR/AND/XOR) using predicate tests and CNTP.
SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ReduceOp);
  SDValue Op = ReduceOp.getOperand(0);
  EVT OpVT = Op.getValueType();
  EVT VT = ReduceOp.getValueType();

  // Only scalable i1 vectors are handled here.
  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);

  switch (ReduceOp.getOpcode()) {
  default:
    return SDValue();
  case ISD::VECREDUCE_OR:
    if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
      // The predicate can be 'Op' because
      // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
      return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
    else
      return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
  case ISD::VECREDUCE_AND: {
    // AND-reduce == "no inactive lanes": test NOT(Op) for none-active.
    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
  }
  case ISD::VECREDUCE_XOR: {
    // XOR-reduce is the parity of the popcount, computed via CNTP.
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
    if (OpVT == MVT::nxv1i1) {
      // Emulate a CNTP on .Q using .D and a different governing predicate.
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
      Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
    }
    SDValue Cntp =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
    // Truncation to the (i1-based) result type yields the parity bit.
    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
  }
  }

  // NOTE: unreachable — every switch path above returns.
  return SDValue();
}

// Lower a generic vector reduction to the given predicated SVE reduction
// opcode, widening fixed-length sources to scalable containers as needed.
SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
                                                   SDValue ScalarOp,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue VecOp = ScalarOp.getOperand(0);
  EVT SrcVT = VecOp.getValueType();

  if (useSVEForFixedLengthVectorVT(
          SrcVT,
          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  // UADDV always returns an i64 result.
  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
                                                   SrcVT.getVectorElementType();
  EVT RdxVT = SrcVT;
  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
    RdxVT = getPackedSVEVectorVT(ResVT);

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
  // The reduction result is held in element 0 of the reduction vector.
  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
                            Rdx, DAG.getConstant(0, DL, MVT::i64));

  // The VEC_REDUCE nodes expect an element size result.
  if (ResVT != ScalarOp.getValueType())
    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());

  return Res;
}

// Lower fixed-length VSELECT by widening operands and converting the mask to
// an i1 predicate vector.
SDValue
AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
                                                         SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  EVT InVT = Op.getOperand(1).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
  SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));

  // Convert the mask to a predicated (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
  EVT MaskVT = Op.getOperand(0).getValueType();
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
  Mask = DAG.getNode(ISD::TRUNCATE, DL,
                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);

  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
                                 Mask, Op1, Op2);

  return convertFromScalableVector(DAG, VT, ScalableRes);
}

// Lower fixed-length SETCC via a zeroing predicated compare, then extend the
// predicate result back to an integer vector of the input's width.
SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
         "Only expected to lower fixed length vector operation!");
  assert(Op.getValueType() == InVT.changeTypeToInteger() &&
         "Expected integer result of the same bit length as the inputs!");

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
  auto Pg =
      getPredicateForFixedLengthVector(DAG, DL, InVT);

  EVT CmpVT = Pg.getValueType();
  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
                         {Pg, Op1, Op2, Op.getOperand(2)});

  // Widen the i1 compare result to the integer result type callers expect.
  EVT PromoteVT = ContainerVT.changeTypeToInteger();
  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
  return convertFromScalableVector(DAG, Op.getValueType(), Promote);
}

// Lower fixed-length BITCAST by performing it on the scalable containers.
SDValue
AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto SrcOp = Op.getOperand(0);
  EVT VT = Op.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT =
      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());

  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
  return convertFromScalableVector(DAG, VT, Op);
}

// Lower fixed-length CONCAT_VECTORS: reduce >2 operands pairwise, then join
// a final pair with a predicated SPLICE.
SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumOperands = Op->getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  auto SrcOp1 = Op.getOperand(0);
  auto SrcOp2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = SrcOp1.getValueType();

  if (NumOperands > 2) {
    // Concatenate adjacent pairs and recurse on the halved operand list.
    SmallVector<SDValue, 4> Ops;
    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
    for (unsigned I = 0; I < NumOperands; I += 2)
      Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
                                Op->getOperand(I), Op->getOperand(I + 1)));

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
  }

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  // SPLICE takes the active (SrcVT-sized) prefix of SrcOp1 followed by SrcOp2.
  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
  SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);

  Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);

  return convertFromScalableVector(DAG, VT, Op);
}

// Lower fixed-length FP_EXTEND via an integer any-extend followed by a
// merging FCVT on the scalable container.
SDValue
AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  SDValue Pg = getPredicateForVector(DAG, DL, VT);
  EVT SrcVT = Val.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ExtendVT = ContainerVT.changeVectorElementType(
      SrcVT.getVectorElementType());

  // Place the narrow FP bits into the low part of wider integer lanes.
  Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
  Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);

  Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
  Val = getSVESafeBitCast(ExtendVT, Val, DAG);
  Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                    Pg, Val, DAG.getUNDEF(ContainerVT));

  return convertFromScalableVector(DAG, VT, Val);
}

// Lower fixed-length FP_ROUND via a merging FCVT, then truncate the integer
// representation back down to the narrow result type.
SDValue
AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
  EVT RoundVT = ContainerSrcVT.changeVectorElementType(
      VT.getVectorElementType());
  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);

  Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
  Val =
      DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
                  Op.getOperand(1), DAG.getUNDEF(RoundVT));
  // The rounded values sit in the low parts of the wide lanes; drop the high
  // parts via an integer truncate and bitcast to the final FP type.
  Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
  Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

  Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
  return DAG.getNode(ISD::BITCAST, DL, VT, Val);
}

// Lower fixed-length [SU]INT_TO_FP using the merging SCVTF/UCVTF patterns.
SDValue
AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGE(SrcVT)) {
    // Result is at least as wide as the source: extend the integer input to
    // the result width, then convert in the destination container.
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                      VT.changeTypeToInteger(), Val);

    // Safe to use a larger than specified operand because by promoting the
    // value nothing has changed from an arithmetic point of view.
    Val =
        convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    // Result is narrower: convert in the source container, then narrow the
    // FP result through an integer truncate + bitcast.
    EVT CvtVT = ContainerSrcVT.changeVectorElementType(
        ContainerDstVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
    Val = convertFromScalableVector(DAG, SrcVT, Val);

    Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
    return DAG.getNode(ISD::BITCAST, DL, VT, Val);
  }
}

// Lower VECTOR_DEINTERLEAVE as an UZP1/UZP2 pair (even/odd elements).
SDValue
AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OpVT = Op.getValueType();
  assert(OpVT.isScalableVector() &&
         "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
  SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
                             Op.getOperand(1));
  SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
                            Op.getOperand(1));
  return DAG.getMergeValues({Even, Odd}, DL);
}

// Lower VECTOR_INTERLEAVE as a ZIP1/ZIP2 pair (low/high halves).
SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
                                                      SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT OpVT = Op.getValueType();
  assert(OpVT.isScalableVector() &&
         "Expected scalable vector in LowerVECTOR_INTERLEAVE.");

  SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
                           Op.getOperand(1));
  SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
                           Op.getOperand(1));
  return DAG.getMergeValues({Lo, Hi}, DL);
}

SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Lower a fixed-length ISD::FP_TO_SINT/FP_TO_UINT by moving the operand
  // into an SVE container and using the merging FCVTZS/FCVTZU nodes.
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (VT.bitsGT(SrcVT)) {
    // Result is wider than the source: spread the FP elements out to the
    // result's element spacing (via an integer any-extend of their bit
    // patterns), then convert from the unpacked FP layout.
    EVT CvtVT = ContainerDstVT.changeVectorElementType(
        ContainerSrcVT.getVectorElementType());
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

    Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);

    Val = convertToScalableVector(DAG, ContainerDstVT, Val);
    Val = getSVESafeBitCast(CvtVT, Val, DAG);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    // Result is no wider than the source: convert to the source-width integer
    // type and truncate down.
    EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
    SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);

    // Safe to use a larger than specified result since an fp_to_int where the
    // result doesn't fit into the destination is undefined.
    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

    return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
  }
}

// Lower a fixed-length VECTOR_SHUFFLE via the SVE TBL (or SVE2 TBL2)
// intrinsic by materialising the shuffle mask as a constant index vector.
// Returns SDValue() when the mask cannot be represented for the known SVE
// register size.
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
                                         ArrayRef<int> ShuffleMask, EVT VT,
                                         EVT ContainerVT, SelectionDAG &DAG) {
  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  SDLoc DL(Op);
  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
  bool IsSingleOp =
      ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());

  // Without NEON the architectural minimum of 128 bits can be assumed.
  if (!Subtarget.isNeonAvailable() && !MinSVESize)
    MinSVESize = 128;

  // Ignore two operands if no SVE2 or all index numbers couldn't
  // be represented.
  if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
    return SDValue();

  EVT VTOp1 = Op.getOperand(0).getValueType();
  unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
  unsigned IndexLen = MinSVESize / BitsPerElt;
  unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
  // Largest index representable in a TBL mask element (all-ones).
  uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
  assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
         "Incorrectly legalised shuffle operation");

  SmallVector<SDValue, 8> TBLMask;
  for (int Index : ShuffleMask) {
    // Handling poison index value.
    if (Index < 0)
      Index = 0;
    // If we refer to the second operand then we have to add elements
    // number in hardware register minus number of elements in a type.
    if ((unsigned)Index >= ElementsPerVectorReg)
      Index += IndexLen - ElementsPerVectorReg;
    // For 8-bit elements and 1024-bit SVE registers, MaxOffset equals 255 and
    // the rebased index might point at the last element of the second operand
    // of the shufflevector, thus we are rejecting this transform.
    if ((unsigned)Index >= MaxOffset)
      return SDValue();
    TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
  }

  // Pad the mask out to the full register with out-of-range indices so the
  // extra lanes are zeroed (TBL zeroes lanes whose index is out of range,
  // rather than duplicating the first lane). Note that for i8 elements and a
  // 2048-bit vector register, MaxOffset could still be an in-range index.
  for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i)
    TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));

  EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
  EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
  SDValue VecMask =
      DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
  SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);

  SDValue Shuffle;
  if (IsSingleOp)
    Shuffle =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
                    DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
                    Op1, SVEMask);
  else if (Subtarget.hasSVE2())
    Shuffle =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
                    DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
                    Op1, Op2, SVEMask);
  else
    llvm_unreachable("Cannot lower shuffle without SVE2 TBL");
  Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}

SDValue
AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  // Lower a fixed-length VECTOR_SHUFFLE by matching it against the SVE
  // permute instructions (DUP/INSR/REV*/ZIP/UZP/TRN) and falling back to a
  // TBL-based expansion when the SVE register size is known.
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  auto ShuffleMask = SVN->getMask();

  SDLoc DL(Op);
  SDValue Op1 = Op.getOperand(0);
  SDValue Op2 = Op.getOperand(1);

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);

  // i8/i16 extracts are performed via a legal i32 scalar.
  auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
    if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
      return MVT::i32;
    return ScalarTy;
  };

  // Splat: extract the splatted lane and SPLAT_VECTOR it.
  if (SVN->isSplat()) {
    unsigned Lane = std::max(0, SVN->getSplatIndex());
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
                                  DAG.getConstant(Lane, DL, MVT::i64));
    Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
    return convertFromScalableVector(DAG, VT, Op);
  }

  // An EXT by NumElts-1 is an insert of one operand's last element at the
  // front of the other, i.e. SVE INSR.
  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
      Imm == VT.getVectorNumElements() - 1) {
    if (ReverseEXT)
      std::swap(Op1, Op2);
    EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
    SDValue Scalar = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
    return convertFromScalableVector(DAG, VT, Op);
  }

  // REV within 16/32/64-bit lanes maps to BSWAP/REVH/REVW on an integer
  // reinterpretation of the input.
  for (unsigned LaneSize : {64U, 32U, 16U}) {
    if (isREVMask(ShuffleMask, VT, LaneSize)) {
      EVT NewVT =
          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
      unsigned RevOp;
      unsigned EltSz = VT.getScalarSizeInBits();
      if (EltSz == 8)
        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
      else if (EltSz == 16)
        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
      else
        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;

      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
      Op = LowerToPredicatedOp(Op, DAG, RevOp);
      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
      return convertFromScalableVector(DAG, VT, Op);
    }
  }

  // REV within 128-bit lanes of 64-bit elements maps to SVE2p1 REVD.
  if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
      isREVMask(ShuffleMask, VT, 128)) {
    if (!VT.isFloatingPoint())
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);

    EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
    Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
    Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
    Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
    return convertFromScalableVector(DAG, VT, Op);
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));

  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));

  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
  }

  // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
  // represents the same logical operation as performed by a ZIP instruction. In
  // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
  // equivalent to an AArch64 instruction. There's the extra component of
  // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
  // only operated on 64/128bit vector types that have a direct mapping to a
  // target register and so an exact mapping is implied.
  // However, when using SVE for fixed length vectors, most legal vector types
  // are actually sub-vectors of a larger SVE register. When mapping
  // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
  // how the mask's indices translate. Specifically, when the mapping requires
  // an exact meaning for a specific vector index (e.g. Index X is the last
  // vector element in the register) then such mappings are often only safe when
  // the exact SVE register size is known. The main exception to this is when
  // indices are logically relative to the first element of either
  // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
  // when converting from fixed-length to scalable vector types (i.e. the start
  // of a fixed length vector is always the start of a scalable vector).
  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
    if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
        Op2.isUndef()) {
      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
      return convertFromScalableVector(DAG, VT, Op);
    }

    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));

    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
    }

    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));

    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
    }
  }

  // Avoid producing TBL instruction if we don't know SVE register minimal size,
  // unless NEON is not available and we can assume minimal SVE register size is
  // 128-bits.
  if (MinSVESize || !Subtarget->isNeonAvailable())
    return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
                                     DAG);

  return SDValue();
}

// Bitcast between legal scalable vector types, routing unpacked types through
// their packed equivalents so element placement within the register stays
// correct.
SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(VT.isScalableVector() && isTypeLegal(VT) &&
         InVT.isScalableVector() && isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable vector types!");
  assert(VT.getVectorElementType() != MVT::i1 &&
         InVT.getVectorElementType() != MVT::i1 &&
         "For predicate bitcasts, use getSVEPredicateBitCast");

  if (InVT == VT)
    return Op;

  EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());

  // Safe bitcasting between unpacked vector types of different element counts
  // is currently unsupported because the following is missing the necessary
  // work to ensure the result's elements live where they're supposed to within
  // an SVE register.
  //                01234567
  // e.g. nxv2i32 = XX??XX??
  //      nxv4f16 = X?X?X?X?
  assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
          VT == PackedVT || InVT == PackedInVT) &&
         "Unexpected bitcast!");

  // Pack input if required.
  if (InVT != PackedInVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);

  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);

  // Unpack result if required.
  if (VT != PackedVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  return Op;
}

// Forward to the file-local isAllActivePredicate helper.
bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
                                                 SDValue N) const {
  return ::isAllActivePredicate(DAG, N);
}

// Forward to the file-local getPromotedVTForPredicate helper.
EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
  return ::getPromotedVTForPredicate(VT);
}

// Target hook for demanded-bits simplification of AArch64-specific nodes.
// Returns true when Op was replaced via TLO; false to let the generic
// simplification run.
bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case AArch64ISD::VSHL: {
    // Match (VSHL (VLSHR Val X) X)
    SDValue ShiftL = Op;
    SDValue ShiftR = Op->getOperand(0);
    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
      return false;

    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
      return false;

    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);

    // Other cases can be handled as well, but this is not
    // implemented.
    if (ShiftRBits != ShiftLBits)
      return false;

    unsigned ScalarSize = Op.getScalarValueSizeInBits();
    assert(ScalarSize > ShiftLBits && "Invalid shift imm");

    // The shift pair only clears the low ShiftLBits bits; if none of those
    // bits are demanded the pair is a no-op.
    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
    APInt UnusedBits = ~OriginalDemandedBits;

    if ((ZeroBits & UnusedBits) != ZeroBits)
      return false;

    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
    // used - simplify to just Val.
    return TLO.CombineTo(Op, ShiftR->getOperand(0));
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (auto ElementSize = IsSVECntIntrinsic(Op)) {
      unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
      if (!MaxSVEVectorSizeInBits)
        MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
      unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
      // The SVE count intrinsics don't support the multiplier immediate so we
      // don't have to account for that here. The value returned may be slightly
      // over the true required bits, as this is based on the "ALL" pattern. The
      // other patterns are also exposed by these intrinsics, but they all
      // return a value that's strictly less than "ALL".
      unsigned RequiredBits = llvm::bit_width(MaxElements);
      unsigned BitWidth = Known.Zero.getBitWidth();
      if (RequiredBits < BitWidth)
        Known.Zero.setHighBits(BitWidth - RequiredBits);
      return false;
    }
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

// Nodes the DAG combiner should treat as already-canonical constants.
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         Op.getOpcode() == AArch64ISD::MOVI ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

// Complex deinterleaving is supported with SVE/SVE2 or the FCMA (ComplxNum)
// extension.
bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
         Subtarget->hasComplxNum();
}

bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<VectorType>(Ty);
  if (!VTy)
    return false;

  // If the vector is scalable, SVE is enabled,
  // implying support for complex numbers. Otherwise, we need to ensure
  // complex number support is available.
  if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getElementCount().getKnownMinValue();

  // We can only process vectors that have a bit size of 128 or higher (with an
  // additional 64 bits for Neon). Additionally, these vectors must have a
  // power-of-2 size, as we later split them into the smallest supported size
  // and merging them back together after applying complex operation.
  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
      !llvm::isPowerOf2_32(VTyWidth))
    return false;

  // Integer complex operations are only available via SVE2 (CMLA/CADD).
  if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2()) {
    unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
    return 8 <= ScalarWidth && ScalarWidth <= 64;
  }

  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}

// Emit IR implementing one complex-deinterleaving operation (partial complex
// multiply or complex add) using the AArch64 SVE/NEON complex intrinsics.
// Vectors wider than 128 bits are split in half recursively. Returns nullptr
// when the rotation/operation combination has no direct mapping.
Value *AArch64TargetLowering::createComplexDeinterleavingIR(
    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  VectorType *Ty = cast<VectorType>(InputA->getType());
  bool IsScalable = Ty->isScalableTy();
  bool IsInt = Ty->getElementType()->isIntegerTy();

  unsigned TyWidth =
      Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();

  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
         "Vector type must be either 64 or a power of 2 that is at least 128");

  if (TyWidth > 128) {
    // Split inputs (and accumulator, if any) in half, recurse on each half,
    // then reassemble the full-width result.
    int Stride = Ty->getElementCount().getKnownMinValue() / 2;
    auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
    auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
    auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
    auto *UpperSplitA =
        B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
    auto *UpperSplitB =
        B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;
    if (Accumulator) {
      LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
      UpperSplitAcc =
          B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
    }
    auto *LowerSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
                                        B.getInt64(0));
    return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
  }

  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    // CMLA/FCMLA/VCMLA accumulate; with no accumulator, accumulate onto zero.
    if (Accumulator == nullptr)
      Accumulator = Constant::getNullValue(Ty);

    if (IsScalable) {
      if (IsInt)
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_cmla_x, Ty,
            {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});

      auto *Mask = B.getAllOnesMask(Ty->getElementCount());
      return B.CreateIntrinsic(
          Intrinsic::aarch64_sve_fcmla, Ty,
          {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
    }

    // NEON: rotation is encoded in the intrinsic ID rather than an operand.
    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
                              Intrinsic::aarch64_neon_vcmla_rot90,
                              Intrinsic::aarch64_neon_vcmla_rot180,
                              Intrinsic::aarch64_neon_vcmla_rot270};

    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
                             {Accumulator, InputA, InputB});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
    if (IsScalable) {
      // SVE CADD/FCADD only support the 90 and 270 degree rotations.
      if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
          Rotation == ComplexDeinterleavingRotation::Rotation_270) {
        if (IsInt)
          return B.CreateIntrinsic(
              Intrinsic::aarch64_sve_cadd_x, Ty,
              {InputA, InputB, B.getInt32((int)Rotation * 90)});

        auto *Mask = B.getAllOnesMask(Ty->getElementCount());
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_fcadd, Ty,
            {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
      }
      return nullptr;
    }

    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      IntId = Intrinsic::aarch64_neon_vcadd_rot270;

    if (IntId == Intrinsic::not_intrinsic)
      return nullptr;

    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
  }

  return nullptr;
}

// Prefer keeping an extend vectorised (not scalarising the splat) when any
// user is a MUL, so extend+mul patterns can form widening multiplies.
bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
  unsigned Opc = N->getOpcode();
  if (ISD::isExtOpcode(Opc)) {
    if (any_of(N->uses(),
               [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
      return false;
  }
  return true;
}

// Jump-table threshold is configurable per subtarget.
unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
  return Subtarget->getMinimumJumpTableEntries();
}

// For multi-element fixed-length vectors under SVE VLS, derive the register
// type from the calling-convention breakdown; otherwise defer to the generic
// implementation.
MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT RegisterVT;
  unsigned NumIntermediates;
  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
                                       RegisterVT);
  return RegisterVT;
}

// For multi-element fixed-length vectors under SVE VLS, the register count
// also comes from the calling-convention breakdown; otherwise defer to the
// generic implementation.
unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT VT2;
  unsigned NumIntermediates;
  return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
                                              NumIntermediates, VT2);
}

// Break a vector argument/return down for the calling convention. Wide
// fixed-length register types produced by SVE VLS are re-expressed as
// multiples of 128-bit NEON registers so the ABI is unchanged.
unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  // Generic result already uses registers of at most NEON size: keep it.
  if (!RegisterVT.isFixedLengthVector() ||
      RegisterVT.getFixedSizeInBits() <= 128)
    return NumRegs;

  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");

  // A size mismatch here implies either type promotion or widening and would
  // have resulted in scalarisation if larger vectors had not been available.
  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
    // Scalarise: pass each element in its own (single-element vector or
    // scalar) register.
    EVT EltTy = VT.getVectorElementType();
    EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
    if (!isTypeLegal(NewVT))
      NewVT = EltTy;

    IntermediateVT = NewVT;
    NumIntermediates = VT.getVectorNumElements();
    RegisterVT = getRegisterType(Context, NewVT);
    return NumIntermediates;
  }

  // SVE VLS support does not introduce a new ABI so we should use NEON sized
  // types for vector arguments and returns.
  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;

  switch (RegisterVT.getVectorElementType().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    IntermediateVT = RegisterVT = MVT::v16i8;
    break;
  case MVT::i16:
    IntermediateVT = RegisterVT = MVT::v8i16;
    break;
  case MVT::i32:
    IntermediateVT = RegisterVT = MVT::v4i32;
    break;
  case MVT::i64:
    IntermediateVT = RegisterVT = MVT::v2i64;
    break;
  case MVT::f16:
    IntermediateVT = RegisterVT = MVT::v8f16;
    break;
  case MVT::f32:
    IntermediateVT = RegisterVT = MVT::v4f32;
    break;
  case MVT::f64:
    IntermediateVT = RegisterVT = MVT::v2f64;
    break;
  case MVT::bf16:
    IntermediateVT = RegisterVT = MVT::v8bf16;
    break;
  }

  return NumRegs;
}

// Inline stack probing is used when the function requests stack probing and
// the target is not Windows (which uses its own chkstk scheme).
bool AArch64TargetLowering::hasInlineStackProbe(
    const MachineFunction &MF) const {
  return !Subtarget->isTargetWindows() &&
         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
}