xref: /src/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
101095a5dSDimitry Andric //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
201095a5dSDimitry Andric //
3e6d15924SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e6d15924SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5e6d15924SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
601095a5dSDimitry Andric //
701095a5dSDimitry Andric //===----------------------------------------------------------------------===//
801095a5dSDimitry Andric //
901095a5dSDimitry Andric /// \file
1001095a5dSDimitry Andric /// This pass does misc. AMDGPU optimizations on IR before instruction
1101095a5dSDimitry Andric /// selection.
1201095a5dSDimitry Andric //
1301095a5dSDimitry Andric //===----------------------------------------------------------------------===//
1401095a5dSDimitry Andric 
1501095a5dSDimitry Andric #include "AMDGPU.h"
16a7fe922bSDimitry Andric #include "AMDGPUTargetMachine.h"
177fa27ce4SDimitry Andric #include "SIModeRegisterDefaults.h"
18eb11fae6SDimitry Andric #include "llvm/Analysis/AssumptionCache.h"
19cfca06d7SDimitry Andric #include "llvm/Analysis/ConstantFolding.h"
207fa27ce4SDimitry Andric #include "llvm/Analysis/TargetLibraryInfo.h"
217fa27ce4SDimitry Andric #include "llvm/Analysis/UniformityAnalysis.h"
22eb11fae6SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
23b5630dbaSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
24cfca06d7SDimitry Andric #include "llvm/IR/Dominators.h"
257fa27ce4SDimitry Andric #include "llvm/IR/IRBuilder.h"
267ab83427SDimitry Andric #include "llvm/IR/InstVisitor.h"
27b60736ecSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
287fa27ce4SDimitry Andric #include "llvm/IR/PatternMatch.h"
29706b4fc4SDimitry Andric #include "llvm/InitializePasses.h"
3071d5a254SDimitry Andric #include "llvm/Pass.h"
31b60736ecSDimitry Andric #include "llvm/Support/KnownBits.h"
32cfca06d7SDimitry Andric #include "llvm/Transforms/Utils/IntegerDivision.h"
337fa27ce4SDimitry Andric #include "llvm/Transforms/Utils/Local.h"
3401095a5dSDimitry Andric 
3501095a5dSDimitry Andric #define DEBUG_TYPE "amdgpu-codegenprepare"
3601095a5dSDimitry Andric 
3701095a5dSDimitry Andric using namespace llvm;
387fa27ce4SDimitry Andric using namespace llvm::PatternMatch;
3901095a5dSDimitry Andric 
4001095a5dSDimitry Andric namespace {
4101095a5dSDimitry Andric 
42eb11fae6SDimitry Andric static cl::opt<bool> WidenLoads(
43eb11fae6SDimitry Andric   "amdgpu-codegenprepare-widen-constant-loads",
44eb11fae6SDimitry Andric   cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45eb11fae6SDimitry Andric   cl::ReallyHidden,
46cfca06d7SDimitry Andric   cl::init(false));
47eb11fae6SDimitry Andric 
48b60736ecSDimitry Andric static cl::opt<bool> Widen16BitOps(
49b60736ecSDimitry Andric   "amdgpu-codegenprepare-widen-16-bit-ops",
50b60736ecSDimitry Andric   cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
51b60736ecSDimitry Andric   cl::ReallyHidden,
52b60736ecSDimitry Andric   cl::init(true));
53b60736ecSDimitry Andric 
547fa27ce4SDimitry Andric static cl::opt<bool>
55b1c73532SDimitry Andric     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
567fa27ce4SDimitry Andric                    cl::desc("Break large PHI nodes for DAGISel"),
577fa27ce4SDimitry Andric                    cl::ReallyHidden, cl::init(true));
587fa27ce4SDimitry Andric 
597fa27ce4SDimitry Andric static cl::opt<bool>
60b1c73532SDimitry Andric     ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
617fa27ce4SDimitry Andric                         cl::desc("For testing purposes, always break large "
627fa27ce4SDimitry Andric                                  "PHIs even if it isn't profitable."),
637fa27ce4SDimitry Andric                         cl::ReallyHidden, cl::init(false));
647fa27ce4SDimitry Andric 
65b1c73532SDimitry Andric static cl::opt<unsigned> BreakLargePHIsThreshold(
667fa27ce4SDimitry Andric     "amdgpu-codegenprepare-break-large-phis-threshold",
677fa27ce4SDimitry Andric     cl::desc("Minimum type size in bits for breaking large PHI nodes"),
687fa27ce4SDimitry Andric     cl::ReallyHidden, cl::init(32));
697fa27ce4SDimitry Andric 
701d5ae102SDimitry Andric static cl::opt<bool> UseMul24Intrin(
711d5ae102SDimitry Andric   "amdgpu-codegenprepare-mul24",
721d5ae102SDimitry Andric   cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
731d5ae102SDimitry Andric   cl::ReallyHidden,
741d5ae102SDimitry Andric   cl::init(true));
751d5ae102SDimitry Andric 
76cfca06d7SDimitry Andric // Legalize 64-bit division by using the generic IR expansion.
77cfca06d7SDimitry Andric static cl::opt<bool> ExpandDiv64InIR(
78cfca06d7SDimitry Andric   "amdgpu-codegenprepare-expand-div64",
79cfca06d7SDimitry Andric   cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80cfca06d7SDimitry Andric   cl::ReallyHidden,
81cfca06d7SDimitry Andric   cl::init(false));
82cfca06d7SDimitry Andric 
83cfca06d7SDimitry Andric // Leave all division operations as they are. This supersedes ExpandDiv64InIR
84cfca06d7SDimitry Andric // and is used for testing the legalizer.
85cfca06d7SDimitry Andric static cl::opt<bool> DisableIDivExpand(
86cfca06d7SDimitry Andric   "amdgpu-codegenprepare-disable-idiv-expansion",
87cfca06d7SDimitry Andric   cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88cfca06d7SDimitry Andric   cl::ReallyHidden,
89cfca06d7SDimitry Andric   cl::init(false));
90cfca06d7SDimitry Andric 
917fa27ce4SDimitry Andric // Disable processing of fdiv so we can better test the backend implementations.
927fa27ce4SDimitry Andric static cl::opt<bool> DisableFDivExpand(
937fa27ce4SDimitry Andric   "amdgpu-codegenprepare-disable-fdiv-expansion",
947fa27ce4SDimitry Andric   cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
957fa27ce4SDimitry Andric   cl::ReallyHidden,
967fa27ce4SDimitry Andric   cl::init(false));
977fa27ce4SDimitry Andric 
987fa27ce4SDimitry Andric class AMDGPUCodeGenPrepareImpl
997fa27ce4SDimitry Andric     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
1007fa27ce4SDimitry Andric public:
101eb11fae6SDimitry Andric   const GCNSubtarget *ST = nullptr;
102ac9a064cSDimitry Andric   const AMDGPUTargetMachine *TM = nullptr;
1037fa27ce4SDimitry Andric   const TargetLibraryInfo *TLInfo = nullptr;
104eb11fae6SDimitry Andric   AssumptionCache *AC = nullptr;
105cfca06d7SDimitry Andric   DominatorTree *DT = nullptr;
1067fa27ce4SDimitry Andric   UniformityInfo *UA = nullptr;
10771d5a254SDimitry Andric   Module *Mod = nullptr;
108e6d15924SDimitry Andric   const DataLayout *DL = nullptr;
10971d5a254SDimitry Andric   bool HasUnsafeFPMath = false;
1107fa27ce4SDimitry Andric   bool HasFP32DenormalFlush = false;
1117fa27ce4SDimitry Andric   bool FlowChanged = false;
112b1c73532SDimitry Andric   mutable Function *SqrtF32 = nullptr;
113b1c73532SDimitry Andric   mutable Function *LdexpF32 = nullptr;
1147fa27ce4SDimitry Andric 
1157fa27ce4SDimitry Andric   DenseMap<const PHINode *, bool> BreakPhiNodesCache;
1167fa27ce4SDimitry Andric 
getSqrtF32() const117b1c73532SDimitry Andric   Function *getSqrtF32() const {
118b1c73532SDimitry Andric     if (SqrtF32)
119b1c73532SDimitry Andric       return SqrtF32;
120b1c73532SDimitry Andric 
121b1c73532SDimitry Andric     LLVMContext &Ctx = Mod->getContext();
122b1c73532SDimitry Andric     SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
123b1c73532SDimitry Andric                                         {Type::getFloatTy(Ctx)});
124b1c73532SDimitry Andric     return SqrtF32;
125b1c73532SDimitry Andric   }
126b1c73532SDimitry Andric 
getLdexpF32() const127b1c73532SDimitry Andric   Function *getLdexpF32() const {
128b1c73532SDimitry Andric     if (LdexpF32)
129b1c73532SDimitry Andric       return LdexpF32;
130b1c73532SDimitry Andric 
131b1c73532SDimitry Andric     LLVMContext &Ctx = Mod->getContext();
132b1c73532SDimitry Andric     LdexpF32 = Intrinsic::getDeclaration(
133b1c73532SDimitry Andric         Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
134b1c73532SDimitry Andric     return LdexpF32;
135b1c73532SDimitry Andric   }
136b1c73532SDimitry Andric 
1377fa27ce4SDimitry Andric   bool canBreakPHINode(const PHINode &I);
13801095a5dSDimitry Andric 
139eb11fae6SDimitry Andric   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
140b915e9e0SDimitry Andric   /// binary operation \p V.
141b915e9e0SDimitry Andric   ///
142b915e9e0SDimitry Andric   /// \returns Binary operation \p V.
143b915e9e0SDimitry Andric   /// \returns \p T's base element bit width.
144b915e9e0SDimitry Andric   unsigned getBaseElementBitWidth(const Type *T) const;
145b915e9e0SDimitry Andric 
146b915e9e0SDimitry Andric   /// \returns Equivalent 32 bit integer type for given type \p T. For example,
147b915e9e0SDimitry Andric   /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
148b915e9e0SDimitry Andric   /// is returned.
149b915e9e0SDimitry Andric   Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
150b915e9e0SDimitry Andric 
151b915e9e0SDimitry Andric   /// \returns True if binary operation \p I is a signed binary operation, false
152b915e9e0SDimitry Andric   /// otherwise.
153b915e9e0SDimitry Andric   bool isSigned(const BinaryOperator &I) const;
154b915e9e0SDimitry Andric 
155b915e9e0SDimitry Andric   /// \returns True if the condition of 'select' operation \p I comes from a
156b915e9e0SDimitry Andric   /// signed 'icmp' operation, false otherwise.
157b915e9e0SDimitry Andric   bool isSigned(const SelectInst &I) const;
158b915e9e0SDimitry Andric 
159b915e9e0SDimitry Andric   /// \returns True if type \p T needs to be promoted to 32 bit integer type,
160b915e9e0SDimitry Andric   /// false otherwise.
161b915e9e0SDimitry Andric   bool needsPromotionToI32(const Type *T) const;
162b915e9e0SDimitry Andric 
1637fa27ce4SDimitry Andric   /// Return true if \p T is a legal scalar floating point type.
1647fa27ce4SDimitry Andric   bool isLegalFloatingTy(const Type *T) const;
1657fa27ce4SDimitry Andric 
1667fa27ce4SDimitry Andric   /// Wrapper to pass all the arguments to computeKnownFPClass
computeKnownFPClass(const Value * V,FPClassTest Interested,const Instruction * CtxI) const1677fa27ce4SDimitry Andric   KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
1687fa27ce4SDimitry Andric                                    const Instruction *CtxI) const {
1697fa27ce4SDimitry Andric     return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
1707fa27ce4SDimitry Andric                                      DT);
1717fa27ce4SDimitry Andric   }
1727fa27ce4SDimitry Andric 
canIgnoreDenormalInput(const Value * V,const Instruction * CtxI) const1737fa27ce4SDimitry Andric   bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
1747fa27ce4SDimitry Andric     return HasFP32DenormalFlush ||
1757fa27ce4SDimitry Andric            computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
1767fa27ce4SDimitry Andric   }
1777fa27ce4SDimitry Andric 
178eb11fae6SDimitry Andric   /// Promotes uniform binary operation \p I to equivalent 32 bit binary
179b915e9e0SDimitry Andric   /// operation.
180b915e9e0SDimitry Andric   ///
181b915e9e0SDimitry Andric   /// \details \p I's base element bit width must be greater than 1 and less
182b915e9e0SDimitry Andric   /// than or equal 16. Promotion is done by sign or zero extending operands to
183b915e9e0SDimitry Andric   /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
184b915e9e0SDimitry Andric   /// truncating the result of 32 bit binary operation back to \p I's original
185b915e9e0SDimitry Andric   /// type. Division operation is not promoted.
186b915e9e0SDimitry Andric   ///
187b915e9e0SDimitry Andric   /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
188b915e9e0SDimitry Andric   /// false otherwise.
189b915e9e0SDimitry Andric   bool promoteUniformOpToI32(BinaryOperator &I) const;
190b915e9e0SDimitry Andric 
191eb11fae6SDimitry Andric   /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
192b915e9e0SDimitry Andric   ///
193b915e9e0SDimitry Andric   /// \details \p I's base element bit width must be greater than 1 and less
194b915e9e0SDimitry Andric   /// than or equal 16. Promotion is done by sign or zero extending operands to
195b915e9e0SDimitry Andric   /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
196b915e9e0SDimitry Andric   ///
197b915e9e0SDimitry Andric   /// \returns True.
198b915e9e0SDimitry Andric   bool promoteUniformOpToI32(ICmpInst &I) const;
199b915e9e0SDimitry Andric 
200eb11fae6SDimitry Andric   /// Promotes uniform 'select' operation \p I to 32 bit 'select'
201b915e9e0SDimitry Andric   /// operation.
202b915e9e0SDimitry Andric   ///
203b915e9e0SDimitry Andric   /// \details \p I's base element bit width must be greater than 1 and less
204b915e9e0SDimitry Andric   /// than or equal 16. Promotion is done by sign or zero extending operands to
205b915e9e0SDimitry Andric   /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
206b915e9e0SDimitry Andric   /// result of 32 bit 'select' operation back to \p I's original type.
207b915e9e0SDimitry Andric   ///
208b915e9e0SDimitry Andric   /// \returns True.
209b915e9e0SDimitry Andric   bool promoteUniformOpToI32(SelectInst &I) const;
210b915e9e0SDimitry Andric 
211eb11fae6SDimitry Andric   /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
212b915e9e0SDimitry Andric   /// intrinsic.
213b915e9e0SDimitry Andric   ///
214b915e9e0SDimitry Andric   /// \details \p I's base element bit width must be greater than 1 and less
215b915e9e0SDimitry Andric   /// than or equal 16. Promotion is done by zero extending the operand to 32
216b915e9e0SDimitry Andric   /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
217b915e9e0SDimitry Andric   /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
218b915e9e0SDimitry Andric   /// shift amount is 32 minus \p I's base element bit width), and truncating
219b915e9e0SDimitry Andric   /// the result of the shift operation back to \p I's original type.
220b915e9e0SDimitry Andric   ///
221b915e9e0SDimitry Andric   /// \returns True.
222b915e9e0SDimitry Andric   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
223eb11fae6SDimitry Andric 
224c0981da4SDimitry Andric   /// \returns The minimum number of bits needed to store the value of \Op as an
225c0981da4SDimitry Andric   /// unsigned integer. Truncating to this size and then zero-extending to
2266f8fc217SDimitry Andric   /// the original will not change the value.
2276f8fc217SDimitry Andric   unsigned numBitsUnsigned(Value *Op) const;
228c0981da4SDimitry Andric 
229c0981da4SDimitry Andric   /// \returns The minimum number of bits needed to store the value of \Op as a
230c0981da4SDimitry Andric   /// signed integer. Truncating to this size and then sign-extending to
2316f8fc217SDimitry Andric   /// the original size will not change the value.
2326f8fc217SDimitry Andric   unsigned numBitsSigned(Value *Op) const;
233e6d15924SDimitry Andric 
234e6d15924SDimitry Andric   /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
235e6d15924SDimitry Andric   /// SelectionDAG has an issue where an and asserting the bits are known
236e6d15924SDimitry Andric   bool replaceMulWithMul24(BinaryOperator &I) const;
237e6d15924SDimitry Andric 
238cfca06d7SDimitry Andric   /// Perform same function as equivalently named function in DAGCombiner. Since
239cfca06d7SDimitry Andric   /// we expand some divisions here, we need to perform this before obscuring.
240cfca06d7SDimitry Andric   bool foldBinOpIntoSelect(BinaryOperator &I) const;
241cfca06d7SDimitry Andric 
242cfca06d7SDimitry Andric   bool divHasSpecialOptimization(BinaryOperator &I,
243cfca06d7SDimitry Andric                                  Value *Num, Value *Den) const;
244cfca06d7SDimitry Andric   int getDivNumBits(BinaryOperator &I,
245cfca06d7SDimitry Andric                     Value *Num, Value *Den,
246cfca06d7SDimitry Andric                     unsigned AtLeast, bool Signed) const;
247cfca06d7SDimitry Andric 
248eb11fae6SDimitry Andric   /// Expands 24 bit div or rem.
249eb11fae6SDimitry Andric   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
250eb11fae6SDimitry Andric                         Value *Num, Value *Den,
251eb11fae6SDimitry Andric                         bool IsDiv, bool IsSigned) const;
252eb11fae6SDimitry Andric 
253cfca06d7SDimitry Andric   Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
254cfca06d7SDimitry Andric                             Value *Num, Value *Den, unsigned NumBits,
255cfca06d7SDimitry Andric                             bool IsDiv, bool IsSigned) const;
256cfca06d7SDimitry Andric 
257eb11fae6SDimitry Andric   /// Expands 32 bit div or rem.
258eb11fae6SDimitry Andric   Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
259eb11fae6SDimitry Andric                         Value *Num, Value *Den) const;
260eb11fae6SDimitry Andric 
261cfca06d7SDimitry Andric   Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
262cfca06d7SDimitry Andric                         Value *Num, Value *Den) const;
263cfca06d7SDimitry Andric   void expandDivRem64(BinaryOperator &I) const;
264cfca06d7SDimitry Andric 
265eb11fae6SDimitry Andric   /// Widen a scalar load.
266044eb2f6SDimitry Andric   ///
267044eb2f6SDimitry Andric   /// \details \p Widen scalar load for uniform, small type loads from constant
268044eb2f6SDimitry Andric   //  memory / to a full 32-bits and then truncate the input to allow a scalar
269044eb2f6SDimitry Andric   //  load instead of a vector load.
270044eb2f6SDimitry Andric   //
271044eb2f6SDimitry Andric   /// \returns True.
272044eb2f6SDimitry Andric 
273044eb2f6SDimitry Andric   bool canWidenScalarExtLoad(LoadInst &I) const;
274b915e9e0SDimitry Andric 
2757fa27ce4SDimitry Andric   Value *matchFractPat(IntrinsicInst &I);
2767fa27ce4SDimitry Andric   Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
2777fa27ce4SDimitry Andric 
2787fa27ce4SDimitry Andric   bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
2797fa27ce4SDimitry Andric                           FastMathFlags SqrtFMF) const;
2807fa27ce4SDimitry Andric 
2817fa27ce4SDimitry Andric   Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
2827fa27ce4SDimitry Andric                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
2837fa27ce4SDimitry Andric                          const Instruction *CtxI) const;
2847fa27ce4SDimitry Andric 
2857fa27ce4SDimitry Andric   Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
2867fa27ce4SDimitry Andric                          FastMathFlags FMF, const Instruction *CtxI) const;
2877fa27ce4SDimitry Andric   Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
2887fa27ce4SDimitry Andric                               float ReqdAccuracy) const;
2897fa27ce4SDimitry Andric 
2907fa27ce4SDimitry Andric   Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
2917fa27ce4SDimitry Andric                           FastMathFlags DivFMF, FastMathFlags SqrtFMF,
2927fa27ce4SDimitry Andric                           Value *RsqOp, const Instruction *FDiv,
2937fa27ce4SDimitry Andric                           float ReqdAccuracy) const;
2947fa27ce4SDimitry Andric 
2957fa27ce4SDimitry Andric   std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
2967fa27ce4SDimitry Andric                                               Value *Src) const;
2977fa27ce4SDimitry Andric 
2987fa27ce4SDimitry Andric   Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
2997fa27ce4SDimitry Andric                          bool IsNegative) const;
3007fa27ce4SDimitry Andric   Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
3017fa27ce4SDimitry Andric                       FastMathFlags FMF) const;
302b1c73532SDimitry Andric   Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
303b1c73532SDimitry Andric                           FastMathFlags FMF) const;
3047fa27ce4SDimitry Andric 
30501095a5dSDimitry Andric public:
306a7fe922bSDimitry Andric   bool visitFDiv(BinaryOperator &I);
307a7fe922bSDimitry Andric 
visitInstruction(Instruction & I)308b915e9e0SDimitry Andric   bool visitInstruction(Instruction &I) { return false; }
309b915e9e0SDimitry Andric   bool visitBinaryOperator(BinaryOperator &I);
310044eb2f6SDimitry Andric   bool visitLoadInst(LoadInst &I);
311b915e9e0SDimitry Andric   bool visitICmpInst(ICmpInst &I);
312b915e9e0SDimitry Andric   bool visitSelectInst(SelectInst &I);
3137fa27ce4SDimitry Andric   bool visitPHINode(PHINode &I);
314ac9a064cSDimitry Andric   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
315b915e9e0SDimitry Andric 
316b915e9e0SDimitry Andric   bool visitIntrinsicInst(IntrinsicInst &I);
317b915e9e0SDimitry Andric   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
3187fa27ce4SDimitry Andric   bool visitMinNum(IntrinsicInst &I);
319b1c73532SDimitry Andric   bool visitSqrt(IntrinsicInst &I);
3207fa27ce4SDimitry Andric   bool run(Function &F);
3217fa27ce4SDimitry Andric };
32201095a5dSDimitry Andric 
3237fa27ce4SDimitry Andric class AMDGPUCodeGenPrepare : public FunctionPass {
3247fa27ce4SDimitry Andric private:
3257fa27ce4SDimitry Andric   AMDGPUCodeGenPrepareImpl Impl;
32601095a5dSDimitry Andric 
3277fa27ce4SDimitry Andric public:
3287fa27ce4SDimitry Andric   static char ID;
AMDGPUCodeGenPrepare()3297fa27ce4SDimitry Andric   AMDGPUCodeGenPrepare() : FunctionPass(ID) {
3307fa27ce4SDimitry Andric     initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
3317fa27ce4SDimitry Andric   }
getAnalysisUsage(AnalysisUsage & AU) const33201095a5dSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
333eb11fae6SDimitry Andric     AU.addRequired<AssumptionCacheTracker>();
3347fa27ce4SDimitry Andric     AU.addRequired<UniformityInfoWrapperPass>();
3357fa27ce4SDimitry Andric     AU.addRequired<TargetLibraryInfoWrapperPass>();
336cfca06d7SDimitry Andric 
337cfca06d7SDimitry Andric     // FIXME: Division expansion needs to preserve the dominator tree.
338cfca06d7SDimitry Andric     if (!ExpandDiv64InIR)
33901095a5dSDimitry Andric       AU.setPreservesAll();
34001095a5dSDimitry Andric   }
3417fa27ce4SDimitry Andric   bool runOnFunction(Function &F) override;
3427fa27ce4SDimitry Andric   bool doInitialization(Module &M) override;
getPassName() const3437fa27ce4SDimitry Andric   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
34401095a5dSDimitry Andric };
34501095a5dSDimitry Andric 
34671d5a254SDimitry Andric } // end anonymous namespace
347b915e9e0SDimitry Andric 
run(Function & F)3487fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
349b1c73532SDimitry Andric   BreakPhiNodesCache.clear();
3507fa27ce4SDimitry Andric   bool MadeChange = false;
3517fa27ce4SDimitry Andric 
3527fa27ce4SDimitry Andric   Function::iterator NextBB;
3537fa27ce4SDimitry Andric   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
3547fa27ce4SDimitry Andric     BasicBlock *BB = &*FI;
3557fa27ce4SDimitry Andric     NextBB = std::next(FI);
3567fa27ce4SDimitry Andric 
3577fa27ce4SDimitry Andric     BasicBlock::iterator Next;
3587fa27ce4SDimitry Andric     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
3597fa27ce4SDimitry Andric          I = Next) {
3607fa27ce4SDimitry Andric       Next = std::next(I);
3617fa27ce4SDimitry Andric 
3627fa27ce4SDimitry Andric       MadeChange |= visit(*I);
3637fa27ce4SDimitry Andric 
3647fa27ce4SDimitry Andric       if (Next != E) { // Control flow changed
3657fa27ce4SDimitry Andric         BasicBlock *NextInstBB = Next->getParent();
3667fa27ce4SDimitry Andric         if (NextInstBB != BB) {
3677fa27ce4SDimitry Andric           BB = NextInstBB;
3687fa27ce4SDimitry Andric           E = BB->end();
3697fa27ce4SDimitry Andric           FE = F.end();
3707fa27ce4SDimitry Andric         }
3717fa27ce4SDimitry Andric       }
3727fa27ce4SDimitry Andric     }
3737fa27ce4SDimitry Andric   }
3747fa27ce4SDimitry Andric   return MadeChange;
3757fa27ce4SDimitry Andric }
3767fa27ce4SDimitry Andric 
getBaseElementBitWidth(const Type * T) const3777fa27ce4SDimitry Andric unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
378b915e9e0SDimitry Andric   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
379b915e9e0SDimitry Andric 
380b915e9e0SDimitry Andric   if (T->isIntegerTy())
381b915e9e0SDimitry Andric     return T->getIntegerBitWidth();
382b915e9e0SDimitry Andric   return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
383b915e9e0SDimitry Andric }
384b915e9e0SDimitry Andric 
getI32Ty(IRBuilder<> & B,const Type * T) const3857fa27ce4SDimitry Andric Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
386b915e9e0SDimitry Andric   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
387b915e9e0SDimitry Andric 
388b915e9e0SDimitry Andric   if (T->isIntegerTy())
389b915e9e0SDimitry Andric     return B.getInt32Ty();
390cfca06d7SDimitry Andric   return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
391b915e9e0SDimitry Andric }
392b915e9e0SDimitry Andric 
isSigned(const BinaryOperator & I) const3937fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
394b915e9e0SDimitry Andric   return I.getOpcode() == Instruction::AShr ||
395b915e9e0SDimitry Andric       I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
396b915e9e0SDimitry Andric }
397b915e9e0SDimitry Andric 
isSigned(const SelectInst & I) const3987fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
399b915e9e0SDimitry Andric   return isa<ICmpInst>(I.getOperand(0)) ?
400b915e9e0SDimitry Andric       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
401b915e9e0SDimitry Andric }
402b915e9e0SDimitry Andric 
needsPromotionToI32(const Type * T) const4037fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
404b60736ecSDimitry Andric   if (!Widen16BitOps)
405b60736ecSDimitry Andric     return false;
406b60736ecSDimitry Andric 
40771d5a254SDimitry Andric   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
40871d5a254SDimitry Andric   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
409b915e9e0SDimitry Andric     return true;
41071d5a254SDimitry Andric 
41171d5a254SDimitry Andric   if (const VectorType *VT = dyn_cast<VectorType>(T)) {
41271d5a254SDimitry Andric     // TODO: The set of packed operations is more limited, so may want to
41371d5a254SDimitry Andric     // promote some anyway.
41471d5a254SDimitry Andric     if (ST->hasVOP3PInsts())
415b915e9e0SDimitry Andric       return false;
41671d5a254SDimitry Andric 
41771d5a254SDimitry Andric     return needsPromotionToI32(VT->getElementType());
41871d5a254SDimitry Andric   }
41971d5a254SDimitry Andric 
42071d5a254SDimitry Andric   return false;
42171d5a254SDimitry Andric }
42271d5a254SDimitry Andric 
isLegalFloatingTy(const Type * Ty) const4237fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
4247fa27ce4SDimitry Andric   return Ty->isFloatTy() || Ty->isDoubleTy() ||
4257fa27ce4SDimitry Andric          (Ty->isHalfTy() && ST->has16BitInsts());
4267fa27ce4SDimitry Andric }
4277fa27ce4SDimitry Andric 
42871d5a254SDimitry Andric // Return true if the op promoted to i32 should have nsw set.
promotedOpIsNSW(const Instruction & I)42971d5a254SDimitry Andric static bool promotedOpIsNSW(const Instruction &I) {
43071d5a254SDimitry Andric   switch (I.getOpcode()) {
43171d5a254SDimitry Andric   case Instruction::Shl:
43271d5a254SDimitry Andric   case Instruction::Add:
43371d5a254SDimitry Andric   case Instruction::Sub:
43471d5a254SDimitry Andric     return true;
43571d5a254SDimitry Andric   case Instruction::Mul:
43671d5a254SDimitry Andric     return I.hasNoUnsignedWrap();
43771d5a254SDimitry Andric   default:
43871d5a254SDimitry Andric     return false;
43971d5a254SDimitry Andric   }
44071d5a254SDimitry Andric }
44171d5a254SDimitry Andric 
44271d5a254SDimitry Andric // Return true if the op promoted to i32 should have nuw set.
promotedOpIsNUW(const Instruction & I)44371d5a254SDimitry Andric static bool promotedOpIsNUW(const Instruction &I) {
44471d5a254SDimitry Andric   switch (I.getOpcode()) {
44571d5a254SDimitry Andric   case Instruction::Shl:
44671d5a254SDimitry Andric   case Instruction::Add:
44771d5a254SDimitry Andric   case Instruction::Mul:
44871d5a254SDimitry Andric     return true;
44971d5a254SDimitry Andric   case Instruction::Sub:
45071d5a254SDimitry Andric     return I.hasNoUnsignedWrap();
45171d5a254SDimitry Andric   default:
45271d5a254SDimitry Andric     return false;
45371d5a254SDimitry Andric   }
454b915e9e0SDimitry Andric }
455b915e9e0SDimitry Andric 
canWidenScalarExtLoad(LoadInst & I) const4567fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
457044eb2f6SDimitry Andric   Type *Ty = I.getType();
458044eb2f6SDimitry Andric   const DataLayout &DL = Mod->getDataLayout();
459044eb2f6SDimitry Andric   int TySize = DL.getTypeSizeInBits(Ty);
460cfca06d7SDimitry Andric   Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
461044eb2f6SDimitry Andric 
4627fa27ce4SDimitry Andric   return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
463044eb2f6SDimitry Andric }
464044eb2f6SDimitry Andric 
promoteUniformOpToI32(BinaryOperator & I) const4657fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
466b915e9e0SDimitry Andric   assert(needsPromotionToI32(I.getType()) &&
467b915e9e0SDimitry Andric          "I does not need promotion to i32");
468b915e9e0SDimitry Andric 
469b915e9e0SDimitry Andric   if (I.getOpcode() == Instruction::SDiv ||
470eb11fae6SDimitry Andric       I.getOpcode() == Instruction::UDiv ||
471eb11fae6SDimitry Andric       I.getOpcode() == Instruction::SRem ||
472eb11fae6SDimitry Andric       I.getOpcode() == Instruction::URem)
473b915e9e0SDimitry Andric     return false;
474b915e9e0SDimitry Andric 
475b915e9e0SDimitry Andric   IRBuilder<> Builder(&I);
476b915e9e0SDimitry Andric   Builder.SetCurrentDebugLocation(I.getDebugLoc());
477b915e9e0SDimitry Andric 
478b915e9e0SDimitry Andric   Type *I32Ty = getI32Ty(Builder, I.getType());
479b915e9e0SDimitry Andric   Value *ExtOp0 = nullptr;
480b915e9e0SDimitry Andric   Value *ExtOp1 = nullptr;
481b915e9e0SDimitry Andric   Value *ExtRes = nullptr;
482b915e9e0SDimitry Andric   Value *TruncRes = nullptr;
483b915e9e0SDimitry Andric 
484b915e9e0SDimitry Andric   if (isSigned(I)) {
485b915e9e0SDimitry Andric     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
486b915e9e0SDimitry Andric     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
487b915e9e0SDimitry Andric   } else {
488b915e9e0SDimitry Andric     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
489b915e9e0SDimitry Andric     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
490b915e9e0SDimitry Andric   }
49171d5a254SDimitry Andric 
49271d5a254SDimitry Andric   ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
49371d5a254SDimitry Andric   if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
49471d5a254SDimitry Andric     if (promotedOpIsNSW(cast<Instruction>(I)))
49571d5a254SDimitry Andric       Inst->setHasNoSignedWrap();
49671d5a254SDimitry Andric 
49771d5a254SDimitry Andric     if (promotedOpIsNUW(cast<Instruction>(I)))
49871d5a254SDimitry Andric       Inst->setHasNoUnsignedWrap();
49971d5a254SDimitry Andric 
50071d5a254SDimitry Andric     if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
50171d5a254SDimitry Andric       Inst->setIsExact(ExactOp->isExact());
50271d5a254SDimitry Andric   }
50371d5a254SDimitry Andric 
504b915e9e0SDimitry Andric   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
505b915e9e0SDimitry Andric 
506b915e9e0SDimitry Andric   I.replaceAllUsesWith(TruncRes);
507b915e9e0SDimitry Andric   I.eraseFromParent();
508b915e9e0SDimitry Andric 
509b915e9e0SDimitry Andric   return true;
510b915e9e0SDimitry Andric }
511b915e9e0SDimitry Andric 
promoteUniformOpToI32(ICmpInst & I) const5127fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
513b915e9e0SDimitry Andric   assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
514b915e9e0SDimitry Andric          "I does not need promotion to i32");
515b915e9e0SDimitry Andric 
516b915e9e0SDimitry Andric   IRBuilder<> Builder(&I);
517b915e9e0SDimitry Andric   Builder.SetCurrentDebugLocation(I.getDebugLoc());
518b915e9e0SDimitry Andric 
519b915e9e0SDimitry Andric   Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
520b915e9e0SDimitry Andric   Value *ExtOp0 = nullptr;
521b915e9e0SDimitry Andric   Value *ExtOp1 = nullptr;
522b915e9e0SDimitry Andric   Value *NewICmp  = nullptr;
523b915e9e0SDimitry Andric 
524b915e9e0SDimitry Andric   if (I.isSigned()) {
525b915e9e0SDimitry Andric     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
526b915e9e0SDimitry Andric     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
527b915e9e0SDimitry Andric   } else {
528b915e9e0SDimitry Andric     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
529b915e9e0SDimitry Andric     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
530b915e9e0SDimitry Andric   }
531b915e9e0SDimitry Andric   NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
532b915e9e0SDimitry Andric 
533b915e9e0SDimitry Andric   I.replaceAllUsesWith(NewICmp);
534b915e9e0SDimitry Andric   I.eraseFromParent();
535b915e9e0SDimitry Andric 
536b915e9e0SDimitry Andric   return true;
537b915e9e0SDimitry Andric }
538b915e9e0SDimitry Andric 
promoteUniformOpToI32(SelectInst & I) const5397fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
540b915e9e0SDimitry Andric   assert(needsPromotionToI32(I.getType()) &&
541b915e9e0SDimitry Andric          "I does not need promotion to i32");
542b915e9e0SDimitry Andric 
543b915e9e0SDimitry Andric   IRBuilder<> Builder(&I);
544b915e9e0SDimitry Andric   Builder.SetCurrentDebugLocation(I.getDebugLoc());
545b915e9e0SDimitry Andric 
546b915e9e0SDimitry Andric   Type *I32Ty = getI32Ty(Builder, I.getType());
547b915e9e0SDimitry Andric   Value *ExtOp1 = nullptr;
548b915e9e0SDimitry Andric   Value *ExtOp2 = nullptr;
549b915e9e0SDimitry Andric   Value *ExtRes = nullptr;
550b915e9e0SDimitry Andric   Value *TruncRes = nullptr;
551b915e9e0SDimitry Andric 
552b915e9e0SDimitry Andric   if (isSigned(I)) {
553b915e9e0SDimitry Andric     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
554b915e9e0SDimitry Andric     ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
555b915e9e0SDimitry Andric   } else {
556b915e9e0SDimitry Andric     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
557b915e9e0SDimitry Andric     ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
558b915e9e0SDimitry Andric   }
559b915e9e0SDimitry Andric   ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
560b915e9e0SDimitry Andric   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
561b915e9e0SDimitry Andric 
562b915e9e0SDimitry Andric   I.replaceAllUsesWith(TruncRes);
563b915e9e0SDimitry Andric   I.eraseFromParent();
564b915e9e0SDimitry Andric 
565b915e9e0SDimitry Andric   return true;
566b915e9e0SDimitry Andric }
567b915e9e0SDimitry Andric 
promoteUniformBitreverseToI32(IntrinsicInst & I) const5687fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
569b915e9e0SDimitry Andric     IntrinsicInst &I) const {
570b915e9e0SDimitry Andric   assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
571b915e9e0SDimitry Andric          "I must be bitreverse intrinsic");
572b915e9e0SDimitry Andric   assert(needsPromotionToI32(I.getType()) &&
573b915e9e0SDimitry Andric          "I does not need promotion to i32");
574b915e9e0SDimitry Andric 
575b915e9e0SDimitry Andric   IRBuilder<> Builder(&I);
576b915e9e0SDimitry Andric   Builder.SetCurrentDebugLocation(I.getDebugLoc());
577b915e9e0SDimitry Andric 
578b915e9e0SDimitry Andric   Type *I32Ty = getI32Ty(Builder, I.getType());
579b915e9e0SDimitry Andric   Function *I32 =
580b915e9e0SDimitry Andric       Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
581b915e9e0SDimitry Andric   Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
582b915e9e0SDimitry Andric   Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
583b915e9e0SDimitry Andric   Value *LShrOp =
584b915e9e0SDimitry Andric       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
585b915e9e0SDimitry Andric   Value *TruncRes =
586b915e9e0SDimitry Andric       Builder.CreateTrunc(LShrOp, I.getType());
587b915e9e0SDimitry Andric 
588b915e9e0SDimitry Andric   I.replaceAllUsesWith(TruncRes);
589b915e9e0SDimitry Andric   I.eraseFromParent();
590b915e9e0SDimitry Andric 
591b915e9e0SDimitry Andric   return true;
592b915e9e0SDimitry Andric }
593b915e9e0SDimitry Andric 
numBitsUnsigned(Value * Op) const5947fa27ce4SDimitry Andric unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
5956f8fc217SDimitry Andric   return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
596e6d15924SDimitry Andric }
597e6d15924SDimitry Andric 
numBitsSigned(Value * Op) const5987fa27ce4SDimitry Andric unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
5996f8fc217SDimitry Andric   return ComputeMaxSignificantBits(Op, *DL, 0, AC);
600e6d15924SDimitry Andric }
601e6d15924SDimitry Andric 
extractValues(IRBuilder<> & Builder,SmallVectorImpl<Value * > & Values,Value * V)602e6d15924SDimitry Andric static void extractValues(IRBuilder<> &Builder,
603e6d15924SDimitry Andric                           SmallVectorImpl<Value *> &Values, Value *V) {
604cfca06d7SDimitry Andric   auto *VT = dyn_cast<FixedVectorType>(V->getType());
605e6d15924SDimitry Andric   if (!VT) {
606e6d15924SDimitry Andric     Values.push_back(V);
607e6d15924SDimitry Andric     return;
608e6d15924SDimitry Andric   }
609e6d15924SDimitry Andric 
610e6d15924SDimitry Andric   for (int I = 0, E = VT->getNumElements(); I != E; ++I)
611e6d15924SDimitry Andric     Values.push_back(Builder.CreateExtractElement(V, I));
612e6d15924SDimitry Andric }
613e6d15924SDimitry Andric 
insertValues(IRBuilder<> & Builder,Type * Ty,SmallVectorImpl<Value * > & Values)614e6d15924SDimitry Andric static Value *insertValues(IRBuilder<> &Builder,
615e6d15924SDimitry Andric                            Type *Ty,
616e6d15924SDimitry Andric                            SmallVectorImpl<Value *> &Values) {
617e3b55780SDimitry Andric   if (!Ty->isVectorTy()) {
618e3b55780SDimitry Andric     assert(Values.size() == 1);
619e6d15924SDimitry Andric     return Values[0];
620e3b55780SDimitry Andric   }
621e6d15924SDimitry Andric 
622e3b55780SDimitry Andric   Value *NewVal = PoisonValue::get(Ty);
623e6d15924SDimitry Andric   for (int I = 0, E = Values.size(); I != E; ++I)
624e6d15924SDimitry Andric     NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
625e6d15924SDimitry Andric 
626e6d15924SDimitry Andric   return NewVal;
627e6d15924SDimitry Andric }
628e6d15924SDimitry Andric 
replaceMulWithMul24(BinaryOperator & I) const6297fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
630e6d15924SDimitry Andric   if (I.getOpcode() != Instruction::Mul)
631e6d15924SDimitry Andric     return false;
632e6d15924SDimitry Andric 
633e6d15924SDimitry Andric   Type *Ty = I.getType();
634e6d15924SDimitry Andric   unsigned Size = Ty->getScalarSizeInBits();
635e6d15924SDimitry Andric   if (Size <= 16 && ST->has16BitInsts())
636e6d15924SDimitry Andric     return false;
637e6d15924SDimitry Andric 
638e6d15924SDimitry Andric   // Prefer scalar if this could be s_mul_i32
6397fa27ce4SDimitry Andric   if (UA->isUniform(&I))
640e6d15924SDimitry Andric     return false;
641e6d15924SDimitry Andric 
642e6d15924SDimitry Andric   Value *LHS = I.getOperand(0);
643e6d15924SDimitry Andric   Value *RHS = I.getOperand(1);
644e6d15924SDimitry Andric   IRBuilder<> Builder(&I);
645e6d15924SDimitry Andric   Builder.SetCurrentDebugLocation(I.getDebugLoc());
646e6d15924SDimitry Andric 
647c0981da4SDimitry Andric   unsigned LHSBits = 0, RHSBits = 0;
648c0981da4SDimitry Andric   bool IsSigned = false;
649e6d15924SDimitry Andric 
6506f8fc217SDimitry Andric   if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
6516f8fc217SDimitry Andric       (RHSBits = numBitsUnsigned(RHS)) <= 24) {
652c0981da4SDimitry Andric     IsSigned = false;
653c0981da4SDimitry Andric 
6546f8fc217SDimitry Andric   } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
6556f8fc217SDimitry Andric              (RHSBits = numBitsSigned(RHS)) <= 24) {
656c0981da4SDimitry Andric     IsSigned = true;
657c0981da4SDimitry Andric 
658e6d15924SDimitry Andric   } else
659e6d15924SDimitry Andric     return false;
660e6d15924SDimitry Andric 
661e6d15924SDimitry Andric   SmallVector<Value *, 4> LHSVals;
662e6d15924SDimitry Andric   SmallVector<Value *, 4> RHSVals;
663e6d15924SDimitry Andric   SmallVector<Value *, 4> ResultVals;
664e6d15924SDimitry Andric   extractValues(Builder, LHSVals, LHS);
665e6d15924SDimitry Andric   extractValues(Builder, RHSVals, RHS);
666e6d15924SDimitry Andric 
667e6d15924SDimitry Andric   IntegerType *I32Ty = Builder.getInt32Ty();
668b1c73532SDimitry Andric   IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
669b1c73532SDimitry Andric   Type *DstTy = LHSVals[0]->getType();
670b1c73532SDimitry Andric 
671e6d15924SDimitry Andric   for (int I = 0, E = LHSVals.size(); I != E; ++I) {
672b1c73532SDimitry Andric     Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
673b1c73532SDimitry Andric                           : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
674b1c73532SDimitry Andric     Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
675b1c73532SDimitry Andric                           : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
676b1c73532SDimitry Andric     Intrinsic::ID ID =
677b1c73532SDimitry Andric         IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
678b1c73532SDimitry Andric     Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
679b1c73532SDimitry Andric     Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
680b1c73532SDimitry Andric                       : Builder.CreateZExtOrTrunc(Result, DstTy);
681b1c73532SDimitry Andric     ResultVals.push_back(Result);
682e6d15924SDimitry Andric   }
683e6d15924SDimitry Andric 
6841d5ae102SDimitry Andric   Value *NewVal = insertValues(Builder, Ty, ResultVals);
6851d5ae102SDimitry Andric   NewVal->takeName(&I);
6861d5ae102SDimitry Andric   I.replaceAllUsesWith(NewVal);
687e6d15924SDimitry Andric   I.eraseFromParent();
688e6d15924SDimitry Andric 
689e6d15924SDimitry Andric   return true;
690e6d15924SDimitry Andric }
691e6d15924SDimitry Andric 
692cfca06d7SDimitry Andric // Find a select instruction, which may have been casted. This is mostly to deal
693cfca06d7SDimitry Andric // with cases where i16 selects were promoted here to i32.
findSelectThroughCast(Value * V,CastInst * & Cast)694cfca06d7SDimitry Andric static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
695cfca06d7SDimitry Andric   Cast = nullptr;
696cfca06d7SDimitry Andric   if (SelectInst *Sel = dyn_cast<SelectInst>(V))
697cfca06d7SDimitry Andric     return Sel;
698eb11fae6SDimitry Andric 
699cfca06d7SDimitry Andric   if ((Cast = dyn_cast<CastInst>(V))) {
700cfca06d7SDimitry Andric     if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
701cfca06d7SDimitry Andric       return Sel;
702a7fe922bSDimitry Andric   }
703a7fe922bSDimitry Andric 
704cfca06d7SDimitry Andric   return nullptr;
705cfca06d7SDimitry Andric }
706a7fe922bSDimitry Andric 
// Fold a binary operator into a select when one operand is a single-use
// select (possibly behind a single-use cast) with constant arms and the
// other operand is constant: constant-fold the op against each arm and
// replace the whole pattern with one select of the folded constants.
// Returns true if the binop was eliminated.
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    // Operand 0 didn't match; try the other side and remember which one held
    // the select.
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  // All three participating values must be constants for folding to work.
  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    // Push the intervening cast into the select arms before folding the op.
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  // SelOpNo picks the operand order so non-commutative ops fold correctly.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  // Preserve fast-math flags on the replacement select for FP ops.
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  // Erase in use order: binop first (above), then the cast, then the select.
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}
767cfca06d7SDimitry Andric 
7687fa27ce4SDimitry Andric std::pair<Value *, Value *>
getFrexpResults(IRBuilder<> & Builder,Value * Src) const7697fa27ce4SDimitry Andric AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
7707fa27ce4SDimitry Andric                                           Value *Src) const {
7717fa27ce4SDimitry Andric   Type *Ty = Src->getType();
7727fa27ce4SDimitry Andric   Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
7737fa27ce4SDimitry Andric                                          {Ty, Builder.getInt32Ty()}, Src);
7747fa27ce4SDimitry Andric   Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
7757fa27ce4SDimitry Andric 
7767fa27ce4SDimitry Andric   // Bypass the bug workaround for the exponent result since it doesn't matter.
7777fa27ce4SDimitry Andric   // TODO: Does the bug workaround even really need to consider the exponent
7787fa27ce4SDimitry Andric   // result? It's unspecified by the spec.
7797fa27ce4SDimitry Andric 
7807fa27ce4SDimitry Andric   Value *FrexpExp =
7817fa27ce4SDimitry Andric       ST->hasFractBug()
7827fa27ce4SDimitry Andric           ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
7837fa27ce4SDimitry Andric                                     {Builder.getInt32Ty(), Ty}, Src)
7847fa27ce4SDimitry Andric           : Builder.CreateExtractValue(Frexp, {1});
7857fa27ce4SDimitry Andric   return {FrexpMant, FrexpExp};
7867fa27ce4SDimitry Andric }
7877fa27ce4SDimitry Andric 
7887fa27ce4SDimitry Andric /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
emitRcpIEEE1ULP(IRBuilder<> & Builder,Value * Src,bool IsNegative) const7897fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
7907fa27ce4SDimitry Andric                                                  Value *Src,
7917fa27ce4SDimitry Andric                                                  bool IsNegative) const {
7927fa27ce4SDimitry Andric   // Same as for 1.0, but expand the sign out of the constant.
7937fa27ce4SDimitry Andric   // -1.0 / x -> rcp (fneg x)
7947fa27ce4SDimitry Andric   if (IsNegative)
7957fa27ce4SDimitry Andric     Src = Builder.CreateFNeg(Src);
7967fa27ce4SDimitry Andric 
7977fa27ce4SDimitry Andric   // The rcp instruction doesn't support denormals, so scale the input
7987fa27ce4SDimitry Andric   // out of the denormal range and convert at the end.
7997fa27ce4SDimitry Andric   //
8007fa27ce4SDimitry Andric   // Expand as 2^-n * (1.0 / (x * 2^n))
8017fa27ce4SDimitry Andric 
8027fa27ce4SDimitry Andric   // TODO: Skip scaling if input is known never denormal and the input
8037fa27ce4SDimitry Andric   // range won't underflow to denormal. The hard part is knowing the
8047fa27ce4SDimitry Andric   // result. We need a range check, the result could be denormal for
8057fa27ce4SDimitry Andric   // 0x1p+126 < den <= 0x1p+127.
8067fa27ce4SDimitry Andric   auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
8077fa27ce4SDimitry Andric   Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
8087fa27ce4SDimitry Andric   Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
809b1c73532SDimitry Andric   return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
8107fa27ce4SDimitry Andric }
8117fa27ce4SDimitry Andric 
8127fa27ce4SDimitry Andric /// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
emitFrexpDiv(IRBuilder<> & Builder,Value * LHS,Value * RHS,FastMathFlags FMF) const8137fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
8147fa27ce4SDimitry Andric                                               Value *RHS,
8157fa27ce4SDimitry Andric                                               FastMathFlags FMF) const {
8167fa27ce4SDimitry Andric   // If we have have to work around the fract/frexp bug, we're worse off than
8177fa27ce4SDimitry Andric   // using the fdiv.fast expansion. The full safe expansion is faster if we have
8187fa27ce4SDimitry Andric   // fast FMA.
8197fa27ce4SDimitry Andric   if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
8207fa27ce4SDimitry Andric       (!FMF.noNaNs() || !FMF.noInfs()))
8217fa27ce4SDimitry Andric     return nullptr;
8227fa27ce4SDimitry Andric 
8237fa27ce4SDimitry Andric   // We're scaling the LHS to avoid a denormal input, and scale the denominator
8247fa27ce4SDimitry Andric   // to avoid large values underflowing the result.
8257fa27ce4SDimitry Andric   auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
8267fa27ce4SDimitry Andric 
8277fa27ce4SDimitry Andric   Value *Rcp =
8287fa27ce4SDimitry Andric       Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
8297fa27ce4SDimitry Andric 
8307fa27ce4SDimitry Andric   auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
8317fa27ce4SDimitry Andric   Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
8327fa27ce4SDimitry Andric 
8337fa27ce4SDimitry Andric   // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
8347fa27ce4SDimitry Andric   // result.
8357fa27ce4SDimitry Andric   Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
836b1c73532SDimitry Andric   return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
837b1c73532SDimitry Andric }
838b1c73532SDimitry Andric 
839b1c73532SDimitry Andric /// Emit a sqrt that handles denormals and is accurate to 2ulp.
emitSqrtIEEE2ULP(IRBuilder<> & Builder,Value * Src,FastMathFlags FMF) const840b1c73532SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
841b1c73532SDimitry Andric                                                   Value *Src,
842b1c73532SDimitry Andric                                                   FastMathFlags FMF) const {
843b1c73532SDimitry Andric   Type *Ty = Src->getType();
844b1c73532SDimitry Andric   APFloat SmallestNormal =
845b1c73532SDimitry Andric       APFloat::getSmallestNormalized(Ty->getFltSemantics());
846b1c73532SDimitry Andric   Value *NeedScale =
847b1c73532SDimitry Andric       Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
848b1c73532SDimitry Andric 
849b1c73532SDimitry Andric   ConstantInt *Zero = Builder.getInt32(0);
850b1c73532SDimitry Andric   Value *InputScaleFactor =
851b1c73532SDimitry Andric       Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
852b1c73532SDimitry Andric 
853b1c73532SDimitry Andric   Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
854b1c73532SDimitry Andric 
855b1c73532SDimitry Andric   Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
856b1c73532SDimitry Andric 
857b1c73532SDimitry Andric   Value *OutputScaleFactor =
858b1c73532SDimitry Andric       Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
859b1c73532SDimitry Andric   return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
8607fa27ce4SDimitry Andric }
8617fa27ce4SDimitry Andric 
8627fa27ce4SDimitry Andric /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
emitRsqIEEE1ULP(IRBuilder<> & Builder,Value * Src,bool IsNegative)8637fa27ce4SDimitry Andric static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
8647fa27ce4SDimitry Andric                               bool IsNegative) {
8657fa27ce4SDimitry Andric   // bool need_scale = x < 0x1p-126f;
8667fa27ce4SDimitry Andric   // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
8677fa27ce4SDimitry Andric   // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
8687fa27ce4SDimitry Andric   // rsq(x * input_scale) * output_scale;
8697fa27ce4SDimitry Andric 
8707fa27ce4SDimitry Andric   Type *Ty = Src->getType();
8717fa27ce4SDimitry Andric   APFloat SmallestNormal =
8727fa27ce4SDimitry Andric       APFloat::getSmallestNormalized(Ty->getFltSemantics());
8737fa27ce4SDimitry Andric   Value *NeedScale =
8747fa27ce4SDimitry Andric       Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
8757fa27ce4SDimitry Andric   Constant *One = ConstantFP::get(Ty, 1.0);
8767fa27ce4SDimitry Andric   Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
8777fa27ce4SDimitry Andric   Constant *OutputScale =
8787fa27ce4SDimitry Andric       ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
8797fa27ce4SDimitry Andric 
8807fa27ce4SDimitry Andric   Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
8817fa27ce4SDimitry Andric 
8827fa27ce4SDimitry Andric   Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
8837fa27ce4SDimitry Andric   Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
8847fa27ce4SDimitry Andric   Value *OutputScaleFactor = Builder.CreateSelect(
8857fa27ce4SDimitry Andric       NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
8867fa27ce4SDimitry Andric 
8877fa27ce4SDimitry Andric   return Builder.CreateFMul(Rsq, OutputScaleFactor);
8887fa27ce4SDimitry Andric }
8897fa27ce4SDimitry Andric 
canOptimizeWithRsq(const FPMathOperator * SqrtOp,FastMathFlags DivFMF,FastMathFlags SqrtFMF) const8907fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
8917fa27ce4SDimitry Andric                                                   FastMathFlags DivFMF,
8927fa27ce4SDimitry Andric                                                   FastMathFlags SqrtFMF) const {
8937fa27ce4SDimitry Andric   // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
8947fa27ce4SDimitry Andric   if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
8957fa27ce4SDimitry Andric     return false;
8967fa27ce4SDimitry Andric 
8977fa27ce4SDimitry Andric   // v_rsq_f32 gives 1ulp
8987fa27ce4SDimitry Andric   return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
8997fa27ce4SDimitry Andric          SqrtOp->getFPAccuracy() >= 1.0f;
9007fa27ce4SDimitry Andric }
9017fa27ce4SDimitry Andric 
// Try to replace (+/-1.0) / sqrt(Den) with an rsq intrinsic, using a raw rsq
// when accuracy permits and the denormal-safe scaled expansion otherwise.
// Returns the replacement value, or nullptr if no transform applies.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  assert(Den->getType()->isFloatTy());

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  // Note: IsNegative is set as a side effect of testing for -1.0.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    // With approximate math, unsafe-fp-math, or a provably non-denormal
    // input, the raw rsq instruction is accurate enough to use directly.
    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
        canIgnoreDenormalInput(Den, CtxI)) {
      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
      // -1.0 / sqrt(x) -> fneg(rsq(x))
      return IsNegative ? Builder.CreateFNeg(Result) : Result;
    }

    // Otherwise emit the scaled expansion that tolerates denormal inputs.
    return emitRsqIEEE1ULP(Builder, Den, IsNegative);
  }

  return nullptr;
}
9377fa27ce4SDimitry Andric 
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//               allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
//
// Returns the replacement value, or nullptr if the division should be left
// alone (or handled by a later strategy).
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    // Note: IsNegative is set as a side effect of testing for -1.0.
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    // Denormals matter: use the denormal-safe 1ulp rcp expansion instead of
    // the raw instruction.
    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}
1000cfca06d7SDimitry Andric 
1001cfca06d7SDimitry Andric // optimize with fdiv.fast:
1002cfca06d7SDimitry Andric //
1003cfca06d7SDimitry Andric // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1004cfca06d7SDimitry Andric //
1005cfca06d7SDimitry Andric // 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
1006cfca06d7SDimitry Andric //
1007cfca06d7SDimitry Andric // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
optimizeWithFDivFast(IRBuilder<> & Builder,Value * Num,Value * Den,float ReqdAccuracy) const10087fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
10097fa27ce4SDimitry Andric     IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1010cfca06d7SDimitry Andric   // fdiv.fast can achieve 2.5 ULP accuracy.
1011cfca06d7SDimitry Andric   if (ReqdAccuracy < 2.5f)
1012cfca06d7SDimitry Andric     return nullptr;
1013cfca06d7SDimitry Andric 
1014cfca06d7SDimitry Andric   // Only have fdiv.fast for f32.
10157fa27ce4SDimitry Andric   assert(Den->getType()->isFloatTy());
1016cfca06d7SDimitry Andric 
1017cfca06d7SDimitry Andric   bool NumIsOne = false;
1018cfca06d7SDimitry Andric   if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1019cfca06d7SDimitry Andric     if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1020cfca06d7SDimitry Andric       NumIsOne = true;
1021cfca06d7SDimitry Andric   }
1022cfca06d7SDimitry Andric 
1023cfca06d7SDimitry Andric   // fdiv does not support denormals. But 1.0/x is always fine to use it.
10247fa27ce4SDimitry Andric   //
10257fa27ce4SDimitry Andric   // TODO: This works for any value with a specific known exponent range, don't
10267fa27ce4SDimitry Andric   // just limit to constant 1.
10277fa27ce4SDimitry Andric   if (!HasFP32DenormalFlush && !NumIsOne)
1028cfca06d7SDimitry Andric     return nullptr;
1029cfca06d7SDimitry Andric 
10307fa27ce4SDimitry Andric   return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
10317fa27ce4SDimitry Andric }
10327fa27ce4SDimitry Andric 
visitFDivElement(IRBuilder<> & Builder,Value * Num,Value * Den,FastMathFlags DivFMF,FastMathFlags SqrtFMF,Value * RsqOp,const Instruction * FDivInst,float ReqdDivAccuracy) const10337fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
10347fa27ce4SDimitry Andric     IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
10357fa27ce4SDimitry Andric     FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
10367fa27ce4SDimitry Andric     float ReqdDivAccuracy) const {
10377fa27ce4SDimitry Andric   if (RsqOp) {
10387fa27ce4SDimitry Andric     Value *Rsq =
10397fa27ce4SDimitry Andric         optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
10407fa27ce4SDimitry Andric     if (Rsq)
10417fa27ce4SDimitry Andric       return Rsq;
10427fa27ce4SDimitry Andric   }
10437fa27ce4SDimitry Andric 
10447fa27ce4SDimitry Andric   Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
10457fa27ce4SDimitry Andric   if (Rcp)
10467fa27ce4SDimitry Andric     return Rcp;
10477fa27ce4SDimitry Andric 
10487fa27ce4SDimitry Andric   // In the basic case fdiv_fast has the same instruction count as the frexp div
10497fa27ce4SDimitry Andric   // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
10507fa27ce4SDimitry Andric   // potentially be fused into a user. Also, materialization of the constants
10517fa27ce4SDimitry Andric   // can be reused for multiple instances.
10527fa27ce4SDimitry Andric   Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
10537fa27ce4SDimitry Andric   if (FDivFast)
10547fa27ce4SDimitry Andric     return FDivFast;
10557fa27ce4SDimitry Andric 
10567fa27ce4SDimitry Andric   return emitFrexpDiv(Builder, Num, Den, DivFMF);
1057cfca06d7SDimitry Andric }
1058cfca06d7SDimitry Andric 
// Optimization is performed based on fpmath, fast math flags as well as
// denormals to optimize fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//                 allowed with unsafe-fp-math or afn.
//
//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  // Only handle fdivs whose scalar element type is f32 here.
  Type *Ty = FDiv.getType()->getScalarType();
  if (!Ty->isFloatTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  // Accuracy requested via !fpmath metadata, in ULPs.
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  // If the denominator is a single-use sqrt intrinsic, record its operand so
  // we can consider forming num * rsq(x) instead of num / sqrt(x).
  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  // Build the replacement immediately after the original fdiv.
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  // Split (possibly vector) operands into per-element scalars, process each
  // element independently, then reassemble the result below.
  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);

  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals[I] = NewElt;
  }

  // Rebuild the (possibly vector) value and replace the original instruction,
  // cleaning up anything left dead (e.g. the now-unused sqrt).
  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);

  if (NewVal) {
    FDiv.replaceAllUsesWith(NewVal);
    NewVal->takeName(&FDiv);
    RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
  }

  return true;
}
1166344a3780SDimitry Andric 
hasUnsafeFPMath(const Function & F)1167a7fe922bSDimitry Andric static bool hasUnsafeFPMath(const Function &F) {
1168a7fe922bSDimitry Andric   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
1169344a3780SDimitry Andric   return Attr.getValueAsBool();
1170a7fe922bSDimitry Andric }
1171a7fe922bSDimitry Andric 
getMul64(IRBuilder<> & Builder,Value * LHS,Value * RHS)1172eb11fae6SDimitry Andric static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1173eb11fae6SDimitry Andric                                           Value *LHS, Value *RHS) {
1174eb11fae6SDimitry Andric   Type *I32Ty = Builder.getInt32Ty();
1175eb11fae6SDimitry Andric   Type *I64Ty = Builder.getInt64Ty();
1176b915e9e0SDimitry Andric 
1177eb11fae6SDimitry Andric   Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1178eb11fae6SDimitry Andric   Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1179eb11fae6SDimitry Andric   Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1180eb11fae6SDimitry Andric   Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1181eb11fae6SDimitry Andric   Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1182eb11fae6SDimitry Andric   Hi = Builder.CreateTrunc(Hi, I32Ty);
1183e3b55780SDimitry Andric   return std::pair(Lo, Hi);
1184eb11fae6SDimitry Andric }
1185eb11fae6SDimitry Andric 
getMulHu(IRBuilder<> & Builder,Value * LHS,Value * RHS)1186eb11fae6SDimitry Andric static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1187eb11fae6SDimitry Andric   return getMul64(Builder, LHS, RHS).second;
1188eb11fae6SDimitry Andric }
1189eb11fae6SDimitry Andric 
1190145449b1SDimitry Andric /// Figure out how many bits are really needed for this division. \p AtLeast is
1191cfca06d7SDimitry Andric /// an optimization hint to bypass the second ComputeNumSignBits call if we the
1192cfca06d7SDimitry Andric /// first one is insufficient. Returns -1 on failure.
getDivNumBits(BinaryOperator & I,Value * Num,Value * Den,unsigned AtLeast,bool IsSigned) const11937fa27ce4SDimitry Andric int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
11947fa27ce4SDimitry Andric                                             Value *Den, unsigned AtLeast,
11957fa27ce4SDimitry Andric                                             bool IsSigned) const {
1196cfca06d7SDimitry Andric   const DataLayout &DL = Mod->getDataLayout();
1197cfca06d7SDimitry Andric   unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1198cfca06d7SDimitry Andric   if (LHSSignBits < AtLeast)
1199cfca06d7SDimitry Andric     return -1;
1200cfca06d7SDimitry Andric 
1201cfca06d7SDimitry Andric   unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1202cfca06d7SDimitry Andric   if (RHSSignBits < AtLeast)
1203cfca06d7SDimitry Andric     return -1;
1204cfca06d7SDimitry Andric 
1205cfca06d7SDimitry Andric   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1206cfca06d7SDimitry Andric   unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1207cfca06d7SDimitry Andric   if (IsSigned)
1208cfca06d7SDimitry Andric     ++DivBits;
1209cfca06d7SDimitry Andric   return DivBits;
1210cfca06d7SDimitry Andric }
1211cfca06d7SDimitry Andric 
1212eb11fae6SDimitry Andric // The fractional part of a float is enough to accurately represent up to
1213eb11fae6SDimitry Andric // a 24-bit signed integer.
expandDivRem24(IRBuilder<> & Builder,BinaryOperator & I,Value * Num,Value * Den,bool IsDiv,bool IsSigned) const12147fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
12157fa27ce4SDimitry Andric                                                 BinaryOperator &I, Value *Num,
12167fa27ce4SDimitry Andric                                                 Value *Den, bool IsDiv,
12177fa27ce4SDimitry Andric                                                 bool IsSigned) const {
1218ac9a064cSDimitry Andric   unsigned SSBits = Num->getType()->getScalarSizeInBits();
1219ac9a064cSDimitry Andric   // If Num bits <= 24, assume 0 signbits.
1220ac9a064cSDimitry Andric   unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1221ac9a064cSDimitry Andric   int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1222cfca06d7SDimitry Andric   if (DivBits == -1)
1223eb11fae6SDimitry Andric     return nullptr;
1224cfca06d7SDimitry Andric   return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1225cfca06d7SDimitry Andric }
1226eb11fae6SDimitry Andric 
// Emit the 24-bit expansion of an integer div/rem: estimate the quotient as
// trunc((float)Num * rcp((float)Den)) and add a correction (JQ) selected by
// comparing |fma(-fq, fb, fa)| against |fb|. \p DivBits is the number of bits
// the result actually needs; results narrower than 32 bits are sign- or
// zero-extended in-register at the end (skipped when DivBits is 0).
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  // Work in i32 regardless of the original (narrower) type.
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
                       : Builder.CreateUIToFP(IB,F32Ty);

  // fqm = fa * rcp(fb) -- the raw quotient estimate.
  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  // Use fma when the target lacks mad_mac f32 instructions.
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      // Sign-extend: shift left then arithmetic shift right.
      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      // Zero-extend: mask off the bits above DivBits.
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
1324eb11fae6SDimitry Andric 
1325cfca06d7SDimitry Andric // Try to recognize special cases the DAG will emit special, better expansions
1326cfca06d7SDimitry Andric // than the general expansion we do here.
1327cfca06d7SDimitry Andric 
1328cfca06d7SDimitry Andric // TODO: It would be better to just directly handle those optimizations here.
divHasSpecialOptimization(BinaryOperator & I,Value * Num,Value * Den) const13297fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
13307fa27ce4SDimitry Andric                                                          Value *Num,
13317fa27ce4SDimitry Andric                                                          Value *Den) const {
1332cfca06d7SDimitry Andric   if (Constant *C = dyn_cast<Constant>(Den)) {
1333cfca06d7SDimitry Andric     // Arbitrary constants get a better expansion as long as a wider mulhi is
1334cfca06d7SDimitry Andric     // legal.
1335cfca06d7SDimitry Andric     if (C->getType()->getScalarSizeInBits() <= 32)
1336cfca06d7SDimitry Andric       return true;
1337cfca06d7SDimitry Andric 
1338cfca06d7SDimitry Andric     // TODO: Sdiv check for not exact for some reason.
1339cfca06d7SDimitry Andric 
1340cfca06d7SDimitry Andric     // If there's no wider mulhi, there's only a better expansion for powers of
1341cfca06d7SDimitry Andric     // two.
1342cfca06d7SDimitry Andric     // TODO: Should really know for each vector element.
1343cfca06d7SDimitry Andric     if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1344cfca06d7SDimitry Andric       return true;
1345cfca06d7SDimitry Andric 
1346cfca06d7SDimitry Andric     return false;
1347cfca06d7SDimitry Andric   }
1348cfca06d7SDimitry Andric 
1349cfca06d7SDimitry Andric   if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1350cfca06d7SDimitry Andric     // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1351cfca06d7SDimitry Andric     if (BinOpDen->getOpcode() == Instruction::Shl &&
1352cfca06d7SDimitry Andric         isa<Constant>(BinOpDen->getOperand(0)) &&
1353cfca06d7SDimitry Andric         isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1354cfca06d7SDimitry Andric                                0, AC, &I, DT)) {
1355cfca06d7SDimitry Andric       return true;
1356cfca06d7SDimitry Andric     }
1357cfca06d7SDimitry Andric   }
1358cfca06d7SDimitry Andric 
1359cfca06d7SDimitry Andric   return false;
1360cfca06d7SDimitry Andric }
1361cfca06d7SDimitry Andric 
getSign32(Value * V,IRBuilder<> & Builder,const DataLayout * DL)1362cfca06d7SDimitry Andric static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1363cfca06d7SDimitry Andric   // Check whether the sign can be determined statically.
1364cfca06d7SDimitry Andric   KnownBits Known = computeKnownBits(V, *DL);
1365cfca06d7SDimitry Andric   if (Known.isNegative())
1366cfca06d7SDimitry Andric     return Constant::getAllOnesValue(V->getType());
1367cfca06d7SDimitry Andric   if (Known.isNonNegative())
1368cfca06d7SDimitry Andric     return Constant::getNullValue(V->getType());
1369cfca06d7SDimitry Andric   return Builder.CreateAShr(V, Builder.getInt32(31));
1370cfca06d7SDimitry Andric }
1371cfca06d7SDimitry Andric 
// Expand \p I (udiv/sdiv/urem/srem) on operands \p X, \p Y into an inline
// i32 sequence. Operands of a different scalar width are first converted to
// i32 (and the result converted back to the original type). Returns nullptr
// when the DAG has a better special-case expansion available.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // The float math used below is all intermediate; allow fast flags on it.
  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Normalize the operands to i32.
  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(X, I32Ty);
      Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(X, I32Ty);
      Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
    }
  }

  // Prefer the cheaper 24-bit expansion when the operands are narrow enough.
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  // For signed div/rem, reduce to the unsigned problem on |X| and |Y| and
  // record the sign to apply to the result afterwards.
  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    // abs(V) computed as (V + sign) ^ sign.
    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
  // 0x4F7FFFFE is (4294967296.0 - 512.0) == (2^32 - 2^9) as an f32 bit
  // pattern; see the algorithm comment above.
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    // Reapply the saved sign: negate via (Res ^ sign) - sign.
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
    Res = Builder.CreateSExtOrTrunc(Res, Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(Res, Ty);
  }
  return Res;
}
1496eb11fae6SDimitry Andric 
shrinkDivRem64(IRBuilder<> & Builder,BinaryOperator & I,Value * Num,Value * Den) const14977fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
14987fa27ce4SDimitry Andric                                                 BinaryOperator &I, Value *Num,
14997fa27ce4SDimitry Andric                                                 Value *Den) const {
1500cfca06d7SDimitry Andric   if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1501cfca06d7SDimitry Andric     return nullptr;  // Keep it for later optimization.
1502cfca06d7SDimitry Andric 
1503cfca06d7SDimitry Andric   Instruction::BinaryOps Opc = I.getOpcode();
1504cfca06d7SDimitry Andric 
1505cfca06d7SDimitry Andric   bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1506cfca06d7SDimitry Andric   bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1507cfca06d7SDimitry Andric 
1508cfca06d7SDimitry Andric   int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1509cfca06d7SDimitry Andric   if (NumDivBits == -1)
1510cfca06d7SDimitry Andric     return nullptr;
1511cfca06d7SDimitry Andric 
1512cfca06d7SDimitry Andric   Value *Narrowed = nullptr;
1513cfca06d7SDimitry Andric   if (NumDivBits <= 24) {
1514cfca06d7SDimitry Andric     Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1515cfca06d7SDimitry Andric                                   IsDiv, IsSigned);
1516cfca06d7SDimitry Andric   } else if (NumDivBits <= 32) {
1517cfca06d7SDimitry Andric     Narrowed = expandDivRem32(Builder, I, Num, Den);
1518cfca06d7SDimitry Andric   }
1519cfca06d7SDimitry Andric 
1520cfca06d7SDimitry Andric   if (Narrowed) {
1521cfca06d7SDimitry Andric     return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1522cfca06d7SDimitry Andric                       Builder.CreateZExt(Narrowed, Num->getType());
1523cfca06d7SDimitry Andric   }
1524cfca06d7SDimitry Andric 
1525cfca06d7SDimitry Andric   return nullptr;
1526cfca06d7SDimitry Andric }
1527cfca06d7SDimitry Andric 
expandDivRem64(BinaryOperator & I) const15287fa27ce4SDimitry Andric void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1529cfca06d7SDimitry Andric   Instruction::BinaryOps Opc = I.getOpcode();
1530cfca06d7SDimitry Andric   // Do the general expansion.
1531cfca06d7SDimitry Andric   if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1532cfca06d7SDimitry Andric     expandDivisionUpTo64Bits(&I);
1533cfca06d7SDimitry Andric     return;
1534cfca06d7SDimitry Andric   }
1535cfca06d7SDimitry Andric 
1536cfca06d7SDimitry Andric   if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1537cfca06d7SDimitry Andric     expandRemainderUpTo64Bits(&I);
1538cfca06d7SDimitry Andric     return;
1539cfca06d7SDimitry Andric   }
1540cfca06d7SDimitry Andric 
1541cfca06d7SDimitry Andric   llvm_unreachable("not a division");
1542cfca06d7SDimitry Andric }
1543cfca06d7SDimitry Andric 
/// Visit an integer binary operator: try folding it into a select, promoting
/// uniform narrow ops to i32, the mul24 transform, and finally the custom
/// expansion/narrowing of integer division and remainder.
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  // Uniform ops on types that need promotion (per needsPromotionToI32) are
  // widened to i32 when the subtarget has 16-bit instructions.
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  // 64-bit div/rem instructions that could not be shrunk to <= 32 bits.
  // Their general expansion introduces control flow, so it is deferred until
  // after the replacement value (NewDiv) has been fully built.
  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      // Scalarize vector divisions, handling each element independently.
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          // expandDivRem32 may decline (returns null); keep a scalar copy of
          // the original operation in that case.
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        // Preserve IR flags (e.g. exact) from the original instruction.
        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
1630b915e9e0SDimitry Andric 
/// Widen eligible scalar extending loads from the constant address spaces to
/// 32-bit loads (gated by the WidenLoads option and canWidenScalarExtLoad).
/// \returns true if the load was replaced.
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        // A range with a zero lower bound carries no information once the
        // high bits are unknown; drop the metadata entirely.
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        // Keep the (zero-extended) lower bound and use 0 as the upper bound,
        // i.e. a wrapped !range that makes no claim about the high bits.
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    // Truncate the widened value back to the original width and bitcast to
    // the original type so existing users can be RAUW'd directly.
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}
1676044eb2f6SDimitry Andric 
visitICmpInst(ICmpInst & I)16777fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1678b915e9e0SDimitry Andric   bool Changed = false;
1679b915e9e0SDimitry Andric 
1680b915e9e0SDimitry Andric   if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
16817fa27ce4SDimitry Andric       UA->isUniform(&I))
1682b915e9e0SDimitry Andric     Changed |= promoteUniformOpToI32(I);
1683b915e9e0SDimitry Andric 
1684b915e9e0SDimitry Andric   return Changed;
1685b915e9e0SDimitry Andric }
1686b915e9e0SDimitry Andric 
/// Visit a select: promote uniform narrow selects to i32, or fold a
/// NaN-checked select over a fract-like computation into the fract intrinsic.
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  Value *Cond = I.getCondition();
  Value *TrueVal = I.getTrueValue();
  Value *FalseVal = I.getFalseValue();
  Value *CmpVal;   // Operand being compared against NaN in the condition.
  FCmpInst::Predicate Pred;

  // Uniform narrow selects get widened to i32, mirroring visitBinaryOperator.
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA->isUniform(&I))
      return promoteUniformOpToI32(I);
    return false;
  }

  // Match fract pattern with nan check.
  if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
    return false;

  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  IRBuilder<> Builder(&I);
  // Propagate the select's fast-math flags onto the emitted fract.
  Builder.setFastMathFlags(FPOp->getFastMathFlags());

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  // Two symmetric forms of the same guard against NaN input:
  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    // isnan(x) ? x : fract(x)
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    // !isnan(x) ? fract(x) : x
    Fract = applyFractPat(Builder, CmpVal);
  } else
    return false;

  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  // The old select (and any now-dead feeding computation) is cleaned up here.
  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}
17317fa27ce4SDimitry Andric 
areInSameBB(const Value * A,const Value * B)17327fa27ce4SDimitry Andric static bool areInSameBB(const Value *A, const Value *B) {
17337fa27ce4SDimitry Andric   const auto *IA = dyn_cast<Instruction>(A);
17347fa27ce4SDimitry Andric   const auto *IB = dyn_cast<Instruction>(B);
17357fa27ce4SDimitry Andric   return IA && IB && IA->getParent() == IB->getParent();
17367fa27ce4SDimitry Andric }
17377fa27ce4SDimitry Andric 
17387fa27ce4SDimitry Andric // Helper for breaking large PHIs that returns true when an extractelement on V
17397fa27ce4SDimitry Andric // is likely to be folded away by the DAG combiner.
isInterestingPHIIncomingValue(const Value * V)17407fa27ce4SDimitry Andric static bool isInterestingPHIIncomingValue(const Value *V) {
17417fa27ce4SDimitry Andric   const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
17427fa27ce4SDimitry Andric   if (!FVT)
17437fa27ce4SDimitry Andric     return false;
17447fa27ce4SDimitry Andric 
17457fa27ce4SDimitry Andric   const Value *CurVal = V;
17467fa27ce4SDimitry Andric 
17477fa27ce4SDimitry Andric   // Check for insertelements, keeping track of the elements covered.
17487fa27ce4SDimitry Andric   BitVector EltsCovered(FVT->getNumElements());
17497fa27ce4SDimitry Andric   while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
17507fa27ce4SDimitry Andric     const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
17517fa27ce4SDimitry Andric 
17527fa27ce4SDimitry Andric     // Non constant index/out of bounds index -> folding is unlikely.
17537fa27ce4SDimitry Andric     // The latter is more of a sanity check because canonical IR should just
17547fa27ce4SDimitry Andric     // have replaced those with poison.
1755ac9a064cSDimitry Andric     if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
17567fa27ce4SDimitry Andric       return false;
17577fa27ce4SDimitry Andric 
17587fa27ce4SDimitry Andric     const auto *VecSrc = IE->getOperand(0);
17597fa27ce4SDimitry Andric 
17607fa27ce4SDimitry Andric     // If the vector source is another instruction, it must be in the same basic
17617fa27ce4SDimitry Andric     // block. Otherwise, the DAGCombiner won't see the whole thing and is
17627fa27ce4SDimitry Andric     // unlikely to be able to do anything interesting here.
17637fa27ce4SDimitry Andric     if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
17647fa27ce4SDimitry Andric       return false;
17657fa27ce4SDimitry Andric 
17667fa27ce4SDimitry Andric     CurVal = VecSrc;
1767ac9a064cSDimitry Andric     EltsCovered.set(Idx->getZExtValue());
17687fa27ce4SDimitry Andric 
17697fa27ce4SDimitry Andric     // All elements covered.
17707fa27ce4SDimitry Andric     if (EltsCovered.all())
17717fa27ce4SDimitry Andric       return true;
17727fa27ce4SDimitry Andric   }
17737fa27ce4SDimitry Andric 
17747fa27ce4SDimitry Andric   // We either didn't find a single insertelement, or the insertelement chain
17757fa27ce4SDimitry Andric   // ended before all elements were covered. Check for other interesting values.
17767fa27ce4SDimitry Andric 
17777fa27ce4SDimitry Andric   // Constants are always interesting because we can just constant fold the
17787fa27ce4SDimitry Andric   // extractelements.
17797fa27ce4SDimitry Andric   if (isa<Constant>(CurVal))
17807fa27ce4SDimitry Andric     return true;
17817fa27ce4SDimitry Andric 
17827fa27ce4SDimitry Andric   // shufflevector is likely to be profitable if either operand is a constant,
17837fa27ce4SDimitry Andric   // or if either source is in the same block.
17847fa27ce4SDimitry Andric   // This is because shufflevector is most often lowered as a series of
17857fa27ce4SDimitry Andric   // insert/extract elements anyway.
17867fa27ce4SDimitry Andric   if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
17877fa27ce4SDimitry Andric     return isa<Constant>(SV->getOperand(1)) ||
17887fa27ce4SDimitry Andric            areInSameBB(SV, SV->getOperand(0)) ||
17897fa27ce4SDimitry Andric            areInSameBB(SV, SV->getOperand(1));
17907fa27ce4SDimitry Andric   }
17917fa27ce4SDimitry Andric 
17927fa27ce4SDimitry Andric   return false;
17937fa27ce4SDimitry Andric }
17947fa27ce4SDimitry Andric 
collectPHINodes(const PHINode & I,SmallPtrSet<const PHINode *,8> & SeenPHIs)1795b1c73532SDimitry Andric static void collectPHINodes(const PHINode &I,
1796b1c73532SDimitry Andric                             SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1797b1c73532SDimitry Andric   const auto [It, Inserted] = SeenPHIs.insert(&I);
1798b1c73532SDimitry Andric   if (!Inserted)
1799b1c73532SDimitry Andric     return;
1800b1c73532SDimitry Andric 
1801b1c73532SDimitry Andric   for (const Value *Inc : I.incoming_values()) {
1802b1c73532SDimitry Andric     if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1803b1c73532SDimitry Andric       collectPHINodes(*PhiInc, SeenPHIs);
1804b1c73532SDimitry Andric   }
1805b1c73532SDimitry Andric 
1806b1c73532SDimitry Andric   for (const User *U : I.users()) {
1807b1c73532SDimitry Andric     if (const auto *PhiU = dyn_cast<PHINode>(U))
1808b1c73532SDimitry Andric       collectPHINodes(*PhiU, SeenPHIs);
1809b1c73532SDimitry Andric   }
1810b1c73532SDimitry Andric }
1811b1c73532SDimitry Andric 
/// Decide whether \p I (and its whole chain of related PHIs) should be broken
/// into per-slice PHIs. The decision is computed once per chain and cached.
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //
  //    (unsigned)ceil((K / 3.0) * 2)
  //
  // It's simply written this way to avoid mixing integral/FP arithmetic.
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  // Record the single decision for every PHI in the chain so each one is only
  // analyzed once.
  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}
18697fa27ce4SDimitry Andric 
18707fa27ce4SDimitry Andric /// Helper class for "break large PHIs" (visitPHINode).
18717fa27ce4SDimitry Andric ///
18727fa27ce4SDimitry Andric /// This represents a slice of a PHI's incoming value, which is made up of:
18737fa27ce4SDimitry Andric ///   - The type of the slice (Ty)
18747fa27ce4SDimitry Andric ///   - The index in the incoming value's vector where the slice starts (Idx)
18757fa27ce4SDimitry Andric ///   - The number of elements in the slice (NumElts).
18767fa27ce4SDimitry Andric /// It also keeps track of the NewPHI node inserted for this particular slice.
18777fa27ce4SDimitry Andric ///
18787fa27ce4SDimitry Andric /// Slice examples:
18797fa27ce4SDimitry Andric ///   <4 x i64> -> Split into four i64 slices.
18807fa27ce4SDimitry Andric ///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
18817fa27ce4SDimitry Andric ///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
18827fa27ce4SDimitry Andric ///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
18837fa27ce4SDimitry Andric class VectorSlice {
18847fa27ce4SDimitry Andric public:
VectorSlice(Type * Ty,unsigned Idx,unsigned NumElts)18857fa27ce4SDimitry Andric   VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
18867fa27ce4SDimitry Andric       : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
18877fa27ce4SDimitry Andric 
18887fa27ce4SDimitry Andric   Type *Ty = nullptr;
18897fa27ce4SDimitry Andric   unsigned Idx = 0;
18907fa27ce4SDimitry Andric   unsigned NumElts = 0;
18917fa27ce4SDimitry Andric   PHINode *NewPHI = nullptr;
18927fa27ce4SDimitry Andric 
18937fa27ce4SDimitry Andric   /// Slice \p Inc according to the information contained within this slice.
18947fa27ce4SDimitry Andric   /// This is cached, so if called multiple times for the same \p BB & \p Inc
18957fa27ce4SDimitry Andric   /// pair, it returns the same Sliced value as well.
18967fa27ce4SDimitry Andric   ///
18977fa27ce4SDimitry Andric   /// Note this *intentionally* does not return the same value for, say,
18987fa27ce4SDimitry Andric   /// [%bb.0, %0] & [%bb.1, %0] as:
18997fa27ce4SDimitry Andric   ///   - It could cause issues with dominance (e.g. if bb.1 is seen first, then
19007fa27ce4SDimitry Andric   ///   the value in bb.1 may not be reachable from bb.0 if it's its
19017fa27ce4SDimitry Andric   ///   predecessor.)
19027fa27ce4SDimitry Andric   ///   - We also want to make our extract instructions as local as possible so
19037fa27ce4SDimitry Andric   ///   the DAG has better chances of folding them out. Duplicating them like
19047fa27ce4SDimitry Andric   ///   that is beneficial in that regard.
19057fa27ce4SDimitry Andric   ///
19067fa27ce4SDimitry Andric   /// This is both a minor optimization to avoid creating duplicate
19077fa27ce4SDimitry Andric   /// instructions, but also a requirement for correctness. It is not forbidden
19087fa27ce4SDimitry Andric   /// for a PHI node to have the same [BB, Val] pair multiple times. If we
19097fa27ce4SDimitry Andric   /// returned a new value each time, those previously identical pairs would all
19107fa27ce4SDimitry Andric   /// have different incoming values (from the same block) and it'd cause a "PHI
19117fa27ce4SDimitry Andric   /// node has multiple entries for the same basic block with different incoming
19127fa27ce4SDimitry Andric   /// values!" verifier error.
getSlicedVal(BasicBlock * BB,Value * Inc,StringRef NewValName)19137fa27ce4SDimitry Andric   Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
19147fa27ce4SDimitry Andric     Value *&Res = SlicedVals[{BB, Inc}];
19157fa27ce4SDimitry Andric     if (Res)
19167fa27ce4SDimitry Andric       return Res;
19177fa27ce4SDimitry Andric 
19187fa27ce4SDimitry Andric     IRBuilder<> B(BB->getTerminator());
19197fa27ce4SDimitry Andric     if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
19207fa27ce4SDimitry Andric       B.SetCurrentDebugLocation(IncInst->getDebugLoc());
19217fa27ce4SDimitry Andric 
19227fa27ce4SDimitry Andric     if (NumElts > 1) {
19237fa27ce4SDimitry Andric       SmallVector<int, 4> Mask;
19247fa27ce4SDimitry Andric       for (unsigned K = Idx; K < (Idx + NumElts); ++K)
19257fa27ce4SDimitry Andric         Mask.push_back(K);
19267fa27ce4SDimitry Andric       Res = B.CreateShuffleVector(Inc, Mask, NewValName);
19277fa27ce4SDimitry Andric     } else
19287fa27ce4SDimitry Andric       Res = B.CreateExtractElement(Inc, Idx, NewValName);
19297fa27ce4SDimitry Andric 
19307fa27ce4SDimitry Andric     return Res;
19317fa27ce4SDimitry Andric   }
19327fa27ce4SDimitry Andric 
19337fa27ce4SDimitry Andric private:
19347fa27ce4SDimitry Andric   SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
19357fa27ce4SDimitry Andric };
19367fa27ce4SDimitry Andric 
visitPHINode(PHINode & I)19377fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
19387fa27ce4SDimitry Andric   // Break-up fixed-vector PHIs into smaller pieces.
19397fa27ce4SDimitry Andric   // Default threshold is 32, so it breaks up any vector that's >32 bits into
19407fa27ce4SDimitry Andric   // its elements, or into 32-bit pieces (for 8/16 bit elts).
19417fa27ce4SDimitry Andric   //
19427fa27ce4SDimitry Andric   // This is only helpful for DAGISel because it doesn't handle large PHIs as
19437fa27ce4SDimitry Andric   // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
19447fa27ce4SDimitry Andric   // With large, odd-sized PHIs we may end up needing many `build_vector`
19457fa27ce4SDimitry Andric   // operations with most elements being "undef". This inhibits a lot of
19467fa27ce4SDimitry Andric   // optimization opportunities and can result in unreasonably high register
19477fa27ce4SDimitry Andric   // pressure and the inevitable stack spilling.
1948b1c73532SDimitry Andric   if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
19497fa27ce4SDimitry Andric     return false;
19507fa27ce4SDimitry Andric 
19517fa27ce4SDimitry Andric   FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1952b1c73532SDimitry Andric   if (!FVT || FVT->getNumElements() == 1 ||
1953b1c73532SDimitry Andric       DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
19547fa27ce4SDimitry Andric     return false;
19557fa27ce4SDimitry Andric 
1956b1c73532SDimitry Andric   if (!ForceBreakLargePHIs && !canBreakPHINode(I))
19577fa27ce4SDimitry Andric     return false;
19587fa27ce4SDimitry Andric 
19597fa27ce4SDimitry Andric   std::vector<VectorSlice> Slices;
19607fa27ce4SDimitry Andric 
19617fa27ce4SDimitry Andric   Type *EltTy = FVT->getElementType();
19627fa27ce4SDimitry Andric   {
19637fa27ce4SDimitry Andric     unsigned Idx = 0;
19647fa27ce4SDimitry Andric     // For 8/16 bits type, don't scalarize fully but break it up into as many
19657fa27ce4SDimitry Andric     // 32-bit slices as we can, and scalarize the tail.
19667fa27ce4SDimitry Andric     const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
19677fa27ce4SDimitry Andric     const unsigned NumElts = FVT->getNumElements();
19687fa27ce4SDimitry Andric     if (EltSize == 8 || EltSize == 16) {
19697fa27ce4SDimitry Andric       const unsigned SubVecSize = (32 / EltSize);
19707fa27ce4SDimitry Andric       Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
19717fa27ce4SDimitry Andric       for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
19727fa27ce4SDimitry Andric            Idx += SubVecSize)
19737fa27ce4SDimitry Andric         Slices.emplace_back(SubVecTy, Idx, SubVecSize);
19747fa27ce4SDimitry Andric     }
19757fa27ce4SDimitry Andric 
19767fa27ce4SDimitry Andric     // Scalarize all remaining elements.
19777fa27ce4SDimitry Andric     for (; Idx < NumElts; ++Idx)
19787fa27ce4SDimitry Andric       Slices.emplace_back(EltTy, Idx, 1);
19797fa27ce4SDimitry Andric   }
19807fa27ce4SDimitry Andric 
1981b1c73532SDimitry Andric   assert(Slices.size() > 1);
19827fa27ce4SDimitry Andric 
19837fa27ce4SDimitry Andric   // Create one PHI per vector piece. The "VectorSlice" class takes care of
19847fa27ce4SDimitry Andric   // creating the necessary instruction to extract the relevant slices of each
19857fa27ce4SDimitry Andric   // incoming value.
19867fa27ce4SDimitry Andric   IRBuilder<> B(I.getParent());
19877fa27ce4SDimitry Andric   B.SetCurrentDebugLocation(I.getDebugLoc());
19887fa27ce4SDimitry Andric 
19897fa27ce4SDimitry Andric   unsigned IncNameSuffix = 0;
19907fa27ce4SDimitry Andric   for (VectorSlice &S : Slices) {
19917fa27ce4SDimitry Andric     // We need to reset the build on each iteration, because getSlicedVal may
19927fa27ce4SDimitry Andric     // have inserted something into I's BB.
1993ac9a064cSDimitry Andric     B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
19947fa27ce4SDimitry Andric     S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
19957fa27ce4SDimitry Andric 
19967fa27ce4SDimitry Andric     for (const auto &[Idx, BB] : enumerate(I.blocks())) {
19977fa27ce4SDimitry Andric       S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
19987fa27ce4SDimitry Andric                                            "largephi.extractslice" +
19997fa27ce4SDimitry Andric                                                std::to_string(IncNameSuffix++)),
20007fa27ce4SDimitry Andric                             BB);
20017fa27ce4SDimitry Andric     }
20027fa27ce4SDimitry Andric   }
20037fa27ce4SDimitry Andric 
20047fa27ce4SDimitry Andric   // And replace this PHI with a vector of all the previous PHI values.
20057fa27ce4SDimitry Andric   Value *Vec = PoisonValue::get(FVT);
20067fa27ce4SDimitry Andric   unsigned NameSuffix = 0;
20077fa27ce4SDimitry Andric   for (VectorSlice &S : Slices) {
20087fa27ce4SDimitry Andric     const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
20097fa27ce4SDimitry Andric     if (S.NumElts > 1)
20107fa27ce4SDimitry Andric       Vec =
20117fa27ce4SDimitry Andric           B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
20127fa27ce4SDimitry Andric     else
20137fa27ce4SDimitry Andric       Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
20147fa27ce4SDimitry Andric   }
20157fa27ce4SDimitry Andric 
20167fa27ce4SDimitry Andric   I.replaceAllUsesWith(Vec);
20177fa27ce4SDimitry Andric   I.eraseFromParent();
20187fa27ce4SDimitry Andric   return true;
20197fa27ce4SDimitry Andric }
20207fa27ce4SDimitry Andric 
/// \param V  Value to check
/// \param DL DataLayout
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {
  // Pointer cannot be null if it's a block address, GV or alloca.
  // NOTE: We don't support extern_weak, but if we did, we'd need to check for
  // it as the symbol could be null in such cases.
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
    return true;

  // Check nonnull arguments.
  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
    return true;

  // getUnderlyingObject may have looked through another addrspacecast, although
  // the optimizable situations most likely folded out by now.
  // If so, V lives in a different address space than AS, and the KnownBits
  // reasoning below (phrased in terms of AS's null value) would not apply.
  if (AS != cast<PointerType>(V->getType())->getAddressSpace())
    return false;

  // TODO: Calls that return nonnull?

  // For all other things, use KnownBits.
  // We either use 0 or all bits set to indicate null, so check whether the
  // value can be zero or all ones.
  //
  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
  // address spaces have non-zero null values.
  auto SrcPtrKB = computeKnownBits(V, DL);
  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  // All-ones null: non-null iff the max possible value is below all-ones.
  // Zero null: non-null iff KnownBits proves the value is non-zero.
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
}
2059ac9a064cSDimitry Andric 
visitAddrSpaceCastInst(AddrSpaceCastInst & I)2060ac9a064cSDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2061ac9a064cSDimitry Andric   // Intrinsic doesn't support vectors, also it seems that it's often difficult
2062ac9a064cSDimitry Andric   // to prove that a vector cannot have any nulls in it so it's unclear if it's
2063ac9a064cSDimitry Andric   // worth supporting.
2064ac9a064cSDimitry Andric   if (I.getType()->isVectorTy())
2065ac9a064cSDimitry Andric     return false;
2066ac9a064cSDimitry Andric 
2067ac9a064cSDimitry Andric   // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2068ac9a064cSDimitry Andric   // This is only worthwhile for casts from/to priv/local to flat.
2069ac9a064cSDimitry Andric   const unsigned SrcAS = I.getSrcAddressSpace();
2070ac9a064cSDimitry Andric   const unsigned DstAS = I.getDestAddressSpace();
2071ac9a064cSDimitry Andric 
2072ac9a064cSDimitry Andric   bool CanLower = false;
2073ac9a064cSDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2074ac9a064cSDimitry Andric     CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2075ac9a064cSDimitry Andric                 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2076ac9a064cSDimitry Andric   else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2077ac9a064cSDimitry Andric     CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2078ac9a064cSDimitry Andric                 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2079ac9a064cSDimitry Andric   if (!CanLower)
2080ac9a064cSDimitry Andric     return false;
2081ac9a064cSDimitry Andric 
2082ac9a064cSDimitry Andric   SmallVector<const Value *, 4> WorkList;
2083ac9a064cSDimitry Andric   getUnderlyingObjects(I.getOperand(0), WorkList);
2084ac9a064cSDimitry Andric   if (!all_of(WorkList, [&](const Value *V) {
2085ac9a064cSDimitry Andric         return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
2086ac9a064cSDimitry Andric       }))
2087ac9a064cSDimitry Andric     return false;
2088ac9a064cSDimitry Andric 
2089ac9a064cSDimitry Andric   IRBuilder<> B(&I);
2090ac9a064cSDimitry Andric   auto *Intrin = B.CreateIntrinsic(
2091ac9a064cSDimitry Andric       I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2092ac9a064cSDimitry Andric   I.replaceAllUsesWith(Intrin);
2093ac9a064cSDimitry Andric   I.eraseFromParent();
2094ac9a064cSDimitry Andric   return true;
2095ac9a064cSDimitry Andric }
2096ac9a064cSDimitry Andric 
visitIntrinsicInst(IntrinsicInst & I)20977fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2098b915e9e0SDimitry Andric   switch (I.getIntrinsicID()) {
2099b915e9e0SDimitry Andric   case Intrinsic::bitreverse:
2100b915e9e0SDimitry Andric     return visitBitreverseIntrinsicInst(I);
21017fa27ce4SDimitry Andric   case Intrinsic::minnum:
21027fa27ce4SDimitry Andric     return visitMinNum(I);
2103b1c73532SDimitry Andric   case Intrinsic::sqrt:
2104b1c73532SDimitry Andric     return visitSqrt(I);
2105b915e9e0SDimitry Andric   default:
2106b915e9e0SDimitry Andric     return false;
2107b915e9e0SDimitry Andric   }
2108b915e9e0SDimitry Andric }
2109b915e9e0SDimitry Andric 
visitBitreverseIntrinsicInst(IntrinsicInst & I)21107fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2111b915e9e0SDimitry Andric   bool Changed = false;
2112b915e9e0SDimitry Andric 
2113b915e9e0SDimitry Andric   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
21147fa27ce4SDimitry Andric       UA->isUniform(&I))
2115b915e9e0SDimitry Andric     Changed |= promoteUniformBitreverseToI32(I);
2116b915e9e0SDimitry Andric 
2117b915e9e0SDimitry Andric   return Changed;
2118b915e9e0SDimitry Andric }
2119b915e9e0SDimitry Andric 
/// Match non-nan fract pattern.
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)
///
/// If fract is a useful instruction for the subtarget. Does not account for the
/// nan handling; the instruction has a nan check on the input value.
///
/// \return the fract source value \p x on a match, nullptr otherwise.
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  // No point matching if the subtarget's fract instruction is buggy.
  if (ST->hasFractBug())
    return nullptr;

  if (I.getIntrinsicID() != Intrinsic::minnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  // The second minnum operand must be a (splat) FP constant.
  const APFloat *C;
  if (!match(Arg1, m_APFloat(C)))
    return nullptr;

  // Build 1.0 in the constant's own semantics so the comparison below is
  // exact for any legal float type.
  APFloat One(1.0);
  bool LosesInfo;
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

  // Match nextafter(1.0, -1)
  // i.e. step down to the largest representable value strictly below 1.0.
  One.next(true);
  if (One != *C)
    return nullptr;

  // Finally require the first operand to be x - floor(x) with matching x.
  Value *FloorSrc;
  if (match(Arg0, m_FSub(m_Value(FloorSrc),
                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
    return FloorSrc;
  return nullptr;
}
21587fa27ce4SDimitry Andric 
applyFractPat(IRBuilder<> & Builder,Value * FractArg)21597fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
21607fa27ce4SDimitry Andric                                                Value *FractArg) {
21617fa27ce4SDimitry Andric   SmallVector<Value *, 4> FractVals;
21627fa27ce4SDimitry Andric   extractValues(Builder, FractVals, FractArg);
21637fa27ce4SDimitry Andric 
21647fa27ce4SDimitry Andric   SmallVector<Value *, 4> ResultVals(FractVals.size());
21657fa27ce4SDimitry Andric 
21667fa27ce4SDimitry Andric   Type *Ty = FractArg->getType()->getScalarType();
21677fa27ce4SDimitry Andric   for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
21687fa27ce4SDimitry Andric     ResultVals[I] =
21697fa27ce4SDimitry Andric         Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
21707fa27ce4SDimitry Andric   }
21717fa27ce4SDimitry Andric 
21727fa27ce4SDimitry Andric   return insertValues(Builder, FractArg->getType(), ResultVals);
21737fa27ce4SDimitry Andric }
21747fa27ce4SDimitry Andric 
// Rewrite a matched non-nan fract pattern rooted at this minnum into
// amdgcn.fract calls. Returns true if the IR was changed.
bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  if (!FractArg)
    return false;

  // Match pattern for fract intrinsic in contexts where the nan check has been
  // optimized out (and hope the knowledge the source can't be nan wasn't lost).
  if (!I.hasNoNaNs() &&
      !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
    return false;

  IRBuilder<> Builder(&I);
  // Propagate the call's fast-math flags, adding nnan since we just proved
  // (or were told) the input can't be NaN.
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);

  // Delete the minnum and any now-dead parts of the matched pattern
  // (the fsub/floor feeding it).
  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}
21987fa27ce4SDimitry Andric 
isOneOrNegOne(const Value * Val)2199b1c73532SDimitry Andric static bool isOneOrNegOne(const Value *Val) {
2200b1c73532SDimitry Andric   const APFloat *C;
2201b1c73532SDimitry Andric   return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2202b1c73532SDimitry Andric }
2203b1c73532SDimitry Andric 
// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
// Returns true if the call was expanded and erased.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  // Only f32, or f16 on subtargets without native 16-bit instructions.
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // We're trying to handle the fast-but-not-that-fast case only. The lowering
  // of fast llvm.sqrt will give the raw instruction anyway.
  if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
    return false;

  // Accuracy requested via !fpmath, in ULPs.
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Defer correctly rounded expansion to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  // FIXME: This is an ugly hack for this pass using forward iteration instead
  // of reverse. If it worked like a normal combiner, the rsq would form before
  // we saw a sqrt call.
  // If the sole user is a 1/sqrt(x)-style fdiv that canOptimizeWithRsq can
  // handle, leave the sqrt alone so the rsq can still be formed later.
  auto *FDiv =
      dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&
      canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
      // TODO: We should also handle the arcp case for the fdiv with non-1 value
      isOneOrNegOne(FDiv->getOperand(0)))
    return false;

  Value *SrcVal = Sqrt.getOperand(0);
  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  // Scalarize, expand each element, then rebuild the original type.
  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, SrcVals, SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  Sqrt.eraseFromParent();
  return true;
}
2262b1c73532SDimitry Andric 
doInitialization(Module & M)226301095a5dSDimitry Andric bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
22647fa27ce4SDimitry Andric   Impl.Mod = &M;
22657fa27ce4SDimitry Andric   Impl.DL = &Impl.Mod->getDataLayout();
2266b1c73532SDimitry Andric   Impl.SqrtF32 = nullptr;
2267b1c73532SDimitry Andric   Impl.LdexpF32 = nullptr;
226801095a5dSDimitry Andric   return false;
226901095a5dSDimitry Andric }
227001095a5dSDimitry Andric 
// Legacy pass manager entry point: wire up the analyses the shared
// implementation needs, then run it.
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  // Without a TargetPassConfig we have no target machine to query.
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  Impl.TM = &TM;
  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  // The dominator tree is optional; only use it if already computed.
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  return Impl.run(F);
}
2293a7fe922bSDimitry Andric 
run(Function & F,FunctionAnalysisManager & FAM)22947fa27ce4SDimitry Andric PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
22957fa27ce4SDimitry Andric                                                 FunctionAnalysisManager &FAM) {
22967fa27ce4SDimitry Andric   AMDGPUCodeGenPrepareImpl Impl;
22977fa27ce4SDimitry Andric   Impl.Mod = F.getParent();
22987fa27ce4SDimitry Andric   Impl.DL = &Impl.Mod->getDataLayout();
2299ac9a064cSDimitry Andric   Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
23007fa27ce4SDimitry Andric   Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
23017fa27ce4SDimitry Andric   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
23027fa27ce4SDimitry Andric   Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
23037fa27ce4SDimitry Andric   Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
23047fa27ce4SDimitry Andric   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
23057fa27ce4SDimitry Andric   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2306312c0ed1SDimitry Andric   SIModeRegisterDefaults Mode(F, *Impl.ST);
23077fa27ce4SDimitry Andric   Impl.HasFP32DenormalFlush =
23087fa27ce4SDimitry Andric       Mode.FP32Denormals == DenormalMode::getPreserveSign();
23097fa27ce4SDimitry Andric   PreservedAnalyses PA = PreservedAnalyses::none();
23107fa27ce4SDimitry Andric   if (!Impl.FlowChanged)
23117fa27ce4SDimitry Andric     PA.preserveSet<CFGAnalyses>();
23127fa27ce4SDimitry Andric   return Impl.run(F) ? PA : PreservedAnalyses::all();
231301095a5dSDimitry Andric }
231401095a5dSDimitry Andric 
// Legacy pass-manager registration: declares the pass, its printable name,
// and the analyses it depends on.
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

// Unique address used by the legacy pass manager to identify this pass.
char AMDGPUCodeGenPrepare::ID = 0;
232401095a5dSDimitry Andric 
// Factory for the legacy pass-manager wrapper; caller takes ownership.
FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
2328