101095a5dSDimitry Andric //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
201095a5dSDimitry Andric //
3e6d15924SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e6d15924SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5e6d15924SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
601095a5dSDimitry Andric //
701095a5dSDimitry Andric //===----------------------------------------------------------------------===//
801095a5dSDimitry Andric //
901095a5dSDimitry Andric /// \file
1001095a5dSDimitry Andric /// This pass does misc. AMDGPU optimizations on IR before instruction
1101095a5dSDimitry Andric /// selection.
1201095a5dSDimitry Andric //
1301095a5dSDimitry Andric //===----------------------------------------------------------------------===//
1401095a5dSDimitry Andric
1501095a5dSDimitry Andric #include "AMDGPU.h"
16a7fe922bSDimitry Andric #include "AMDGPUTargetMachine.h"
177fa27ce4SDimitry Andric #include "SIModeRegisterDefaults.h"
18eb11fae6SDimitry Andric #include "llvm/Analysis/AssumptionCache.h"
19cfca06d7SDimitry Andric #include "llvm/Analysis/ConstantFolding.h"
207fa27ce4SDimitry Andric #include "llvm/Analysis/TargetLibraryInfo.h"
217fa27ce4SDimitry Andric #include "llvm/Analysis/UniformityAnalysis.h"
22eb11fae6SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
23b5630dbaSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
24cfca06d7SDimitry Andric #include "llvm/IR/Dominators.h"
257fa27ce4SDimitry Andric #include "llvm/IR/IRBuilder.h"
267ab83427SDimitry Andric #include "llvm/IR/InstVisitor.h"
27b60736ecSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
287fa27ce4SDimitry Andric #include "llvm/IR/PatternMatch.h"
29706b4fc4SDimitry Andric #include "llvm/InitializePasses.h"
3071d5a254SDimitry Andric #include "llvm/Pass.h"
31b60736ecSDimitry Andric #include "llvm/Support/KnownBits.h"
32cfca06d7SDimitry Andric #include "llvm/Transforms/Utils/IntegerDivision.h"
337fa27ce4SDimitry Andric #include "llvm/Transforms/Utils/Local.h"
3401095a5dSDimitry Andric
3501095a5dSDimitry Andric #define DEBUG_TYPE "amdgpu-codegenprepare"
3601095a5dSDimitry Andric
3701095a5dSDimitry Andric using namespace llvm;
387fa27ce4SDimitry Andric using namespace llvm::PatternMatch;
3901095a5dSDimitry Andric
4001095a5dSDimitry Andric namespace {
4101095a5dSDimitry Andric
// Widen sub-dword constant-address-space loads up to 32 bits so a scalar
// load can be used; off by default (see canWidenScalarExtLoad).
static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Promote uniform sub-32-bit integer operations to 32-bit equivalents
// (see needsPromotionToI32 / promoteUniformOpToI32); on by default.
static cl::opt<bool> Widen16BitOps(
  "amdgpu-codegenprepare-widen-16-bit-ops",
  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Split large PHI nodes into smaller pieces before DAG instruction selection
// (see visitPHINode / canBreakPHINode).
static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

// Testing knob: overrides the profitability heuristic for breaking PHIs.
static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

// PHIs of types smaller than this (in bits) are never broken up.
static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

// Replace mul with llvm.amdgcn.mul.u24/.s24 when operands are narrow enough
// (see replaceMulWithMul24); on by default.
static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
  "amdgpu-codegenprepare-disable-fdiv-expansion",
  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));
977fa27ce4SDimitry Andric
/// Shared implementation of the AMDGPU IR pre-isel cleanups. Driven by
/// run(), which dispatches through InstVisitor to the visit* methods below.
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  // Subtarget and analysis context, populated by the pass driver before
  // run() is called. All pointers are non-owning.
  const GCNSubtarget *ST = nullptr;
  const AMDGPUTargetMachine *TM = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  UniformityInfo *UA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  // True when the FP mode flushes f32 denormals; lets denormal handling be
  // skipped (see canIgnoreDenormalInput).
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;
  // Lazily created intrinsic declarations; mutable so the const getters
  // below can cache them.
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

  // Memoized results of canBreakPHINode queries.
  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  /// \returns the llvm.amdgcn.sqrt.f32 declaration, creating and caching it
  /// on first use.
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = Mod->getContext();
    SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
                                        {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  /// \returns the llvm.ldexp.f32.i32 declaration, creating and caching it on
  /// first use.
  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = Mod->getContext();
    LdexpF32 = Intrinsic::getDeclaration(
        Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);

  /// \returns \p T's base element bit width.
  // NOTE(review): the original comment here also described copying
  // exact/nsw/nuw flags between binary operations, which does not match this
  // declaration; it appears to be left over from a removed helper.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
                                     DT);
  }

  /// \returns true when a denormal input \p V needs no special handling:
  /// either f32 denormals are flushed by the FP mode, or \p V is known to
  /// never be subnormal.
  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  // NOTE(review): the rest of this comment was truncated in the source:
  // "SelectionDAG has an issue where an and asserting the bits are known".
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// \returns true if the scalar load \p I may be widened to 32 bits: a
  /// simple, uniform load of fewer than 32 bits with at least dword
  /// alignment.
  // NOTE(review): the original comment described the widening transform
  // itself ("widen ... to a full 32-bits and then truncate the input to
  // allow a scalar load instead of a vector load"); that transform is done
  // by the caller — this function is only the profitability/legality check.
  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *matchFractPat(IntrinsicInst &I);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
                          FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;

public:
  bool visitFDiv(BinaryOperator &I);

  // InstVisitor fallback: instructions with no dedicated handler are left
  // untouched.
  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
  bool visitMinNum(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);

  /// Visit every instruction in \p F; \returns true if anything changed.
  bool run(Function &F);
};
32201095a5dSDimitry Andric
/// Legacy pass-manager wrapper: gathers the required analyses and delegates
/// the actual work to AMDGPUCodeGenPrepareImpl.
class AMDGPUCodeGenPrepare : public FunctionPass {
private:
  AMDGPUCodeGenPrepareImpl Impl;

public:
  static char ID; // Pass identification.
  AMDGPUCodeGenPrepare() : FunctionPass(ID) {
    initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override;
  bool doInitialization(Module &M) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
34501095a5dSDimitry Andric
34671d5a254SDimitry Andric } // end anonymous namespace
347b915e9e0SDimitry Andric
// Visit every instruction of \p F, applying the InstVisitor handlers.
// Iteration uses pre-captured "next" iterators because handlers may erase the
// current instruction, split blocks, or otherwise change control flow.
// \returns true if any instruction was changed.
bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    // Capture the successor block before visiting, in case BB is split.
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         I = Next) {
      // Capture the next instruction before the visitor can erase *I.
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        // A handler may have moved the following instruction into a newly
        // created block; resume iteration there and refresh the cached end
        // iterators, which may have been invalidated.
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }
  return MadeChange;
}
3767fa27ce4SDimitry Andric
getBaseElementBitWidth(const Type * T) const3777fa27ce4SDimitry Andric unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
378b915e9e0SDimitry Andric assert(needsPromotionToI32(T) && "T does not need promotion to i32");
379b915e9e0SDimitry Andric
380b915e9e0SDimitry Andric if (T->isIntegerTy())
381b915e9e0SDimitry Andric return T->getIntegerBitWidth();
382b915e9e0SDimitry Andric return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
383b915e9e0SDimitry Andric }
384b915e9e0SDimitry Andric
getI32Ty(IRBuilder<> & B,const Type * T) const3857fa27ce4SDimitry Andric Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
386b915e9e0SDimitry Andric assert(needsPromotionToI32(T) && "T does not need promotion to i32");
387b915e9e0SDimitry Andric
388b915e9e0SDimitry Andric if (T->isIntegerTy())
389b915e9e0SDimitry Andric return B.getInt32Ty();
390cfca06d7SDimitry Andric return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
391b915e9e0SDimitry Andric }
392b915e9e0SDimitry Andric
isSigned(const BinaryOperator & I) const3937fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
394b915e9e0SDimitry Andric return I.getOpcode() == Instruction::AShr ||
395b915e9e0SDimitry Andric I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
396b915e9e0SDimitry Andric }
397b915e9e0SDimitry Andric
isSigned(const SelectInst & I) const3987fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
399b915e9e0SDimitry Andric return isa<ICmpInst>(I.getOperand(0)) ?
400b915e9e0SDimitry Andric cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
401b915e9e0SDimitry Andric }
402b915e9e0SDimitry Andric
needsPromotionToI32(const Type * T) const4037fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
404b60736ecSDimitry Andric if (!Widen16BitOps)
405b60736ecSDimitry Andric return false;
406b60736ecSDimitry Andric
40771d5a254SDimitry Andric const IntegerType *IntTy = dyn_cast<IntegerType>(T);
40871d5a254SDimitry Andric if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
409b915e9e0SDimitry Andric return true;
41071d5a254SDimitry Andric
41171d5a254SDimitry Andric if (const VectorType *VT = dyn_cast<VectorType>(T)) {
41271d5a254SDimitry Andric // TODO: The set of packed operations is more limited, so may want to
41371d5a254SDimitry Andric // promote some anyway.
41471d5a254SDimitry Andric if (ST->hasVOP3PInsts())
415b915e9e0SDimitry Andric return false;
41671d5a254SDimitry Andric
41771d5a254SDimitry Andric return needsPromotionToI32(VT->getElementType());
41871d5a254SDimitry Andric }
41971d5a254SDimitry Andric
42071d5a254SDimitry Andric return false;
42171d5a254SDimitry Andric }
42271d5a254SDimitry Andric
isLegalFloatingTy(const Type * Ty) const4237fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
4247fa27ce4SDimitry Andric return Ty->isFloatTy() || Ty->isDoubleTy() ||
4257fa27ce4SDimitry Andric (Ty->isHalfTy() && ST->has16BitInsts());
4267fa27ce4SDimitry Andric }
4277fa27ce4SDimitry Andric
42871d5a254SDimitry Andric // Return true if the op promoted to i32 should have nsw set.
promotedOpIsNSW(const Instruction & I)42971d5a254SDimitry Andric static bool promotedOpIsNSW(const Instruction &I) {
43071d5a254SDimitry Andric switch (I.getOpcode()) {
43171d5a254SDimitry Andric case Instruction::Shl:
43271d5a254SDimitry Andric case Instruction::Add:
43371d5a254SDimitry Andric case Instruction::Sub:
43471d5a254SDimitry Andric return true;
43571d5a254SDimitry Andric case Instruction::Mul:
43671d5a254SDimitry Andric return I.hasNoUnsignedWrap();
43771d5a254SDimitry Andric default:
43871d5a254SDimitry Andric return false;
43971d5a254SDimitry Andric }
44071d5a254SDimitry Andric }
44171d5a254SDimitry Andric
44271d5a254SDimitry Andric // Return true if the op promoted to i32 should have nuw set.
promotedOpIsNUW(const Instruction & I)44371d5a254SDimitry Andric static bool promotedOpIsNUW(const Instruction &I) {
44471d5a254SDimitry Andric switch (I.getOpcode()) {
44571d5a254SDimitry Andric case Instruction::Shl:
44671d5a254SDimitry Andric case Instruction::Add:
44771d5a254SDimitry Andric case Instruction::Mul:
44871d5a254SDimitry Andric return true;
44971d5a254SDimitry Andric case Instruction::Sub:
45071d5a254SDimitry Andric return I.hasNoUnsignedWrap();
45171d5a254SDimitry Andric default:
45271d5a254SDimitry Andric return false;
45371d5a254SDimitry Andric }
454b915e9e0SDimitry Andric }
455b915e9e0SDimitry Andric
canWidenScalarExtLoad(LoadInst & I) const4567fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
457044eb2f6SDimitry Andric Type *Ty = I.getType();
458044eb2f6SDimitry Andric const DataLayout &DL = Mod->getDataLayout();
459044eb2f6SDimitry Andric int TySize = DL.getTypeSizeInBits(Ty);
460cfca06d7SDimitry Andric Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
461044eb2f6SDimitry Andric
4627fa27ce4SDimitry Andric return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
463044eb2f6SDimitry Andric }
464044eb2f6SDimitry Andric
promoteUniformOpToI32(BinaryOperator & I) const4657fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
466b915e9e0SDimitry Andric assert(needsPromotionToI32(I.getType()) &&
467b915e9e0SDimitry Andric "I does not need promotion to i32");
468b915e9e0SDimitry Andric
469b915e9e0SDimitry Andric if (I.getOpcode() == Instruction::SDiv ||
470eb11fae6SDimitry Andric I.getOpcode() == Instruction::UDiv ||
471eb11fae6SDimitry Andric I.getOpcode() == Instruction::SRem ||
472eb11fae6SDimitry Andric I.getOpcode() == Instruction::URem)
473b915e9e0SDimitry Andric return false;
474b915e9e0SDimitry Andric
475b915e9e0SDimitry Andric IRBuilder<> Builder(&I);
476b915e9e0SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc());
477b915e9e0SDimitry Andric
478b915e9e0SDimitry Andric Type *I32Ty = getI32Ty(Builder, I.getType());
479b915e9e0SDimitry Andric Value *ExtOp0 = nullptr;
480b915e9e0SDimitry Andric Value *ExtOp1 = nullptr;
481b915e9e0SDimitry Andric Value *ExtRes = nullptr;
482b915e9e0SDimitry Andric Value *TruncRes = nullptr;
483b915e9e0SDimitry Andric
484b915e9e0SDimitry Andric if (isSigned(I)) {
485b915e9e0SDimitry Andric ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
486b915e9e0SDimitry Andric ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
487b915e9e0SDimitry Andric } else {
488b915e9e0SDimitry Andric ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
489b915e9e0SDimitry Andric ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
490b915e9e0SDimitry Andric }
49171d5a254SDimitry Andric
49271d5a254SDimitry Andric ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
49371d5a254SDimitry Andric if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
49471d5a254SDimitry Andric if (promotedOpIsNSW(cast<Instruction>(I)))
49571d5a254SDimitry Andric Inst->setHasNoSignedWrap();
49671d5a254SDimitry Andric
49771d5a254SDimitry Andric if (promotedOpIsNUW(cast<Instruction>(I)))
49871d5a254SDimitry Andric Inst->setHasNoUnsignedWrap();
49971d5a254SDimitry Andric
50071d5a254SDimitry Andric if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
50171d5a254SDimitry Andric Inst->setIsExact(ExactOp->isExact());
50271d5a254SDimitry Andric }
50371d5a254SDimitry Andric
504b915e9e0SDimitry Andric TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
505b915e9e0SDimitry Andric
506b915e9e0SDimitry Andric I.replaceAllUsesWith(TruncRes);
507b915e9e0SDimitry Andric I.eraseFromParent();
508b915e9e0SDimitry Andric
509b915e9e0SDimitry Andric return true;
510b915e9e0SDimitry Andric }
511b915e9e0SDimitry Andric
promoteUniformOpToI32(ICmpInst & I) const5127fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
513b915e9e0SDimitry Andric assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
514b915e9e0SDimitry Andric "I does not need promotion to i32");
515b915e9e0SDimitry Andric
516b915e9e0SDimitry Andric IRBuilder<> Builder(&I);
517b915e9e0SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc());
518b915e9e0SDimitry Andric
519b915e9e0SDimitry Andric Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
520b915e9e0SDimitry Andric Value *ExtOp0 = nullptr;
521b915e9e0SDimitry Andric Value *ExtOp1 = nullptr;
522b915e9e0SDimitry Andric Value *NewICmp = nullptr;
523b915e9e0SDimitry Andric
524b915e9e0SDimitry Andric if (I.isSigned()) {
525b915e9e0SDimitry Andric ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
526b915e9e0SDimitry Andric ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
527b915e9e0SDimitry Andric } else {
528b915e9e0SDimitry Andric ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
529b915e9e0SDimitry Andric ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
530b915e9e0SDimitry Andric }
531b915e9e0SDimitry Andric NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
532b915e9e0SDimitry Andric
533b915e9e0SDimitry Andric I.replaceAllUsesWith(NewICmp);
534b915e9e0SDimitry Andric I.eraseFromParent();
535b915e9e0SDimitry Andric
536b915e9e0SDimitry Andric return true;
537b915e9e0SDimitry Andric }
538b915e9e0SDimitry Andric
promoteUniformOpToI32(SelectInst & I) const5397fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
540b915e9e0SDimitry Andric assert(needsPromotionToI32(I.getType()) &&
541b915e9e0SDimitry Andric "I does not need promotion to i32");
542b915e9e0SDimitry Andric
543b915e9e0SDimitry Andric IRBuilder<> Builder(&I);
544b915e9e0SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc());
545b915e9e0SDimitry Andric
546b915e9e0SDimitry Andric Type *I32Ty = getI32Ty(Builder, I.getType());
547b915e9e0SDimitry Andric Value *ExtOp1 = nullptr;
548b915e9e0SDimitry Andric Value *ExtOp2 = nullptr;
549b915e9e0SDimitry Andric Value *ExtRes = nullptr;
550b915e9e0SDimitry Andric Value *TruncRes = nullptr;
551b915e9e0SDimitry Andric
552b915e9e0SDimitry Andric if (isSigned(I)) {
553b915e9e0SDimitry Andric ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
554b915e9e0SDimitry Andric ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
555b915e9e0SDimitry Andric } else {
556b915e9e0SDimitry Andric ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
557b915e9e0SDimitry Andric ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
558b915e9e0SDimitry Andric }
559b915e9e0SDimitry Andric ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
560b915e9e0SDimitry Andric TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
561b915e9e0SDimitry Andric
562b915e9e0SDimitry Andric I.replaceAllUsesWith(TruncRes);
563b915e9e0SDimitry Andric I.eraseFromParent();
564b915e9e0SDimitry Andric
565b915e9e0SDimitry Andric return true;
566b915e9e0SDimitry Andric }
567b915e9e0SDimitry Andric
promoteUniformBitreverseToI32(IntrinsicInst & I) const5687fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
569b915e9e0SDimitry Andric IntrinsicInst &I) const {
570b915e9e0SDimitry Andric assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
571b915e9e0SDimitry Andric "I must be bitreverse intrinsic");
572b915e9e0SDimitry Andric assert(needsPromotionToI32(I.getType()) &&
573b915e9e0SDimitry Andric "I does not need promotion to i32");
574b915e9e0SDimitry Andric
575b915e9e0SDimitry Andric IRBuilder<> Builder(&I);
576b915e9e0SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc());
577b915e9e0SDimitry Andric
578b915e9e0SDimitry Andric Type *I32Ty = getI32Ty(Builder, I.getType());
579b915e9e0SDimitry Andric Function *I32 =
580b915e9e0SDimitry Andric Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
581b915e9e0SDimitry Andric Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
582b915e9e0SDimitry Andric Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
583b915e9e0SDimitry Andric Value *LShrOp =
584b915e9e0SDimitry Andric Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
585b915e9e0SDimitry Andric Value *TruncRes =
586b915e9e0SDimitry Andric Builder.CreateTrunc(LShrOp, I.getType());
587b915e9e0SDimitry Andric
588b915e9e0SDimitry Andric I.replaceAllUsesWith(TruncRes);
589b915e9e0SDimitry Andric I.eraseFromParent();
590b915e9e0SDimitry Andric
591b915e9e0SDimitry Andric return true;
592b915e9e0SDimitry Andric }
593b915e9e0SDimitry Andric
numBitsUnsigned(Value * Op) const5947fa27ce4SDimitry Andric unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
5956f8fc217SDimitry Andric return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
596e6d15924SDimitry Andric }
597e6d15924SDimitry Andric
// Minimum number of bits needed to represent Op as a signed value.
unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}
601e6d15924SDimitry Andric
extractValues(IRBuilder<> & Builder,SmallVectorImpl<Value * > & Values,Value * V)602e6d15924SDimitry Andric static void extractValues(IRBuilder<> &Builder,
603e6d15924SDimitry Andric SmallVectorImpl<Value *> &Values, Value *V) {
604cfca06d7SDimitry Andric auto *VT = dyn_cast<FixedVectorType>(V->getType());
605e6d15924SDimitry Andric if (!VT) {
606e6d15924SDimitry Andric Values.push_back(V);
607e6d15924SDimitry Andric return;
608e6d15924SDimitry Andric }
609e6d15924SDimitry Andric
610e6d15924SDimitry Andric for (int I = 0, E = VT->getNumElements(); I != E; ++I)
611e6d15924SDimitry Andric Values.push_back(Builder.CreateExtractElement(V, I));
612e6d15924SDimitry Andric }
613e6d15924SDimitry Andric
insertValues(IRBuilder<> & Builder,Type * Ty,SmallVectorImpl<Value * > & Values)614e6d15924SDimitry Andric static Value *insertValues(IRBuilder<> &Builder,
615e6d15924SDimitry Andric Type *Ty,
616e6d15924SDimitry Andric SmallVectorImpl<Value *> &Values) {
617e3b55780SDimitry Andric if (!Ty->isVectorTy()) {
618e3b55780SDimitry Andric assert(Values.size() == 1);
619e6d15924SDimitry Andric return Values[0];
620e3b55780SDimitry Andric }
621e6d15924SDimitry Andric
622e3b55780SDimitry Andric Value *NewVal = PoisonValue::get(Ty);
623e6d15924SDimitry Andric for (int I = 0, E = Values.size(); I != E; ++I)
624e6d15924SDimitry Andric NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
625e6d15924SDimitry Andric
626e6d15924SDimitry Andric return NewVal;
627e6d15924SDimitry Andric }
628e6d15924SDimitry Andric
// Replace a divergent integer multiply whose operands are known to fit in 24
// bits with the amdgcn mul24 intrinsics, which select to the cheaper
// v_mul_{i,u}24 instructions. Vector multiplies are scalarized per element.
// Returns true if the instruction was replaced.
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  // 16-bit multiplies are already cheap when native 16-bit instructions exist.
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  // NOTE: LHSBits/RHSBits are assigned as side effects of the short-circuit
  // condition evaluation below; the unsigned form is preferred when both
  // apply.
  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    IsSigned = false;

  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    IsSigned = true;

  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  // For multiplies wider than 32 bits, use the i64-result form of the
  // intrinsic so the low 48 product bits are preserved.
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    // Intrinsic operands are always i32: extend or truncate each element,
    // emit the mul24, then convert the result back to the element type.
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);
    ResultVals.push_back(Result);
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}
691e6d15924SDimitry Andric
692cfca06d7SDimitry Andric // Find a select instruction, which may have been casted. This is mostly to deal
693cfca06d7SDimitry Andric // with cases where i16 selects were promoted here to i32.
findSelectThroughCast(Value * V,CastInst * & Cast)694cfca06d7SDimitry Andric static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
695cfca06d7SDimitry Andric Cast = nullptr;
696cfca06d7SDimitry Andric if (SelectInst *Sel = dyn_cast<SelectInst>(V))
697cfca06d7SDimitry Andric return Sel;
698eb11fae6SDimitry Andric
699cfca06d7SDimitry Andric if ((Cast = dyn_cast<CastInst>(V))) {
700cfca06d7SDimitry Andric if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
701cfca06d7SDimitry Andric return Sel;
702a7fe922bSDimitry Andric }
703a7fe922bSDimitry Andric
704cfca06d7SDimitry Andric return nullptr;
705cfca06d7SDimitry Andric }
706a7fe922bSDimitry Andric
// Fold (binop (select c, t, f), C) -> (select c, (binop t, C), (binop f, C))
// when both arms constant-fold, eliminating the binary operator (and any
// intervening cast) entirely. Returns true if BO was replaced.
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  // Both select arms and the other binop operand must be constants for the
  // fold to eliminate the binop.
  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    // Fold the intervening cast into the arm constants first.
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  // SelOpNo records which binop operand held the select so the constant-fold
  // uses the original operand order (matters for non-commutative ops).
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  // Carry the FP math flags of the original operation over to the new select.
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}
767cfca06d7SDimitry Andric
7687fa27ce4SDimitry Andric std::pair<Value *, Value *>
getFrexpResults(IRBuilder<> & Builder,Value * Src) const7697fa27ce4SDimitry Andric AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
7707fa27ce4SDimitry Andric Value *Src) const {
7717fa27ce4SDimitry Andric Type *Ty = Src->getType();
7727fa27ce4SDimitry Andric Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
7737fa27ce4SDimitry Andric {Ty, Builder.getInt32Ty()}, Src);
7747fa27ce4SDimitry Andric Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
7757fa27ce4SDimitry Andric
7767fa27ce4SDimitry Andric // Bypass the bug workaround for the exponent result since it doesn't matter.
7777fa27ce4SDimitry Andric // TODO: Does the bug workaround even really need to consider the exponent
7787fa27ce4SDimitry Andric // result? It's unspecified by the spec.
7797fa27ce4SDimitry Andric
7807fa27ce4SDimitry Andric Value *FrexpExp =
7817fa27ce4SDimitry Andric ST->hasFractBug()
7827fa27ce4SDimitry Andric ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
7837fa27ce4SDimitry Andric {Builder.getInt32Ty(), Ty}, Src)
7847fa27ce4SDimitry Andric : Builder.CreateExtractValue(Frexp, {1});
7857fa27ce4SDimitry Andric return {FrexpMant, FrexpExp};
7867fa27ce4SDimitry Andric }
7877fa27ce4SDimitry Andric
7887fa27ce4SDimitry Andric /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
emitRcpIEEE1ULP(IRBuilder<> & Builder,Value * Src,bool IsNegative) const7897fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
7907fa27ce4SDimitry Andric Value *Src,
7917fa27ce4SDimitry Andric bool IsNegative) const {
7927fa27ce4SDimitry Andric // Same as for 1.0, but expand the sign out of the constant.
7937fa27ce4SDimitry Andric // -1.0 / x -> rcp (fneg x)
7947fa27ce4SDimitry Andric if (IsNegative)
7957fa27ce4SDimitry Andric Src = Builder.CreateFNeg(Src);
7967fa27ce4SDimitry Andric
7977fa27ce4SDimitry Andric // The rcp instruction doesn't support denormals, so scale the input
7987fa27ce4SDimitry Andric // out of the denormal range and convert at the end.
7997fa27ce4SDimitry Andric //
8007fa27ce4SDimitry Andric // Expand as 2^-n * (1.0 / (x * 2^n))
8017fa27ce4SDimitry Andric
8027fa27ce4SDimitry Andric // TODO: Skip scaling if input is known never denormal and the input
8037fa27ce4SDimitry Andric // range won't underflow to denormal. The hard part is knowing the
8047fa27ce4SDimitry Andric // result. We need a range check, the result could be denormal for
8057fa27ce4SDimitry Andric // 0x1p+126 < den <= 0x1p+127.
8067fa27ce4SDimitry Andric auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
8077fa27ce4SDimitry Andric Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
8087fa27ce4SDimitry Andric Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
809b1c73532SDimitry Andric return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
8107fa27ce4SDimitry Andric }
8117fa27ce4SDimitry Andric
8127fa27ce4SDimitry Andric /// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
emitFrexpDiv(IRBuilder<> & Builder,Value * LHS,Value * RHS,FastMathFlags FMF) const8137fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
8147fa27ce4SDimitry Andric Value *RHS,
8157fa27ce4SDimitry Andric FastMathFlags FMF) const {
8167fa27ce4SDimitry Andric // If we have have to work around the fract/frexp bug, we're worse off than
8177fa27ce4SDimitry Andric // using the fdiv.fast expansion. The full safe expansion is faster if we have
8187fa27ce4SDimitry Andric // fast FMA.
8197fa27ce4SDimitry Andric if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
8207fa27ce4SDimitry Andric (!FMF.noNaNs() || !FMF.noInfs()))
8217fa27ce4SDimitry Andric return nullptr;
8227fa27ce4SDimitry Andric
8237fa27ce4SDimitry Andric // We're scaling the LHS to avoid a denormal input, and scale the denominator
8247fa27ce4SDimitry Andric // to avoid large values underflowing the result.
8257fa27ce4SDimitry Andric auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
8267fa27ce4SDimitry Andric
8277fa27ce4SDimitry Andric Value *Rcp =
8287fa27ce4SDimitry Andric Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
8297fa27ce4SDimitry Andric
8307fa27ce4SDimitry Andric auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
8317fa27ce4SDimitry Andric Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
8327fa27ce4SDimitry Andric
8337fa27ce4SDimitry Andric // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
8347fa27ce4SDimitry Andric // result.
8357fa27ce4SDimitry Andric Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
836b1c73532SDimitry Andric return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
837b1c73532SDimitry Andric }
838b1c73532SDimitry Andric
839b1c73532SDimitry Andric /// Emit a sqrt that handles denormals and is accurate to 2ulp.
emitSqrtIEEE2ULP(IRBuilder<> & Builder,Value * Src,FastMathFlags FMF) const840b1c73532SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
841b1c73532SDimitry Andric Value *Src,
842b1c73532SDimitry Andric FastMathFlags FMF) const {
843b1c73532SDimitry Andric Type *Ty = Src->getType();
844b1c73532SDimitry Andric APFloat SmallestNormal =
845b1c73532SDimitry Andric APFloat::getSmallestNormalized(Ty->getFltSemantics());
846b1c73532SDimitry Andric Value *NeedScale =
847b1c73532SDimitry Andric Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
848b1c73532SDimitry Andric
849b1c73532SDimitry Andric ConstantInt *Zero = Builder.getInt32(0);
850b1c73532SDimitry Andric Value *InputScaleFactor =
851b1c73532SDimitry Andric Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
852b1c73532SDimitry Andric
853b1c73532SDimitry Andric Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
854b1c73532SDimitry Andric
855b1c73532SDimitry Andric Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
856b1c73532SDimitry Andric
857b1c73532SDimitry Andric Value *OutputScaleFactor =
858b1c73532SDimitry Andric Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
859b1c73532SDimitry Andric return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
8607fa27ce4SDimitry Andric }
8617fa27ce4SDimitry Andric
8627fa27ce4SDimitry Andric /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
emitRsqIEEE1ULP(IRBuilder<> & Builder,Value * Src,bool IsNegative)8637fa27ce4SDimitry Andric static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
8647fa27ce4SDimitry Andric bool IsNegative) {
8657fa27ce4SDimitry Andric // bool need_scale = x < 0x1p-126f;
8667fa27ce4SDimitry Andric // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
8677fa27ce4SDimitry Andric // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
8687fa27ce4SDimitry Andric // rsq(x * input_scale) * output_scale;
8697fa27ce4SDimitry Andric
8707fa27ce4SDimitry Andric Type *Ty = Src->getType();
8717fa27ce4SDimitry Andric APFloat SmallestNormal =
8727fa27ce4SDimitry Andric APFloat::getSmallestNormalized(Ty->getFltSemantics());
8737fa27ce4SDimitry Andric Value *NeedScale =
8747fa27ce4SDimitry Andric Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
8757fa27ce4SDimitry Andric Constant *One = ConstantFP::get(Ty, 1.0);
8767fa27ce4SDimitry Andric Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
8777fa27ce4SDimitry Andric Constant *OutputScale =
8787fa27ce4SDimitry Andric ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
8797fa27ce4SDimitry Andric
8807fa27ce4SDimitry Andric Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
8817fa27ce4SDimitry Andric
8827fa27ce4SDimitry Andric Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
8837fa27ce4SDimitry Andric Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
8847fa27ce4SDimitry Andric Value *OutputScaleFactor = Builder.CreateSelect(
8857fa27ce4SDimitry Andric NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
8867fa27ce4SDimitry Andric
8877fa27ce4SDimitry Andric return Builder.CreateFMul(Rsq, OutputScaleFactor);
8887fa27ce4SDimitry Andric }
8897fa27ce4SDimitry Andric
canOptimizeWithRsq(const FPMathOperator * SqrtOp,FastMathFlags DivFMF,FastMathFlags SqrtFMF) const8907fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
8917fa27ce4SDimitry Andric FastMathFlags DivFMF,
8927fa27ce4SDimitry Andric FastMathFlags SqrtFMF) const {
8937fa27ce4SDimitry Andric // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
8947fa27ce4SDimitry Andric if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
8957fa27ce4SDimitry Andric return false;
8967fa27ce4SDimitry Andric
8977fa27ce4SDimitry Andric // v_rsq_f32 gives 1ulp
8987fa27ce4SDimitry Andric return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
8997fa27ce4SDimitry Andric SqrtOp->getFPAccuracy() >= 1.0f;
9007fa27ce4SDimitry Andric }
9017fa27ce4SDimitry Andric
// Try to replace (+/-1.0) / sqrt(Den) with amdgcn.rsq (or its denormal-safe
// 1ulp expansion). Returns the replacement value or null if the pattern does
// not apply.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  assert(Den->getType()->isFloatTy());

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  // NOTE: IsNegative is assigned as a side effect of the -1.0 test.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
        canIgnoreDenormalInput(Den, CtxI)) {
      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
      // -1.0 / sqrt(x) -> fneg(rsq(x))
      return IsNegative ? Builder.CreateFNeg(Result) : Result;
    }

    // Denormals matter: use the scaled 1ulp expansion instead of a raw rsq.
    return emitRsqIEEE1ULP(Builder, Den, IsNegative);
  }

  return nullptr;
}
9377fa27ce4SDimitry Andric
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
//
// Returns the replacement value, or null if no rcp form applies.
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    // NOTE: IsNegative is assigned as a side effect of the -1.0 test.
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        // insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      // emitRcpIEEE1ULP applies the fneg for the negative case internally.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}
1000cfca06d7SDimitry Andric
1001cfca06d7SDimitry Andric // optimize with fdiv.fast:
1002cfca06d7SDimitry Andric //
1003cfca06d7SDimitry Andric // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1004cfca06d7SDimitry Andric //
1005cfca06d7SDimitry Andric // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1006cfca06d7SDimitry Andric //
1007cfca06d7SDimitry Andric // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
optimizeWithFDivFast(IRBuilder<> & Builder,Value * Num,Value * Den,float ReqdAccuracy) const10087fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
10097fa27ce4SDimitry Andric IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1010cfca06d7SDimitry Andric // fdiv.fast can achieve 2.5 ULP accuracy.
1011cfca06d7SDimitry Andric if (ReqdAccuracy < 2.5f)
1012cfca06d7SDimitry Andric return nullptr;
1013cfca06d7SDimitry Andric
1014cfca06d7SDimitry Andric // Only have fdiv.fast for f32.
10157fa27ce4SDimitry Andric assert(Den->getType()->isFloatTy());
1016cfca06d7SDimitry Andric
1017cfca06d7SDimitry Andric bool NumIsOne = false;
1018cfca06d7SDimitry Andric if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1019cfca06d7SDimitry Andric if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1020cfca06d7SDimitry Andric NumIsOne = true;
1021cfca06d7SDimitry Andric }
1022cfca06d7SDimitry Andric
1023cfca06d7SDimitry Andric // fdiv does not support denormals. But 1.0/x is always fine to use it.
10247fa27ce4SDimitry Andric //
10257fa27ce4SDimitry Andric // TODO: This works for any value with a specific known exponent range, don't
10267fa27ce4SDimitry Andric // just limit to constant 1.
10277fa27ce4SDimitry Andric if (!HasFP32DenormalFlush && !NumIsOne)
1028cfca06d7SDimitry Andric return nullptr;
1029cfca06d7SDimitry Andric
10307fa27ce4SDimitry Andric return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
10317fa27ce4SDimitry Andric }
10327fa27ce4SDimitry Andric
visitFDivElement(IRBuilder<> & Builder,Value * Num,Value * Den,FastMathFlags DivFMF,FastMathFlags SqrtFMF,Value * RsqOp,const Instruction * FDivInst,float ReqdDivAccuracy) const10337fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
10347fa27ce4SDimitry Andric IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
10357fa27ce4SDimitry Andric FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
10367fa27ce4SDimitry Andric float ReqdDivAccuracy) const {
10377fa27ce4SDimitry Andric if (RsqOp) {
10387fa27ce4SDimitry Andric Value *Rsq =
10397fa27ce4SDimitry Andric optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
10407fa27ce4SDimitry Andric if (Rsq)
10417fa27ce4SDimitry Andric return Rsq;
10427fa27ce4SDimitry Andric }
10437fa27ce4SDimitry Andric
10447fa27ce4SDimitry Andric Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
10457fa27ce4SDimitry Andric if (Rcp)
10467fa27ce4SDimitry Andric return Rcp;
10477fa27ce4SDimitry Andric
10487fa27ce4SDimitry Andric // In the basic case fdiv_fast has the same instruction count as the frexp div
10497fa27ce4SDimitry Andric // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
10507fa27ce4SDimitry Andric // potentially be fused into a user. Also, materialization of the constants
10517fa27ce4SDimitry Andric // can be reused for multiple instances.
10527fa27ce4SDimitry Andric Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
10537fa27ce4SDimitry Andric if (FDivFast)
10547fa27ce4SDimitry Andric return FDivFast;
10557fa27ce4SDimitry Andric
10567fa27ce4SDimitry Andric return emitFrexpDiv(Builder, Num, Den, DivFMF);
1057cfca06d7SDimitry Andric }
1058cfca06d7SDimitry Andric
// Optimization is performed based on fpmath and fast math flags, as well as
// denormal mode, to optimize fdiv with either rcp or fdiv.fast.
1061cfca06d7SDimitry Andric //
1062cfca06d7SDimitry Andric // With rcp:
1063cfca06d7SDimitry Andric // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1064cfca06d7SDimitry Andric // allowed with unsafe-fp-math or afn.
1065cfca06d7SDimitry Andric //
1066cfca06d7SDimitry Andric // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1067cfca06d7SDimitry Andric //
1068cfca06d7SDimitry Andric // With fdiv.fast:
1069cfca06d7SDimitry Andric // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1070cfca06d7SDimitry Andric //
1071cfca06d7SDimitry Andric // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1072cfca06d7SDimitry Andric //
1073cfca06d7SDimitry Andric // NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  // Only f32 (scalar or vector-of-f32) is handled here.
  Type *Ty = FDiv.getType()->getScalarType();
  if (!Ty->isFloatTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Fast math flags of the sqrt feeding the denominator, if any.
  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  // Recognize a / sqrt(b). The sqrt must be single-use so folding it into an
  // rsq does not leave a live sqrt behind.
  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  // Insert replacement code immediately after the original fdiv.
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  // Scalarize: process one element at a time so each lane can pick its own
  // best expansion.
  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);

  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals[I] = NewElt;
  }

  // Reassemble the (possibly vector) result and splice it in for the
  // original instruction.
  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);

  if (NewVal) {
    FDiv.replaceAllUsesWith(NewVal);
    NewVal->takeName(&FDiv);
    RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
  }

  return true;
}
1166344a3780SDimitry Andric
hasUnsafeFPMath(const Function & F)1167a7fe922bSDimitry Andric static bool hasUnsafeFPMath(const Function &F) {
1168a7fe922bSDimitry Andric Attribute Attr = F.getFnAttribute("unsafe-fp-math");
1169344a3780SDimitry Andric return Attr.getValueAsBool();
1170a7fe922bSDimitry Andric }
1171a7fe922bSDimitry Andric
getMul64(IRBuilder<> & Builder,Value * LHS,Value * RHS)1172eb11fae6SDimitry Andric static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1173eb11fae6SDimitry Andric Value *LHS, Value *RHS) {
1174eb11fae6SDimitry Andric Type *I32Ty = Builder.getInt32Ty();
1175eb11fae6SDimitry Andric Type *I64Ty = Builder.getInt64Ty();
1176b915e9e0SDimitry Andric
1177eb11fae6SDimitry Andric Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1178eb11fae6SDimitry Andric Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1179eb11fae6SDimitry Andric Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1180eb11fae6SDimitry Andric Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1181eb11fae6SDimitry Andric Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1182eb11fae6SDimitry Andric Hi = Builder.CreateTrunc(Hi, I32Ty);
1183e3b55780SDimitry Andric return std::pair(Lo, Hi);
1184eb11fae6SDimitry Andric }
1185eb11fae6SDimitry Andric
getMulHu(IRBuilder<> & Builder,Value * LHS,Value * RHS)1186eb11fae6SDimitry Andric static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1187eb11fae6SDimitry Andric return getMul64(Builder, LHS, RHS).second;
1188eb11fae6SDimitry Andric }
1189eb11fae6SDimitry Andric
/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
getDivNumBits(BinaryOperator & I,Value * Num,Value * Den,unsigned AtLeast,bool IsSigned) const11937fa27ce4SDimitry Andric int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
11947fa27ce4SDimitry Andric Value *Den, unsigned AtLeast,
11957fa27ce4SDimitry Andric bool IsSigned) const {
1196cfca06d7SDimitry Andric const DataLayout &DL = Mod->getDataLayout();
1197cfca06d7SDimitry Andric unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1198cfca06d7SDimitry Andric if (LHSSignBits < AtLeast)
1199cfca06d7SDimitry Andric return -1;
1200cfca06d7SDimitry Andric
1201cfca06d7SDimitry Andric unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1202cfca06d7SDimitry Andric if (RHSSignBits < AtLeast)
1203cfca06d7SDimitry Andric return -1;
1204cfca06d7SDimitry Andric
1205cfca06d7SDimitry Andric unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1206cfca06d7SDimitry Andric unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1207cfca06d7SDimitry Andric if (IsSigned)
1208cfca06d7SDimitry Andric ++DivBits;
1209cfca06d7SDimitry Andric return DivBits;
1210cfca06d7SDimitry Andric }
1211cfca06d7SDimitry Andric
1212eb11fae6SDimitry Andric // The fractional part of a float is enough to accurately represent up to
1213eb11fae6SDimitry Andric // a 24-bit signed integer.
expandDivRem24(IRBuilder<> & Builder,BinaryOperator & I,Value * Num,Value * Den,bool IsDiv,bool IsSigned) const12147fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
12157fa27ce4SDimitry Andric BinaryOperator &I, Value *Num,
12167fa27ce4SDimitry Andric Value *Den, bool IsDiv,
12177fa27ce4SDimitry Andric bool IsSigned) const {
1218ac9a064cSDimitry Andric unsigned SSBits = Num->getType()->getScalarSizeInBits();
1219ac9a064cSDimitry Andric // If Num bits <= 24, assume 0 signbits.
1220ac9a064cSDimitry Andric unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1221ac9a064cSDimitry Andric int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1222cfca06d7SDimitry Andric if (DivBits == -1)
1223eb11fae6SDimitry Andric return nullptr;
1224cfca06d7SDimitry Andric return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1225cfca06d7SDimitry Andric }
1226eb11fae6SDimitry Andric
// Emit the 24-bit expansion: approximate the quotient via an f32 reciprocal
// (exact enough since <=24-bit integers are representable in an f32 mantissa)
// and then apply at most a one-ULP correction.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  // Do the whole computation on i32 regardless of the original scalar width.
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  // JQ is the +/-1 correction term applied when the estimate is one off;
  // for unsigned it is always +1.
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib, (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
                       : Builder.CreateUIToFP(IB,F32Ty);

  // Quotient estimate: fa * rcp(fb).
  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  // Use fma when the target has no mad/mac f32 instructions.
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  // Only apply the correction when the remainder estimate overshoots.
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
1324eb11fae6SDimitry Andric
// Try to recognize special cases for which the DAG will emit special, better
// expansions than the general expansion we do here.
1327cfca06d7SDimitry Andric
1328cfca06d7SDimitry Andric // TODO: It would be better to just directly handle those optimizations here.
divHasSpecialOptimization(BinaryOperator & I,Value * Num,Value * Den) const13297fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
13307fa27ce4SDimitry Andric Value *Num,
13317fa27ce4SDimitry Andric Value *Den) const {
1332cfca06d7SDimitry Andric if (Constant *C = dyn_cast<Constant>(Den)) {
1333cfca06d7SDimitry Andric // Arbitrary constants get a better expansion as long as a wider mulhi is
1334cfca06d7SDimitry Andric // legal.
1335cfca06d7SDimitry Andric if (C->getType()->getScalarSizeInBits() <= 32)
1336cfca06d7SDimitry Andric return true;
1337cfca06d7SDimitry Andric
1338cfca06d7SDimitry Andric // TODO: Sdiv check for not exact for some reason.
1339cfca06d7SDimitry Andric
1340cfca06d7SDimitry Andric // If there's no wider mulhi, there's only a better expansion for powers of
1341cfca06d7SDimitry Andric // two.
1342cfca06d7SDimitry Andric // TODO: Should really know for each vector element.
1343cfca06d7SDimitry Andric if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1344cfca06d7SDimitry Andric return true;
1345cfca06d7SDimitry Andric
1346cfca06d7SDimitry Andric return false;
1347cfca06d7SDimitry Andric }
1348cfca06d7SDimitry Andric
1349cfca06d7SDimitry Andric if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1350cfca06d7SDimitry Andric // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1351cfca06d7SDimitry Andric if (BinOpDen->getOpcode() == Instruction::Shl &&
1352cfca06d7SDimitry Andric isa<Constant>(BinOpDen->getOperand(0)) &&
1353cfca06d7SDimitry Andric isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1354cfca06d7SDimitry Andric 0, AC, &I, DT)) {
1355cfca06d7SDimitry Andric return true;
1356cfca06d7SDimitry Andric }
1357cfca06d7SDimitry Andric }
1358cfca06d7SDimitry Andric
1359cfca06d7SDimitry Andric return false;
1360cfca06d7SDimitry Andric }
1361cfca06d7SDimitry Andric
getSign32(Value * V,IRBuilder<> & Builder,const DataLayout * DL)1362cfca06d7SDimitry Andric static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1363cfca06d7SDimitry Andric // Check whether the sign can be determined statically.
1364cfca06d7SDimitry Andric KnownBits Known = computeKnownBits(V, *DL);
1365cfca06d7SDimitry Andric if (Known.isNegative())
1366cfca06d7SDimitry Andric return Constant::getAllOnesValue(V->getType());
1367cfca06d7SDimitry Andric if (Known.isNonNegative())
1368cfca06d7SDimitry Andric return Constant::getNullValue(V->getType());
1369cfca06d7SDimitry Andric return Builder.CreateAShr(V, Builder.getInt32(31));
1370cfca06d7SDimitry Andric }
1371cfca06d7SDimitry Andric
// Expand a 32-bit (or narrower) integer division/remainder into an inline
// Newton-Raphson based sequence. Returns the replacement value, or null when
// the instruction should be kept for a later, better expansion.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // The float steps of the expansion are emitted with fast-math flags.
  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Normalize narrower (or wider) operands to i32 for the expansion.
  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(X, I32Ty);
      Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(X, I32Ty);
      Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
    }
  }

  // Prefer the cheaper 24-bit expansion when the operands are narrow enough.
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  // For signed operands, reduce to unsigned division of the absolute values
  // (|v| = (v + sign) ^ sign) and fix the sign of the result at the end.
  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
  // 0x4F7FFFFE is (4294967296.0 - 512.0) as an f32 bit pattern.
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    // Restore the sign ((r ^ sign) - sign negates r when sign is -1).
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
    Res = Builder.CreateSExtOrTrunc(Res, Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(Res, Ty);
  }
  return Res;
}
1496eb11fae6SDimitry Andric
shrinkDivRem64(IRBuilder<> & Builder,BinaryOperator & I,Value * Num,Value * Den) const14977fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
14987fa27ce4SDimitry Andric BinaryOperator &I, Value *Num,
14997fa27ce4SDimitry Andric Value *Den) const {
1500cfca06d7SDimitry Andric if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1501cfca06d7SDimitry Andric return nullptr; // Keep it for later optimization.
1502cfca06d7SDimitry Andric
1503cfca06d7SDimitry Andric Instruction::BinaryOps Opc = I.getOpcode();
1504cfca06d7SDimitry Andric
1505cfca06d7SDimitry Andric bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1506cfca06d7SDimitry Andric bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1507cfca06d7SDimitry Andric
1508cfca06d7SDimitry Andric int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1509cfca06d7SDimitry Andric if (NumDivBits == -1)
1510cfca06d7SDimitry Andric return nullptr;
1511cfca06d7SDimitry Andric
1512cfca06d7SDimitry Andric Value *Narrowed = nullptr;
1513cfca06d7SDimitry Andric if (NumDivBits <= 24) {
1514cfca06d7SDimitry Andric Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1515cfca06d7SDimitry Andric IsDiv, IsSigned);
1516cfca06d7SDimitry Andric } else if (NumDivBits <= 32) {
1517cfca06d7SDimitry Andric Narrowed = expandDivRem32(Builder, I, Num, Den);
1518cfca06d7SDimitry Andric }
1519cfca06d7SDimitry Andric
1520cfca06d7SDimitry Andric if (Narrowed) {
1521cfca06d7SDimitry Andric return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1522cfca06d7SDimitry Andric Builder.CreateZExt(Narrowed, Num->getType());
1523cfca06d7SDimitry Andric }
1524cfca06d7SDimitry Andric
1525cfca06d7SDimitry Andric return nullptr;
1526cfca06d7SDimitry Andric }
1527cfca06d7SDimitry Andric
expandDivRem64(BinaryOperator & I) const15287fa27ce4SDimitry Andric void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1529cfca06d7SDimitry Andric Instruction::BinaryOps Opc = I.getOpcode();
1530cfca06d7SDimitry Andric // Do the general expansion.
1531cfca06d7SDimitry Andric if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1532cfca06d7SDimitry Andric expandDivisionUpTo64Bits(&I);
1533cfca06d7SDimitry Andric return;
1534cfca06d7SDimitry Andric }
1535cfca06d7SDimitry Andric
1536cfca06d7SDimitry Andric if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1537cfca06d7SDimitry Andric expandRemainderUpTo64Bits(&I);
1538cfca06d7SDimitry Andric return;
1539cfca06d7SDimitry Andric }
1540cfca06d7SDimitry Andric
1541cfca06d7SDimitry Andric llvm_unreachable("not a division");
1542cfca06d7SDimitry Andric }
1543cfca06d7SDimitry Andric
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  // Promote uniform narrow (sub-i32) ops to i32 when 16-bit instructions
  // exist.
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  // 64-bit divisions that need the general (control-flow) expansion are
  // collected here and expanded after the main rewrite.
  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      // Scalarize vector divisions and expand each element independently.
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        // Preserve poison-generating flags (exact etc.) on the scalar copy.
        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
1630b915e9e0SDimitry Andric
visitLoadInst(LoadInst & I)16317fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1632eb11fae6SDimitry Andric if (!WidenLoads)
1633eb11fae6SDimitry Andric return false;
1634eb11fae6SDimitry Andric
1635d8e91e46SDimitry Andric if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1636d8e91e46SDimitry Andric I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1637044eb2f6SDimitry Andric canWidenScalarExtLoad(I)) {
1638044eb2f6SDimitry Andric IRBuilder<> Builder(&I);
1639044eb2f6SDimitry Andric Builder.SetCurrentDebugLocation(I.getDebugLoc());
1640044eb2f6SDimitry Andric
1641044eb2f6SDimitry Andric Type *I32Ty = Builder.getInt32Ty();
16427fa27ce4SDimitry Andric LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1643eb11fae6SDimitry Andric WidenLoad->copyMetadata(I);
1644eb11fae6SDimitry Andric
1645eb11fae6SDimitry Andric // If we have range metadata, we need to convert the type, and not make
1646eb11fae6SDimitry Andric // assumptions about the high bits.
1647eb11fae6SDimitry Andric if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1648eb11fae6SDimitry Andric ConstantInt *Lower =
1649eb11fae6SDimitry Andric mdconst::extract<ConstantInt>(Range->getOperand(0));
1650eb11fae6SDimitry Andric
1651c0981da4SDimitry Andric if (Lower->isNullValue()) {
1652eb11fae6SDimitry Andric WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1653eb11fae6SDimitry Andric } else {
1654eb11fae6SDimitry Andric Metadata *LowAndHigh[] = {
1655eb11fae6SDimitry Andric ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1656eb11fae6SDimitry Andric // Don't make assumptions about the high bits.
1657eb11fae6SDimitry Andric ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1658eb11fae6SDimitry Andric };
1659eb11fae6SDimitry Andric
1660eb11fae6SDimitry Andric WidenLoad->setMetadata(LLVMContext::MD_range,
1661eb11fae6SDimitry Andric MDNode::get(Mod->getContext(), LowAndHigh));
1662eb11fae6SDimitry Andric }
1663eb11fae6SDimitry Andric }
1664044eb2f6SDimitry Andric
1665044eb2f6SDimitry Andric int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1666044eb2f6SDimitry Andric Type *IntNTy = Builder.getIntNTy(TySize);
1667044eb2f6SDimitry Andric Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1668044eb2f6SDimitry Andric Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1669044eb2f6SDimitry Andric I.replaceAllUsesWith(ValOrig);
1670044eb2f6SDimitry Andric I.eraseFromParent();
1671044eb2f6SDimitry Andric return true;
1672044eb2f6SDimitry Andric }
1673044eb2f6SDimitry Andric
1674044eb2f6SDimitry Andric return false;
1675044eb2f6SDimitry Andric }
1676044eb2f6SDimitry Andric
visitICmpInst(ICmpInst & I)16777fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1678b915e9e0SDimitry Andric bool Changed = false;
1679b915e9e0SDimitry Andric
1680b915e9e0SDimitry Andric if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
16817fa27ce4SDimitry Andric UA->isUniform(&I))
1682b915e9e0SDimitry Andric Changed |= promoteUniformOpToI32(I);
1683b915e9e0SDimitry Andric
1684b915e9e0SDimitry Andric return Changed;
1685b915e9e0SDimitry Andric }
1686b915e9e0SDimitry Andric
visitSelectInst(SelectInst & I)16877fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
16887fa27ce4SDimitry Andric Value *Cond = I.getCondition();
16897fa27ce4SDimitry Andric Value *TrueVal = I.getTrueValue();
16907fa27ce4SDimitry Andric Value *FalseVal = I.getFalseValue();
16917fa27ce4SDimitry Andric Value *CmpVal;
16927fa27ce4SDimitry Andric FCmpInst::Predicate Pred;
1693b915e9e0SDimitry Andric
16947fa27ce4SDimitry Andric if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
16957fa27ce4SDimitry Andric if (UA->isUniform(&I))
16967fa27ce4SDimitry Andric return promoteUniformOpToI32(I);
16977fa27ce4SDimitry Andric return false;
1698b915e9e0SDimitry Andric }
1699b915e9e0SDimitry Andric
17007fa27ce4SDimitry Andric // Match fract pattern with nan check.
17017fa27ce4SDimitry Andric if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
17027fa27ce4SDimitry Andric return false;
17037fa27ce4SDimitry Andric
17047fa27ce4SDimitry Andric FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
17057fa27ce4SDimitry Andric if (!FPOp)
17067fa27ce4SDimitry Andric return false;
17077fa27ce4SDimitry Andric
17087fa27ce4SDimitry Andric IRBuilder<> Builder(&I);
17097fa27ce4SDimitry Andric Builder.setFastMathFlags(FPOp->getFastMathFlags());
17107fa27ce4SDimitry Andric
17117fa27ce4SDimitry Andric auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
17127fa27ce4SDimitry Andric auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
17137fa27ce4SDimitry Andric
17147fa27ce4SDimitry Andric Value *Fract = nullptr;
17157fa27ce4SDimitry Andric if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
17167fa27ce4SDimitry Andric CmpVal == matchFractPat(*IIFalse)) {
17177fa27ce4SDimitry Andric // isnan(x) ? x : fract(x)
17187fa27ce4SDimitry Andric Fract = applyFractPat(Builder, CmpVal);
17197fa27ce4SDimitry Andric } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
17207fa27ce4SDimitry Andric CmpVal == matchFractPat(*IITrue)) {
17217fa27ce4SDimitry Andric // !isnan(x) ? fract(x) : x
17227fa27ce4SDimitry Andric Fract = applyFractPat(Builder, CmpVal);
17237fa27ce4SDimitry Andric } else
17247fa27ce4SDimitry Andric return false;
17257fa27ce4SDimitry Andric
17267fa27ce4SDimitry Andric Fract->takeName(&I);
17277fa27ce4SDimitry Andric I.replaceAllUsesWith(Fract);
17287fa27ce4SDimitry Andric RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
17297fa27ce4SDimitry Andric return true;
17307fa27ce4SDimitry Andric }
17317fa27ce4SDimitry Andric
areInSameBB(const Value * A,const Value * B)17327fa27ce4SDimitry Andric static bool areInSameBB(const Value *A, const Value *B) {
17337fa27ce4SDimitry Andric const auto *IA = dyn_cast<Instruction>(A);
17347fa27ce4SDimitry Andric const auto *IB = dyn_cast<Instruction>(B);
17357fa27ce4SDimitry Andric return IA && IB && IA->getParent() == IB->getParent();
17367fa27ce4SDimitry Andric }
17377fa27ce4SDimitry Andric
17387fa27ce4SDimitry Andric // Helper for breaking large PHIs that returns true when an extractelement on V
17397fa27ce4SDimitry Andric // is likely to be folded away by the DAG combiner.
isInterestingPHIIncomingValue(const Value * V)17407fa27ce4SDimitry Andric static bool isInterestingPHIIncomingValue(const Value *V) {
17417fa27ce4SDimitry Andric const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
17427fa27ce4SDimitry Andric if (!FVT)
17437fa27ce4SDimitry Andric return false;
17447fa27ce4SDimitry Andric
17457fa27ce4SDimitry Andric const Value *CurVal = V;
17467fa27ce4SDimitry Andric
17477fa27ce4SDimitry Andric // Check for insertelements, keeping track of the elements covered.
17487fa27ce4SDimitry Andric BitVector EltsCovered(FVT->getNumElements());
17497fa27ce4SDimitry Andric while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
17507fa27ce4SDimitry Andric const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
17517fa27ce4SDimitry Andric
17527fa27ce4SDimitry Andric // Non constant index/out of bounds index -> folding is unlikely.
17537fa27ce4SDimitry Andric // The latter is more of a sanity check because canonical IR should just
17547fa27ce4SDimitry Andric // have replaced those with poison.
1755ac9a064cSDimitry Andric if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
17567fa27ce4SDimitry Andric return false;
17577fa27ce4SDimitry Andric
17587fa27ce4SDimitry Andric const auto *VecSrc = IE->getOperand(0);
17597fa27ce4SDimitry Andric
17607fa27ce4SDimitry Andric // If the vector source is another instruction, it must be in the same basic
17617fa27ce4SDimitry Andric // block. Otherwise, the DAGCombiner won't see the whole thing and is
17627fa27ce4SDimitry Andric // unlikely to be able to do anything interesting here.
17637fa27ce4SDimitry Andric if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
17647fa27ce4SDimitry Andric return false;
17657fa27ce4SDimitry Andric
17667fa27ce4SDimitry Andric CurVal = VecSrc;
1767ac9a064cSDimitry Andric EltsCovered.set(Idx->getZExtValue());
17687fa27ce4SDimitry Andric
17697fa27ce4SDimitry Andric // All elements covered.
17707fa27ce4SDimitry Andric if (EltsCovered.all())
17717fa27ce4SDimitry Andric return true;
17727fa27ce4SDimitry Andric }
17737fa27ce4SDimitry Andric
17747fa27ce4SDimitry Andric // We either didn't find a single insertelement, or the insertelement chain
17757fa27ce4SDimitry Andric // ended before all elements were covered. Check for other interesting values.
17767fa27ce4SDimitry Andric
17777fa27ce4SDimitry Andric // Constants are always interesting because we can just constant fold the
17787fa27ce4SDimitry Andric // extractelements.
17797fa27ce4SDimitry Andric if (isa<Constant>(CurVal))
17807fa27ce4SDimitry Andric return true;
17817fa27ce4SDimitry Andric
17827fa27ce4SDimitry Andric // shufflevector is likely to be profitable if either operand is a constant,
17837fa27ce4SDimitry Andric // or if either source is in the same block.
17847fa27ce4SDimitry Andric // This is because shufflevector is most often lowered as a series of
17857fa27ce4SDimitry Andric // insert/extract elements anyway.
17867fa27ce4SDimitry Andric if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
17877fa27ce4SDimitry Andric return isa<Constant>(SV->getOperand(1)) ||
17887fa27ce4SDimitry Andric areInSameBB(SV, SV->getOperand(0)) ||
17897fa27ce4SDimitry Andric areInSameBB(SV, SV->getOperand(1));
17907fa27ce4SDimitry Andric }
17917fa27ce4SDimitry Andric
17927fa27ce4SDimitry Andric return false;
17937fa27ce4SDimitry Andric }
17947fa27ce4SDimitry Andric
collectPHINodes(const PHINode & I,SmallPtrSet<const PHINode *,8> & SeenPHIs)1795b1c73532SDimitry Andric static void collectPHINodes(const PHINode &I,
1796b1c73532SDimitry Andric SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1797b1c73532SDimitry Andric const auto [It, Inserted] = SeenPHIs.insert(&I);
1798b1c73532SDimitry Andric if (!Inserted)
1799b1c73532SDimitry Andric return;
1800b1c73532SDimitry Andric
1801b1c73532SDimitry Andric for (const Value *Inc : I.incoming_values()) {
1802b1c73532SDimitry Andric if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1803b1c73532SDimitry Andric collectPHINodes(*PhiInc, SeenPHIs);
1804b1c73532SDimitry Andric }
1805b1c73532SDimitry Andric
1806b1c73532SDimitry Andric for (const User *U : I.users()) {
1807b1c73532SDimitry Andric if (const auto *PhiU = dyn_cast<PHINode>(U))
1808b1c73532SDimitry Andric collectPHINodes(*PhiU, SeenPHIs);
1809b1c73532SDimitry Andric }
1810b1c73532SDimitry Andric }
1811b1c73532SDimitry Andric
canBreakPHINode(const PHINode & I)18127fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1813b1c73532SDimitry Andric // Check in the cache first.
1814b1c73532SDimitry Andric if (const auto It = BreakPhiNodesCache.find(&I);
1815b1c73532SDimitry Andric It != BreakPhiNodesCache.end())
18167fa27ce4SDimitry Andric return It->second;
18177fa27ce4SDimitry Andric
1818b1c73532SDimitry Andric // We consider PHI nodes as part of "chains", so given a PHI node I, we
1819b1c73532SDimitry Andric // recursively consider all its users and incoming values that are also PHI
1820b1c73532SDimitry Andric // nodes. We then make a decision about all of those PHIs at once. Either they
1821b1c73532SDimitry Andric // all get broken up, or none of them do. That way, we avoid cases where a
1822b1c73532SDimitry Andric // single PHI is/is not broken and we end up reforming/exploding a vector
1823b1c73532SDimitry Andric // multiple times, or even worse, doing it in a loop.
1824b1c73532SDimitry Andric SmallPtrSet<const PHINode *, 8> WorkList;
1825b1c73532SDimitry Andric collectPHINodes(I, WorkList);
18267fa27ce4SDimitry Andric
1827b1c73532SDimitry Andric #ifndef NDEBUG
1828b1c73532SDimitry Andric // Check that none of the PHI nodes in the worklist are in the map. If some of
1829b1c73532SDimitry Andric // them are, it means we're not good enough at collecting related PHIs.
1830b1c73532SDimitry Andric for (const PHINode *WLP : WorkList) {
1831b1c73532SDimitry Andric assert(BreakPhiNodesCache.count(WLP) == 0);
1832b1c73532SDimitry Andric }
1833b1c73532SDimitry Andric #endif
1834b1c73532SDimitry Andric
1835b1c73532SDimitry Andric // To consider a PHI profitable to break, we need to see some interesting
1836b1c73532SDimitry Andric // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1837b1c73532SDimitry Andric // must have one to consider all PHIs breakable.
1838b1c73532SDimitry Andric //
1839b1c73532SDimitry Andric // This threshold has been determined through performance testing.
1840b1c73532SDimitry Andric //
1841b1c73532SDimitry Andric // Note that the computation below is equivalent to
1842b1c73532SDimitry Andric //
1843b1c73532SDimitry Andric // (unsigned)ceil((K / 3.0) * 2)
1844b1c73532SDimitry Andric //
1845b1c73532SDimitry Andric // It's simply written this way to avoid mixing integral/FP arithmetic.
1846b1c73532SDimitry Andric const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1847b1c73532SDimitry Andric unsigned NumBreakablePHIs = 0;
1848b1c73532SDimitry Andric bool CanBreak = false;
1849b1c73532SDimitry Andric for (const PHINode *Cur : WorkList) {
18507fa27ce4SDimitry Andric // Don't break PHIs that have no interesting incoming values. That is, where
1851b1c73532SDimitry Andric // there is no clear opportunity to fold the "extractelement" instructions
1852b1c73532SDimitry Andric // we would add.
18537fa27ce4SDimitry Andric //
18547fa27ce4SDimitry Andric // Note: IC does not run after this pass, so we're only interested in the
18557fa27ce4SDimitry Andric // foldings that the DAG combiner can do.
1856b1c73532SDimitry Andric if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1857b1c73532SDimitry Andric if (++NumBreakablePHIs >= Threshold) {
1858b1c73532SDimitry Andric CanBreak = true;
1859b1c73532SDimitry Andric break;
1860b1c73532SDimitry Andric }
1861b1c73532SDimitry Andric }
18627fa27ce4SDimitry Andric }
18637fa27ce4SDimitry Andric
1864b1c73532SDimitry Andric for (const PHINode *Cur : WorkList)
1865b1c73532SDimitry Andric BreakPhiNodesCache[Cur] = CanBreak;
18667fa27ce4SDimitry Andric
1867b1c73532SDimitry Andric return CanBreak;
18687fa27ce4SDimitry Andric }
18697fa27ce4SDimitry Andric
18707fa27ce4SDimitry Andric /// Helper class for "break large PHIs" (visitPHINode).
18717fa27ce4SDimitry Andric ///
18727fa27ce4SDimitry Andric /// This represents a slice of a PHI's incoming value, which is made up of:
18737fa27ce4SDimitry Andric /// - The type of the slice (Ty)
18747fa27ce4SDimitry Andric /// - The index in the incoming value's vector where the slice starts (Idx)
18757fa27ce4SDimitry Andric /// - The number of elements in the slice (NumElts).
18767fa27ce4SDimitry Andric /// It also keeps track of the NewPHI node inserted for this particular slice.
18777fa27ce4SDimitry Andric ///
18787fa27ce4SDimitry Andric /// Slice examples:
18797fa27ce4SDimitry Andric /// <4 x i64> -> Split into four i64 slices.
18807fa27ce4SDimitry Andric /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
18817fa27ce4SDimitry Andric /// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
18827fa27ce4SDimitry Andric /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
18837fa27ce4SDimitry Andric class VectorSlice {
18847fa27ce4SDimitry Andric public:
VectorSlice(Type * Ty,unsigned Idx,unsigned NumElts)18857fa27ce4SDimitry Andric VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
18867fa27ce4SDimitry Andric : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
18877fa27ce4SDimitry Andric
18887fa27ce4SDimitry Andric Type *Ty = nullptr;
18897fa27ce4SDimitry Andric unsigned Idx = 0;
18907fa27ce4SDimitry Andric unsigned NumElts = 0;
18917fa27ce4SDimitry Andric PHINode *NewPHI = nullptr;
18927fa27ce4SDimitry Andric
18937fa27ce4SDimitry Andric /// Slice \p Inc according to the information contained within this slice.
18947fa27ce4SDimitry Andric /// This is cached, so if called multiple times for the same \p BB & \p Inc
18957fa27ce4SDimitry Andric /// pair, it returns the same Sliced value as well.
18967fa27ce4SDimitry Andric ///
18977fa27ce4SDimitry Andric /// Note this *intentionally* does not return the same value for, say,
18987fa27ce4SDimitry Andric /// [%bb.0, %0] & [%bb.1, %0] as:
18997fa27ce4SDimitry Andric /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
19007fa27ce4SDimitry Andric /// the value in bb.1 may not be reachable from bb.0 if it's its
19017fa27ce4SDimitry Andric /// predecessor.)
19027fa27ce4SDimitry Andric /// - We also want to make our extract instructions as local as possible so
19037fa27ce4SDimitry Andric /// the DAG has better chances of folding them out. Duplicating them like
19047fa27ce4SDimitry Andric /// that is beneficial in that regard.
19057fa27ce4SDimitry Andric ///
19067fa27ce4SDimitry Andric /// This is both a minor optimization to avoid creating duplicate
19077fa27ce4SDimitry Andric /// instructions, but also a requirement for correctness. It is not forbidden
19087fa27ce4SDimitry Andric /// for a PHI node to have the same [BB, Val] pair multiple times. If we
19097fa27ce4SDimitry Andric /// returned a new value each time, those previously identical pairs would all
19107fa27ce4SDimitry Andric /// have different incoming values (from the same block) and it'd cause a "PHI
19117fa27ce4SDimitry Andric /// node has multiple entries for the same basic block with different incoming
19127fa27ce4SDimitry Andric /// values!" verifier error.
getSlicedVal(BasicBlock * BB,Value * Inc,StringRef NewValName)19137fa27ce4SDimitry Andric Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
19147fa27ce4SDimitry Andric Value *&Res = SlicedVals[{BB, Inc}];
19157fa27ce4SDimitry Andric if (Res)
19167fa27ce4SDimitry Andric return Res;
19177fa27ce4SDimitry Andric
19187fa27ce4SDimitry Andric IRBuilder<> B(BB->getTerminator());
19197fa27ce4SDimitry Andric if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
19207fa27ce4SDimitry Andric B.SetCurrentDebugLocation(IncInst->getDebugLoc());
19217fa27ce4SDimitry Andric
19227fa27ce4SDimitry Andric if (NumElts > 1) {
19237fa27ce4SDimitry Andric SmallVector<int, 4> Mask;
19247fa27ce4SDimitry Andric for (unsigned K = Idx; K < (Idx + NumElts); ++K)
19257fa27ce4SDimitry Andric Mask.push_back(K);
19267fa27ce4SDimitry Andric Res = B.CreateShuffleVector(Inc, Mask, NewValName);
19277fa27ce4SDimitry Andric } else
19287fa27ce4SDimitry Andric Res = B.CreateExtractElement(Inc, Idx, NewValName);
19297fa27ce4SDimitry Andric
19307fa27ce4SDimitry Andric return Res;
19317fa27ce4SDimitry Andric }
19327fa27ce4SDimitry Andric
19337fa27ce4SDimitry Andric private:
19347fa27ce4SDimitry Andric SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
19357fa27ce4SDimitry Andric };
19367fa27ce4SDimitry Andric
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  // Only fixed vectors with more than one element and more bits than the
  // threshold are worth splitting.
  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  // Unless breaking is forced, consult the profitability heuristic for this
  // PHI's whole chain.
  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  // Breaking only makes sense when it actually produces multiple slices.
  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instruction to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the build on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    // Feed each slice PHI one sliced value per predecessor edge of the
    // original PHI.
    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector of all the previous PHI values,
  // reassembled with insertvector for sub-vector slices and insertelement
  // for scalar slices.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}
20207fa27ce4SDimitry Andric
2021ac9a064cSDimitry Andric /// \param V Value to check
2022ac9a064cSDimitry Andric /// \param DL DataLayout
2023ac9a064cSDimitry Andric /// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2024ac9a064cSDimitry Andric /// \param AS Target Address Space
2025ac9a064cSDimitry Andric /// \return true if \p V cannot be the null value of \p AS, false otherwise.
isPtrKnownNeverNull(const Value * V,const DataLayout & DL,const AMDGPUTargetMachine & TM,unsigned AS)2026ac9a064cSDimitry Andric static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2027ac9a064cSDimitry Andric const AMDGPUTargetMachine &TM, unsigned AS) {
2028ac9a064cSDimitry Andric // Pointer cannot be null if it's a block address, GV or alloca.
2029ac9a064cSDimitry Andric // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2030ac9a064cSDimitry Andric // it as the symbol could be null in such cases.
2031ac9a064cSDimitry Andric if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2032ac9a064cSDimitry Andric return true;
2033ac9a064cSDimitry Andric
2034ac9a064cSDimitry Andric // Check nonnull arguments.
2035ac9a064cSDimitry Andric if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2036ac9a064cSDimitry Andric return true;
2037ac9a064cSDimitry Andric
2038ac9a064cSDimitry Andric // getUnderlyingObject may have looked through another addrspacecast, although
2039ac9a064cSDimitry Andric // the optimizable situations most likely folded out by now.
2040ac9a064cSDimitry Andric if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2041ac9a064cSDimitry Andric return false;
2042ac9a064cSDimitry Andric
2043ac9a064cSDimitry Andric // TODO: Calls that return nonnull?
2044ac9a064cSDimitry Andric
2045ac9a064cSDimitry Andric // For all other things, use KnownBits.
2046ac9a064cSDimitry Andric // We either use 0 or all bits set to indicate null, so check whether the
2047ac9a064cSDimitry Andric // value can be zero or all ones.
2048ac9a064cSDimitry Andric //
2049ac9a064cSDimitry Andric // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2050ac9a064cSDimitry Andric // address spaces have non-zero null values.
2051ac9a064cSDimitry Andric auto SrcPtrKB = computeKnownBits(V, DL);
2052ac9a064cSDimitry Andric const auto NullVal = TM.getNullPointerValue(AS);
2053ac9a064cSDimitry Andric
2054ac9a064cSDimitry Andric assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2055ac9a064cSDimitry Andric assert((NullVal == 0 || NullVal == -1) &&
2056ac9a064cSDimitry Andric "don't know how to check for this null value!");
2057ac9a064cSDimitry Andric return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2058ac9a064cSDimitry Andric }
2059ac9a064cSDimitry Andric
visitAddrSpaceCastInst(AddrSpaceCastInst & I)2060ac9a064cSDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2061ac9a064cSDimitry Andric // Intrinsic doesn't support vectors, also it seems that it's often difficult
2062ac9a064cSDimitry Andric // to prove that a vector cannot have any nulls in it so it's unclear if it's
2063ac9a064cSDimitry Andric // worth supporting.
2064ac9a064cSDimitry Andric if (I.getType()->isVectorTy())
2065ac9a064cSDimitry Andric return false;
2066ac9a064cSDimitry Andric
2067ac9a064cSDimitry Andric // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2068ac9a064cSDimitry Andric // This is only worthwhile for casts from/to priv/local to flat.
2069ac9a064cSDimitry Andric const unsigned SrcAS = I.getSrcAddressSpace();
2070ac9a064cSDimitry Andric const unsigned DstAS = I.getDestAddressSpace();
2071ac9a064cSDimitry Andric
2072ac9a064cSDimitry Andric bool CanLower = false;
2073ac9a064cSDimitry Andric if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2074ac9a064cSDimitry Andric CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2075ac9a064cSDimitry Andric DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2076ac9a064cSDimitry Andric else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2077ac9a064cSDimitry Andric CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2078ac9a064cSDimitry Andric SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2079ac9a064cSDimitry Andric if (!CanLower)
2080ac9a064cSDimitry Andric return false;
2081ac9a064cSDimitry Andric
2082ac9a064cSDimitry Andric SmallVector<const Value *, 4> WorkList;
2083ac9a064cSDimitry Andric getUnderlyingObjects(I.getOperand(0), WorkList);
2084ac9a064cSDimitry Andric if (!all_of(WorkList, [&](const Value *V) {
2085ac9a064cSDimitry Andric return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
2086ac9a064cSDimitry Andric }))
2087ac9a064cSDimitry Andric return false;
2088ac9a064cSDimitry Andric
2089ac9a064cSDimitry Andric IRBuilder<> B(&I);
2090ac9a064cSDimitry Andric auto *Intrin = B.CreateIntrinsic(
2091ac9a064cSDimitry Andric I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2092ac9a064cSDimitry Andric I.replaceAllUsesWith(Intrin);
2093ac9a064cSDimitry Andric I.eraseFromParent();
2094ac9a064cSDimitry Andric return true;
2095ac9a064cSDimitry Andric }
2096ac9a064cSDimitry Andric
visitIntrinsicInst(IntrinsicInst & I)20977fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2098b915e9e0SDimitry Andric switch (I.getIntrinsicID()) {
2099b915e9e0SDimitry Andric case Intrinsic::bitreverse:
2100b915e9e0SDimitry Andric return visitBitreverseIntrinsicInst(I);
21017fa27ce4SDimitry Andric case Intrinsic::minnum:
21027fa27ce4SDimitry Andric return visitMinNum(I);
2103b1c73532SDimitry Andric case Intrinsic::sqrt:
2104b1c73532SDimitry Andric return visitSqrt(I);
2105b915e9e0SDimitry Andric default:
2106b915e9e0SDimitry Andric return false;
2107b915e9e0SDimitry Andric }
2108b915e9e0SDimitry Andric }
2109b915e9e0SDimitry Andric
visitBitreverseIntrinsicInst(IntrinsicInst & I)21107fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2111b915e9e0SDimitry Andric bool Changed = false;
2112b915e9e0SDimitry Andric
2113b915e9e0SDimitry Andric if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
21147fa27ce4SDimitry Andric UA->isUniform(&I))
2115b915e9e0SDimitry Andric Changed |= promoteUniformBitreverseToI32(I);
2116b915e9e0SDimitry Andric
2117b915e9e0SDimitry Andric return Changed;
2118b915e9e0SDimitry Andric }
2119b915e9e0SDimitry Andric
21207fa27ce4SDimitry Andric /// Match non-nan fract pattern.
21217fa27ce4SDimitry Andric /// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)
21227fa27ce4SDimitry Andric ///
21237fa27ce4SDimitry Andric /// If fract is a useful instruction for the subtarget. Does not account for the
21247fa27ce4SDimitry Andric /// nan handling; the instruction has a nan check on the input value.
matchFractPat(IntrinsicInst & I)21257fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
21267fa27ce4SDimitry Andric if (ST->hasFractBug())
21277fa27ce4SDimitry Andric return nullptr;
21287fa27ce4SDimitry Andric
21297fa27ce4SDimitry Andric if (I.getIntrinsicID() != Intrinsic::minnum)
21307fa27ce4SDimitry Andric return nullptr;
21317fa27ce4SDimitry Andric
21327fa27ce4SDimitry Andric Type *Ty = I.getType();
21337fa27ce4SDimitry Andric if (!isLegalFloatingTy(Ty->getScalarType()))
21347fa27ce4SDimitry Andric return nullptr;
21357fa27ce4SDimitry Andric
21367fa27ce4SDimitry Andric Value *Arg0 = I.getArgOperand(0);
21377fa27ce4SDimitry Andric Value *Arg1 = I.getArgOperand(1);
21387fa27ce4SDimitry Andric
21397fa27ce4SDimitry Andric const APFloat *C;
21407fa27ce4SDimitry Andric if (!match(Arg1, m_APFloat(C)))
21417fa27ce4SDimitry Andric return nullptr;
21427fa27ce4SDimitry Andric
21437fa27ce4SDimitry Andric APFloat One(1.0);
21447fa27ce4SDimitry Andric bool LosesInfo;
21457fa27ce4SDimitry Andric One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
21467fa27ce4SDimitry Andric
21477fa27ce4SDimitry Andric // Match nextafter(1.0, -1)
21487fa27ce4SDimitry Andric One.next(true);
21497fa27ce4SDimitry Andric if (One != *C)
21507fa27ce4SDimitry Andric return nullptr;
21517fa27ce4SDimitry Andric
21527fa27ce4SDimitry Andric Value *FloorSrc;
21537fa27ce4SDimitry Andric if (match(Arg0, m_FSub(m_Value(FloorSrc),
21547fa27ce4SDimitry Andric m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
21557fa27ce4SDimitry Andric return FloorSrc;
21567fa27ce4SDimitry Andric return nullptr;
21577fa27ce4SDimitry Andric }
21587fa27ce4SDimitry Andric
applyFractPat(IRBuilder<> & Builder,Value * FractArg)21597fa27ce4SDimitry Andric Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
21607fa27ce4SDimitry Andric Value *FractArg) {
21617fa27ce4SDimitry Andric SmallVector<Value *, 4> FractVals;
21627fa27ce4SDimitry Andric extractValues(Builder, FractVals, FractArg);
21637fa27ce4SDimitry Andric
21647fa27ce4SDimitry Andric SmallVector<Value *, 4> ResultVals(FractVals.size());
21657fa27ce4SDimitry Andric
21667fa27ce4SDimitry Andric Type *Ty = FractArg->getType()->getScalarType();
21677fa27ce4SDimitry Andric for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
21687fa27ce4SDimitry Andric ResultVals[I] =
21697fa27ce4SDimitry Andric Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
21707fa27ce4SDimitry Andric }
21717fa27ce4SDimitry Andric
21727fa27ce4SDimitry Andric return insertValues(Builder, FractArg->getType(), ResultVals);
21737fa27ce4SDimitry Andric }
21747fa27ce4SDimitry Andric
visitMinNum(IntrinsicInst & I)21757fa27ce4SDimitry Andric bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
21767fa27ce4SDimitry Andric Value *FractArg = matchFractPat(I);
21777fa27ce4SDimitry Andric if (!FractArg)
21787fa27ce4SDimitry Andric return false;
21797fa27ce4SDimitry Andric
21807fa27ce4SDimitry Andric // Match pattern for fract intrinsic in contexts where the nan check has been
21817fa27ce4SDimitry Andric // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2182ac9a064cSDimitry Andric if (!I.hasNoNaNs() &&
2183ac9a064cSDimitry Andric !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
21847fa27ce4SDimitry Andric return false;
21857fa27ce4SDimitry Andric
21867fa27ce4SDimitry Andric IRBuilder<> Builder(&I);
21877fa27ce4SDimitry Andric FastMathFlags FMF = I.getFastMathFlags();
21887fa27ce4SDimitry Andric FMF.setNoNaNs();
21897fa27ce4SDimitry Andric Builder.setFastMathFlags(FMF);
21907fa27ce4SDimitry Andric
21917fa27ce4SDimitry Andric Value *Fract = applyFractPat(Builder, FractArg);
21927fa27ce4SDimitry Andric Fract->takeName(&I);
21937fa27ce4SDimitry Andric I.replaceAllUsesWith(Fract);
21947fa27ce4SDimitry Andric
21957fa27ce4SDimitry Andric RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
21967fa27ce4SDimitry Andric return true;
21977fa27ce4SDimitry Andric }
21987fa27ce4SDimitry Andric
isOneOrNegOne(const Value * Val)2199b1c73532SDimitry Andric static bool isOneOrNegOne(const Value *Val) {
2200b1c73532SDimitry Andric const APFloat *C;
2201b1c73532SDimitry Andric return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2202b1c73532SDimitry Andric }
2203b1c73532SDimitry Andric
// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
// Returns true if the sqrt was replaced. Only fires in the accuracy window
// where the raw hardware sqrt is good enough (>= 1 or 2 ULP requested) but a
// fully fast lowering was not requested.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  // Only handle f32, or f16 on subtargets without native 16-bit instructions.
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // We're trying to handle the fast-but-not-that-fast case only. The lowering
  // of fast llvm.sqrt will give the raw instruction anyway.
  if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
    return false;

  // Requested accuracy from the !fpmath metadata, in ULPs.
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Defer correctly rounded expansion to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  // FIXME: This is an ugly hack for this pass using forward iteration instead
  // of reverse. If it worked like a normal combiner, the rsq would form before
  // we saw a sqrt call.
  // Bail if the only user is an fdiv that can fold this into an rsq instead.
  auto *FDiv =
      dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&
      canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
      // TODO: We should also handle the arcp case for the fdiv with non-1 value
      isOneOrNegOne(FDiv->getOperand(0)))
    return false;

  Value *SrcVal = Sqrt.getOperand(0);
  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  IRBuilder<> Builder(&Sqrt);
  // Expand per scalar element; vectors are split and reassembled below.
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, SrcVals, SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  Sqrt.eraseFromParent();
  return true;
}
2262b1c73532SDimitry Andric
doInitialization(Module & M)226301095a5dSDimitry Andric bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
22647fa27ce4SDimitry Andric Impl.Mod = &M;
22657fa27ce4SDimitry Andric Impl.DL = &Impl.Mod->getDataLayout();
2266b1c73532SDimitry Andric Impl.SqrtF32 = nullptr;
2267b1c73532SDimitry Andric Impl.LdexpF32 = nullptr;
226801095a5dSDimitry Andric return false;
226901095a5dSDimitry Andric }
227001095a5dSDimitry Andric
runOnFunction(Function & F)227101095a5dSDimitry Andric bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2272b5630dbaSDimitry Andric if (skipFunction(F))
227301095a5dSDimitry Andric return false;
227401095a5dSDimitry Andric
2275b5630dbaSDimitry Andric auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2276b5630dbaSDimitry Andric if (!TPC)
2277b5630dbaSDimitry Andric return false;
2278b5630dbaSDimitry Andric
2279eb11fae6SDimitry Andric const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2280ac9a064cSDimitry Andric Impl.TM = &TM;
22817fa27ce4SDimitry Andric Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
22827fa27ce4SDimitry Andric Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
22837fa27ce4SDimitry Andric Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
22847fa27ce4SDimitry Andric Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2285cfca06d7SDimitry Andric auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
22867fa27ce4SDimitry Andric Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
22877fa27ce4SDimitry Andric Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2288312c0ed1SDimitry Andric SIModeRegisterDefaults Mode(F, *Impl.ST);
22897fa27ce4SDimitry Andric Impl.HasFP32DenormalFlush =
22907fa27ce4SDimitry Andric Mode.FP32Denormals == DenormalMode::getPreserveSign();
22917fa27ce4SDimitry Andric return Impl.run(F);
2292a7fe922bSDimitry Andric }
2293a7fe922bSDimitry Andric
run(Function & F,FunctionAnalysisManager & FAM)22947fa27ce4SDimitry Andric PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
22957fa27ce4SDimitry Andric FunctionAnalysisManager &FAM) {
22967fa27ce4SDimitry Andric AMDGPUCodeGenPrepareImpl Impl;
22977fa27ce4SDimitry Andric Impl.Mod = F.getParent();
22987fa27ce4SDimitry Andric Impl.DL = &Impl.Mod->getDataLayout();
2299ac9a064cSDimitry Andric Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
23007fa27ce4SDimitry Andric Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
23017fa27ce4SDimitry Andric Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
23027fa27ce4SDimitry Andric Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
23037fa27ce4SDimitry Andric Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
23047fa27ce4SDimitry Andric Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
23057fa27ce4SDimitry Andric Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2306312c0ed1SDimitry Andric SIModeRegisterDefaults Mode(F, *Impl.ST);
23077fa27ce4SDimitry Andric Impl.HasFP32DenormalFlush =
23087fa27ce4SDimitry Andric Mode.FP32Denormals == DenormalMode::getPreserveSign();
23097fa27ce4SDimitry Andric PreservedAnalyses PA = PreservedAnalyses::none();
23107fa27ce4SDimitry Andric if (!Impl.FlowChanged)
23117fa27ce4SDimitry Andric PA.preserveSet<CFGAnalyses>();
23127fa27ce4SDimitry Andric return Impl.run(F) ? PA : PreservedAnalyses::all();
231301095a5dSDimitry Andric }
231401095a5dSDimitry Andric
2315b5630dbaSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
231601095a5dSDimitry Andric "AMDGPU IR optimizations", false, false)
2317eb11fae6SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
23187fa27ce4SDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
23197fa27ce4SDimitry Andric INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2320b5630dbaSDimitry Andric INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2321b5630dbaSDimitry Andric false, false)
232201095a5dSDimitry Andric
232301095a5dSDimitry Andric char AMDGPUCodeGenPrepare::ID = 0;
232401095a5dSDimitry Andric
createAMDGPUCodeGenPreparePass()2325b5630dbaSDimitry Andric FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2326b5630dbaSDimitry Andric return new AMDGPUCodeGenPrepare();
232701095a5dSDimitry Andric }
2328