1eb11fae6SDimitry Andric //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
2eb11fae6SDimitry Andric //
3e6d15924SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e6d15924SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5e6d15924SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6eb11fae6SDimitry Andric //
7eb11fae6SDimitry Andric //===----------------------------------------------------------------------===//
8eb11fae6SDimitry Andric //
9eb11fae6SDimitry Andric /// \file This pass replaces accesses to kernel arguments with loads from
10eb11fae6SDimitry Andric /// offsets from the kernarg base pointer.
11eb11fae6SDimitry Andric //
12eb11fae6SDimitry Andric //===----------------------------------------------------------------------===//
13eb11fae6SDimitry Andric
14eb11fae6SDimitry Andric #include "AMDGPU.h"
15b60736ecSDimitry Andric #include "GCNSubtarget.h"
16eb11fae6SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
17344a3780SDimitry Andric #include "llvm/IR/IRBuilder.h"
18b1c73532SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
19eb11fae6SDimitry Andric #include "llvm/IR/MDBuilder.h"
20b60736ecSDimitry Andric #include "llvm/Target/TargetMachine.h"
21b1c73532SDimitry Andric
22eb11fae6SDimitry Andric #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
23eb11fae6SDimitry Andric
24eb11fae6SDimitry Andric using namespace llvm;
25eb11fae6SDimitry Andric
26eb11fae6SDimitry Andric namespace {
27eb11fae6SDimitry Andric
28b1c73532SDimitry Andric class PreloadKernelArgInfo {
29b1c73532SDimitry Andric private:
30b1c73532SDimitry Andric Function &F;
31b1c73532SDimitry Andric const GCNSubtarget &ST;
32b1c73532SDimitry Andric unsigned NumFreeUserSGPRs;
33b1c73532SDimitry Andric
34b1c73532SDimitry Andric public:
35b1c73532SDimitry Andric SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
36b1c73532SDimitry Andric
PreloadKernelArgInfo(Function & F,const GCNSubtarget & ST)37b1c73532SDimitry Andric PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
38b1c73532SDimitry Andric setInitialFreeUserSGPRsCount();
39b1c73532SDimitry Andric }
40b1c73532SDimitry Andric
41b1c73532SDimitry Andric // Returns the maximum number of user SGPRs that we have available to preload
42b1c73532SDimitry Andric // arguments.
setInitialFreeUserSGPRsCount()43b1c73532SDimitry Andric void setInitialFreeUserSGPRsCount() {
44b1c73532SDimitry Andric const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
45b1c73532SDimitry Andric GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
46b1c73532SDimitry Andric
47b1c73532SDimitry Andric NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs();
48b1c73532SDimitry Andric }
49b1c73532SDimitry Andric
tryAllocPreloadSGPRs(unsigned AllocSize,uint64_t ArgOffset,uint64_t LastExplicitArgOffset)50b1c73532SDimitry Andric bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
51b1c73532SDimitry Andric uint64_t LastExplicitArgOffset) {
52b1c73532SDimitry Andric // Check if this argument may be loaded into the same register as the
53b1c73532SDimitry Andric // previous argument.
54b1c73532SDimitry Andric if (!isAligned(Align(4), ArgOffset) && AllocSize < 4)
55b1c73532SDimitry Andric return true;
56b1c73532SDimitry Andric
57b1c73532SDimitry Andric // Pad SGPRs for kernarg alignment.
58b1c73532SDimitry Andric unsigned Padding = ArgOffset - LastExplicitArgOffset;
59b1c73532SDimitry Andric unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
60b1c73532SDimitry Andric unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
61b1c73532SDimitry Andric if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
62b1c73532SDimitry Andric return false;
63b1c73532SDimitry Andric
64b1c73532SDimitry Andric NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
65b1c73532SDimitry Andric return true;
66b1c73532SDimitry Andric }
67b1c73532SDimitry Andric };
68b1c73532SDimitry Andric
69eb11fae6SDimitry Andric class AMDGPULowerKernelArguments : public FunctionPass {
70eb11fae6SDimitry Andric public:
71eb11fae6SDimitry Andric static char ID;
72eb11fae6SDimitry Andric
AMDGPULowerKernelArguments()73eb11fae6SDimitry Andric AMDGPULowerKernelArguments() : FunctionPass(ID) {}
74eb11fae6SDimitry Andric
75eb11fae6SDimitry Andric bool runOnFunction(Function &F) override;
76eb11fae6SDimitry Andric
getAnalysisUsage(AnalysisUsage & AU) const77eb11fae6SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override {
78eb11fae6SDimitry Andric AU.addRequired<TargetPassConfig>();
79eb11fae6SDimitry Andric AU.setPreservesAll();
80eb11fae6SDimitry Andric }
81eb11fae6SDimitry Andric };
82eb11fae6SDimitry Andric
83eb11fae6SDimitry Andric } // end anonymous namespace
84eb11fae6SDimitry Andric
85cfca06d7SDimitry Andric // skip allocas
getInsertPt(BasicBlock & BB)86cfca06d7SDimitry Andric static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
87cfca06d7SDimitry Andric BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
88cfca06d7SDimitry Andric for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
89cfca06d7SDimitry Andric AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
90cfca06d7SDimitry Andric
91cfca06d7SDimitry Andric // If this is a dynamic alloca, the value may depend on the loaded kernargs,
92cfca06d7SDimitry Andric // so loads will need to be inserted before it.
93cfca06d7SDimitry Andric if (!AI || !AI->isStaticAlloca())
94cfca06d7SDimitry Andric break;
95cfca06d7SDimitry Andric }
96cfca06d7SDimitry Andric
97cfca06d7SDimitry Andric return InsPt;
98cfca06d7SDimitry Andric }
99cfca06d7SDimitry Andric
lowerKernelArguments(Function & F,const TargetMachine & TM)100b1c73532SDimitry Andric static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
101eb11fae6SDimitry Andric CallingConv::ID CC = F.getCallingConv();
102eb11fae6SDimitry Andric if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
103eb11fae6SDimitry Andric return false;
104eb11fae6SDimitry Andric
105eb11fae6SDimitry Andric const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
106eb11fae6SDimitry Andric LLVMContext &Ctx = F.getParent()->getContext();
107ac9a064cSDimitry Andric const DataLayout &DL = F.getDataLayout();
108eb11fae6SDimitry Andric BasicBlock &EntryBlock = *F.begin();
1094df029ccSDimitry Andric IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));
110eb11fae6SDimitry Andric
1111d5ae102SDimitry Andric const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
1127fa27ce4SDimitry Andric const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
113eb11fae6SDimitry Andric
1141d5ae102SDimitry Andric Align MaxAlign;
115145449b1SDimitry Andric // FIXME: Alignment is broken with explicit arg offset.;
116eb11fae6SDimitry Andric const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
117eb11fae6SDimitry Andric if (TotalKernArgSize == 0)
118eb11fae6SDimitry Andric return false;
119eb11fae6SDimitry Andric
120eb11fae6SDimitry Andric CallInst *KernArgSegment =
121d8e91e46SDimitry Andric Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
122d8e91e46SDimitry Andric nullptr, F.getName() + ".kernarg.segment");
123c0981da4SDimitry Andric KernArgSegment->addRetAttr(Attribute::NonNull);
124c0981da4SDimitry Andric KernArgSegment->addRetAttr(
125eb11fae6SDimitry Andric Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
126eb11fae6SDimitry Andric
127eb11fae6SDimitry Andric uint64_t ExplicitArgOffset = 0;
128b1c73532SDimitry Andric // Preloaded kernel arguments must be sequential.
129b1c73532SDimitry Andric bool InPreloadSequence = true;
130b1c73532SDimitry Andric PreloadKernelArgInfo PreloadInfo(F, ST);
131eb11fae6SDimitry Andric
132eb11fae6SDimitry Andric for (Argument &Arg : F.args()) {
133b60736ecSDimitry Andric const bool IsByRef = Arg.hasByRefAttr();
134b60736ecSDimitry Andric Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
135e3b55780SDimitry Andric MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
136145449b1SDimitry Andric Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
137b60736ecSDimitry Andric
138b60736ecSDimitry Andric uint64_t Size = DL.getTypeSizeInBits(ArgTy);
139b60736ecSDimitry Andric uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
140eb11fae6SDimitry Andric
1411d5ae102SDimitry Andric uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
142b1c73532SDimitry Andric uint64_t LastExplicitArgOffset = ExplicitArgOffset;
1431d5ae102SDimitry Andric ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
144eb11fae6SDimitry Andric
145b1c73532SDimitry Andric // Try to preload this argument into user SGPRs.
146b1c73532SDimitry Andric if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
147b1c73532SDimitry Andric !Arg.getType()->isAggregateType())
148b1c73532SDimitry Andric if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
149b1c73532SDimitry Andric LastExplicitArgOffset))
150b1c73532SDimitry Andric continue;
151b1c73532SDimitry Andric
152b1c73532SDimitry Andric InPreloadSequence = false;
153b1c73532SDimitry Andric
154eb11fae6SDimitry Andric if (Arg.use_empty())
155eb11fae6SDimitry Andric continue;
156eb11fae6SDimitry Andric
157b60736ecSDimitry Andric // If this is byval, the loads are already explicit in the function. We just
158b60736ecSDimitry Andric // need to rewrite the pointer values.
159b60736ecSDimitry Andric if (IsByRef) {
160b60736ecSDimitry Andric Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
161b60736ecSDimitry Andric Builder.getInt8Ty(), KernArgSegment, EltOffset,
162b60736ecSDimitry Andric Arg.getName() + ".byval.kernarg.offset");
163b60736ecSDimitry Andric
1647fa27ce4SDimitry Andric Value *CastOffsetPtr =
1657fa27ce4SDimitry Andric Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
166b60736ecSDimitry Andric Arg.replaceAllUsesWith(CastOffsetPtr);
167b60736ecSDimitry Andric continue;
168b60736ecSDimitry Andric }
169b60736ecSDimitry Andric
170eb11fae6SDimitry Andric if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
171eb11fae6SDimitry Andric // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
172eb11fae6SDimitry Andric // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
173eb11fae6SDimitry Andric // can't represent this with range metadata because it's only allowed for
174eb11fae6SDimitry Andric // integer types.
175e6d15924SDimitry Andric if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
176e6d15924SDimitry Andric PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
177e6d15924SDimitry Andric !ST.hasUsableDSOffset())
178eb11fae6SDimitry Andric continue;
179eb11fae6SDimitry Andric
180eb11fae6SDimitry Andric // FIXME: We can replace this with equivalent alias.scope/noalias
181eb11fae6SDimitry Andric // metadata, but this appears to be a lot of work.
182eb11fae6SDimitry Andric if (Arg.hasNoAliasAttr())
183eb11fae6SDimitry Andric continue;
184eb11fae6SDimitry Andric }
185eb11fae6SDimitry Andric
186cfca06d7SDimitry Andric auto *VT = dyn_cast<FixedVectorType>(ArgTy);
187eb11fae6SDimitry Andric bool IsV3 = VT && VT->getNumElements() == 3;
188d8e91e46SDimitry Andric bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
189d8e91e46SDimitry Andric
190eb11fae6SDimitry Andric VectorType *V4Ty = nullptr;
191eb11fae6SDimitry Andric
192eb11fae6SDimitry Andric int64_t AlignDownOffset = alignDown(EltOffset, 4);
193eb11fae6SDimitry Andric int64_t OffsetDiff = EltOffset - AlignDownOffset;
1941d5ae102SDimitry Andric Align AdjustedAlign = commonAlignment(
1951d5ae102SDimitry Andric KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
196eb11fae6SDimitry Andric
197eb11fae6SDimitry Andric Value *ArgPtr;
198e6d15924SDimitry Andric Type *AdjustedArgTy;
199d8e91e46SDimitry Andric if (DoShiftOpt) { // FIXME: Handle aggregate types
200eb11fae6SDimitry Andric // Since we don't have sub-dword scalar loads, avoid doing an extload by
201eb11fae6SDimitry Andric // loading earlier than the argument address, and extracting the relevant
202eb11fae6SDimitry Andric // bits.
2034df029ccSDimitry Andric // TODO: Update this for GFX12 which does have scalar sub-dword loads.
204eb11fae6SDimitry Andric //
205eb11fae6SDimitry Andric // Additionally widen any sub-dword load to i32 even if suitably aligned,
206eb11fae6SDimitry Andric // so that CSE between different argument loads works easily.
207eb11fae6SDimitry Andric ArgPtr = Builder.CreateConstInBoundsGEP1_64(
208e6d15924SDimitry Andric Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
209eb11fae6SDimitry Andric Arg.getName() + ".kernarg.offset.align.down");
210e6d15924SDimitry Andric AdjustedArgTy = Builder.getInt32Ty();
211eb11fae6SDimitry Andric } else {
212eb11fae6SDimitry Andric ArgPtr = Builder.CreateConstInBoundsGEP1_64(
213e6d15924SDimitry Andric Builder.getInt8Ty(), KernArgSegment, EltOffset,
214eb11fae6SDimitry Andric Arg.getName() + ".kernarg.offset");
215e6d15924SDimitry Andric AdjustedArgTy = ArgTy;
216eb11fae6SDimitry Andric }
217eb11fae6SDimitry Andric
218eb11fae6SDimitry Andric if (IsV3 && Size >= 32) {
219cfca06d7SDimitry Andric V4Ty = FixedVectorType::get(VT->getElementType(), 4);
220eb11fae6SDimitry Andric // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
221e6d15924SDimitry Andric AdjustedArgTy = V4Ty;
222eb11fae6SDimitry Andric }
223eb11fae6SDimitry Andric
224e6d15924SDimitry Andric LoadInst *Load =
225cfca06d7SDimitry Andric Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
226eb11fae6SDimitry Andric Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
227eb11fae6SDimitry Andric
228eb11fae6SDimitry Andric MDBuilder MDB(Ctx);
229eb11fae6SDimitry Andric
230eb11fae6SDimitry Andric if (isa<PointerType>(ArgTy)) {
231eb11fae6SDimitry Andric if (Arg.hasNonNullAttr())
232eb11fae6SDimitry Andric Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
233eb11fae6SDimitry Andric
234eb11fae6SDimitry Andric uint64_t DerefBytes = Arg.getDereferenceableBytes();
235eb11fae6SDimitry Andric if (DerefBytes != 0) {
236eb11fae6SDimitry Andric Load->setMetadata(
237eb11fae6SDimitry Andric LLVMContext::MD_dereferenceable,
238eb11fae6SDimitry Andric MDNode::get(Ctx,
239eb11fae6SDimitry Andric MDB.createConstant(
240eb11fae6SDimitry Andric ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
241eb11fae6SDimitry Andric }
242eb11fae6SDimitry Andric
243eb11fae6SDimitry Andric uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
244eb11fae6SDimitry Andric if (DerefOrNullBytes != 0) {
245eb11fae6SDimitry Andric Load->setMetadata(
246eb11fae6SDimitry Andric LLVMContext::MD_dereferenceable_or_null,
247eb11fae6SDimitry Andric MDNode::get(Ctx,
248eb11fae6SDimitry Andric MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
249eb11fae6SDimitry Andric DerefOrNullBytes))));
250eb11fae6SDimitry Andric }
251eb11fae6SDimitry Andric
252e3b55780SDimitry Andric if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
253eb11fae6SDimitry Andric Load->setMetadata(
254eb11fae6SDimitry Andric LLVMContext::MD_align,
255e3b55780SDimitry Andric MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
256e3b55780SDimitry Andric Builder.getInt64Ty(), ParamAlign->value()))));
257eb11fae6SDimitry Andric }
258eb11fae6SDimitry Andric }
259eb11fae6SDimitry Andric
260eb11fae6SDimitry Andric // TODO: Convert noalias arg to !noalias
261eb11fae6SDimitry Andric
262d8e91e46SDimitry Andric if (DoShiftOpt) {
263eb11fae6SDimitry Andric Value *ExtractBits = OffsetDiff == 0 ?
264eb11fae6SDimitry Andric Load : Builder.CreateLShr(Load, OffsetDiff * 8);
265eb11fae6SDimitry Andric
266eb11fae6SDimitry Andric IntegerType *ArgIntTy = Builder.getIntNTy(Size);
267eb11fae6SDimitry Andric Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
268eb11fae6SDimitry Andric Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
269eb11fae6SDimitry Andric Arg.getName() + ".load");
270eb11fae6SDimitry Andric Arg.replaceAllUsesWith(NewVal);
271eb11fae6SDimitry Andric } else if (IsV3) {
272b60736ecSDimitry Andric Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
273eb11fae6SDimitry Andric Arg.getName() + ".load");
274eb11fae6SDimitry Andric Arg.replaceAllUsesWith(Shuf);
275eb11fae6SDimitry Andric } else {
276eb11fae6SDimitry Andric Load->setName(Arg.getName() + ".load");
277eb11fae6SDimitry Andric Arg.replaceAllUsesWith(Load);
278eb11fae6SDimitry Andric }
279eb11fae6SDimitry Andric }
280eb11fae6SDimitry Andric
281c0981da4SDimitry Andric KernArgSegment->addRetAttr(
282eb11fae6SDimitry Andric Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
283eb11fae6SDimitry Andric
284eb11fae6SDimitry Andric return true;
285eb11fae6SDimitry Andric }
286eb11fae6SDimitry Andric
runOnFunction(Function & F)287b1c73532SDimitry Andric bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
288b1c73532SDimitry Andric auto &TPC = getAnalysis<TargetPassConfig>();
289b1c73532SDimitry Andric const TargetMachine &TM = TPC.getTM<TargetMachine>();
290b1c73532SDimitry Andric return lowerKernelArguments(F, TM);
291b1c73532SDimitry Andric }
292b1c73532SDimitry Andric
293eb11fae6SDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
294eb11fae6SDimitry Andric "AMDGPU Lower Kernel Arguments", false, false)
295eb11fae6SDimitry Andric INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
296eb11fae6SDimitry Andric false, false)
297eb11fae6SDimitry Andric
298eb11fae6SDimitry Andric char AMDGPULowerKernelArguments::ID = 0;
299eb11fae6SDimitry Andric
createAMDGPULowerKernelArgumentsPass()300eb11fae6SDimitry Andric FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
301eb11fae6SDimitry Andric return new AMDGPULowerKernelArguments();
302eb11fae6SDimitry Andric }
303b1c73532SDimitry Andric
304b1c73532SDimitry Andric PreservedAnalyses
run(Function & F,FunctionAnalysisManager & AM)305b1c73532SDimitry Andric AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
306b1c73532SDimitry Andric bool Changed = lowerKernelArguments(F, TM);
307b1c73532SDimitry Andric if (Changed) {
308b1c73532SDimitry Andric // TODO: Preserves a lot more.
309b1c73532SDimitry Andric PreservedAnalyses PA;
310b1c73532SDimitry Andric PA.preserveSet<CFGAnalyses>();
311b1c73532SDimitry Andric return PA;
312b1c73532SDimitry Andric }
313b1c73532SDimitry Andric
314b1c73532SDimitry Andric return PreservedAnalyses::all();
315b1c73532SDimitry Andric }
316