1044eb2f6SDimitry Andric //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
267c32a98SDimitry Andric //
3e6d15924SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e6d15924SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5e6d15924SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
667c32a98SDimitry Andric //
767c32a98SDimitry Andric //===----------------------------------------------------------------------===//
867c32a98SDimitry Andric //
967c32a98SDimitry Andric // This pass tries to fuse DS instructions with close by immediate offsets.
1067c32a98SDimitry Andric // This will fuse operations such as
1167c32a98SDimitry Andric // ds_read_b32 v0, v2 offset:16
1267c32a98SDimitry Andric // ds_read_b32 v1, v2 offset:32
1367c32a98SDimitry Andric // ==>
1467c32a98SDimitry Andric // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
1567c32a98SDimitry Andric //
16044eb2f6SDimitry Andric // The same is done for certain SMEM and VMEM opcodes, e.g.:
17044eb2f6SDimitry Andric // s_buffer_load_dword s4, s[0:3], 4
18044eb2f6SDimitry Andric // s_buffer_load_dword s5, s[0:3], 8
19044eb2f6SDimitry Andric // ==>
20044eb2f6SDimitry Andric // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21044eb2f6SDimitry Andric //
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset, and then promotes that
// 13-bit offset to the immediate.
26d8e91e46SDimitry Andric // E.g.
27d8e91e46SDimitry Andric // s_movk_i32 s0, 0x1800
28d8e91e46SDimitry Andric // v_add_co_u32_e32 v0, vcc, s0, v2
29d8e91e46SDimitry Andric // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30d8e91e46SDimitry Andric //
31d8e91e46SDimitry Andric // s_movk_i32 s0, 0x1000
32d8e91e46SDimitry Andric // v_add_co_u32_e32 v5, vcc, s0, v2
33d8e91e46SDimitry Andric // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34d8e91e46SDimitry Andric // global_load_dwordx2 v[5:6], v[5:6], off
35d8e91e46SDimitry Andric // global_load_dwordx2 v[0:1], v[0:1], off
36d8e91e46SDimitry Andric // =>
37d8e91e46SDimitry Andric // s_movk_i32 s0, 0x1000
38d8e91e46SDimitry Andric // v_add_co_u32_e32 v5, vcc, s0, v2
39d8e91e46SDimitry Andric // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40d8e91e46SDimitry Andric // global_load_dwordx2 v[5:6], v[5:6], off
41d8e91e46SDimitry Andric // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
4267c32a98SDimitry Andric //
4367c32a98SDimitry Andric // Future improvements:
4467c32a98SDimitry Andric //
451d5ae102SDimitry Andric // - This is currently missing stores of constants because loading
4667c32a98SDimitry Andric // the constant into the data register is placed between the stores, although
4767c32a98SDimitry Andric // this is arguably a scheduling problem.
4867c32a98SDimitry Andric //
4967c32a98SDimitry Andric // - Live interval recomputing seems inefficient. This currently only matches
5067c32a98SDimitry Andric // one pair, and recomputes live intervals and moves on to the next pair. It
5101095a5dSDimitry Andric // would be better to compute a list of all merges that need to occur.
5267c32a98SDimitry Andric //
5367c32a98SDimitry Andric // - With a list of instructions to process, we can also merge more. If a
5467c32a98SDimitry Andric // cluster of loads have offsets that are too large to fit in the 8-bit
5567c32a98SDimitry Andric // offsets, but are close enough to fit in the 8 bits, we can add to the base
5667c32a98SDimitry Andric // pointer and use the new reduced offsets.
5767c32a98SDimitry Andric //
5867c32a98SDimitry Andric //===----------------------------------------------------------------------===//
5967c32a98SDimitry Andric
6067c32a98SDimitry Andric #include "AMDGPU.h"
61b60736ecSDimitry Andric #include "GCNSubtarget.h"
62d8e91e46SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
6371d5a254SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
6467c32a98SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
65706b4fc4SDimitry Andric #include "llvm/InitializePasses.h"
6667c32a98SDimitry Andric
6767c32a98SDimitry Andric using namespace llvm;
6867c32a98SDimitry Andric
6967c32a98SDimitry Andric #define DEBUG_TYPE "si-load-store-opt"
7067c32a98SDimitry Andric
7167c32a98SDimitry Andric namespace {
// Classes of memory instructions this pass knows how to merge. An
// instruction's class is computed by getInstClass below.
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};
9271d5a254SDimitry Andric
// Describes which operands of an instruction form its address: a count of
// vector address registers plus one flag per named address operand that may
// be present on the instruction.
struct AddressRegs {
  unsigned char NumVAddrs = 0; // Number of vaddr operands (MIMG may use many).
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;
106cfca06d7SDimitry Andric
class SILoadStoreOptimizer : public MachineFunctionPass {
  // Everything the pass records about one memory instruction that is a
  // candidate for merging with a neighboring access.
  struct CombineInfo {
    MachineBasicBlock::iterator I; // The candidate instruction.
    unsigned EltSize;              // Element size; selects e.g. the B32 vs B64
                                   // read2/write2 opcode variants.
    unsigned Offset;               // Immediate offset of the access.
    unsigned Width;                // Width of the access, in elements.
    unsigned Format;               // tbuffer format immediate (MTBUF only).
    unsigned BaseOff;              // Base the offset is taken relative to --
                                   // NOTE(review): set in setMI; confirm there.
    unsigned DMask;                // Image dmask immediate (MIMG only).
    InstClassEnum InstClass;       // Classification from getInstClass.
    unsigned CPol = 0;             // Cache-policy operand bits.
    bool IsAGPR;                   // Whether the data operand uses AGPRs.
    bool UseST64;                  // Use the read2st64/write2st64 DS forms.
    int AddrIdx[MaxAddressRegs];   // Operand indices of the address operands.
    const MachineOperand *AddrReg[MaxAddressRegs]; // The address operands.
    unsigned NumAddresses;         // Number of valid entries in the arrays.
    unsigned Order;                // Relative order among collected candidates.

    // Returns true if \p CI addresses memory through the same base as this
    // candidate: same number of address operands, and each corresponding pair
    // is either an identical immediate or the same register and subregister.
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    // Returns true if every address operand could plausibly be shared with
    // another instruction: immediates are always fine, while registers must
    // be virtual (or SGPR_NULL) and have more than one non-debug use.
    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical reg
        // addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    // Populates the fields above from \p MI; defined out of line.
    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    // MIMG candidates sort by dmask; everything else sorts by offset.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  // The two 32-bit halves (with subregister indices) of a 64-bit base
  // address register pair.
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  // A memory address expressed as a base register pair plus a constant
  // byte offset.
  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain; // Requests another round over a merge list (see
                      // optimizeInstsWithSameBaseAddr's OptimizeListAgain).

  // Returns true if instruction \p A can be moved past \p B, given A's
  // register defs and uses (precomputed by the caller).
  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  // Legality checks for a pair: compatible image dmasks, offsets that can be
  // encoded in a single merged instruction, and a combined width that fits.
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  // Opcode, subregister-index, and register-class selection for the merged
  // instruction.
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  // Checks whether CI and Paired may legally be merged and prepares them for
  // merging; see the definition for the exact contract of the return value.
  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  // For loads: copy the merged result into the two original destination
  // registers. For stores: build the merged source register from the two
  // original data operands.
  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  // Opcode selection for DS read2/write2 and their ST64 variants.
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  // Each merge*Pair routine rewrites CI and Paired into a single wider
  // access emitted at InsertBefore and returns an iterator into the block.
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  // Helpers for promoteConstantOffsetToImm: rewrite an instruction's base
  // register and immediate offset in place, materialize a new 64-bit base,
  // and decompose an address computation into base + constant offset.
  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  // Appends CI to the list of candidate lists it belongs with.
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;

  // Scans [Begin, End), grouping mergeable instructions into MergeableInsts.
  // Returns where the scan stopped and whether the block was modified; see
  // the definition for the exact contract.
  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  // Builds a machine memory operand covering both (known-adjacent) accesses.
  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  // Common class for a pair whose classes may differ (e.g. FLAT vs GLOBAL);
  // this is the only producer of the GLOBAL_LOAD/GLOBAL_STORE enumerators.
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  // Merging drivers: process one list of same-base-address candidates, or
  // all collected lists for a block.
  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
      .set(MachineFunctionProperties::Property::IsSSA);
  }
};
32567c32a98SDimitry Andric
// Returns the width of the access performed by \p MI in elements (DWORDXn
// opcodes count as n, DS B32/B64 as 1/2, images as the number of enabled
// dmask channels), or 0 for opcodes this pass does not handle.
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    // One element per bit set in the dmask.
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}
4051d5ae102SDimitry Andric
/// Maps instruction opcode to enum InstClassEnum. Returns UNKNOWN for
/// opcodes (or image variants) the pass cannot merge. Note that non-SADDR
/// GLOBAL accesses are classified together with FLAT ones (FLAT_LOAD /
/// FLAT_STORE); the GLOBAL_LOAD/GLOBAL_STORE enumerators are reserved for
/// getCommonInstClass.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      // Only single-dword buffer loads/stores are merge candidates; classify
      // by the addressing-mode-independent base opcode.
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      // Only plain (non-gather4) image loads are merge candidates.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      // Only single-element (FORMAT_X) tbuffer accesses are candidates.
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  // Non-SADDR GLOBAL and FLAT accesses share a class so they can be merged
  // with each other.
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}
5591d5ae102SDimitry Andric
5601d5ae102SDimitry Andric /// Determines instruction subclass from opcode. Only instructions
561145449b1SDimitry Andric /// of the same subclass can be merged together. The merged instruction may have
562145449b1SDimitry Andric /// a different subclass but must have the same class.
getInstSubclass(unsigned Opc,const SIInstrInfo & TII)5631d5ae102SDimitry Andric static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
5641d5ae102SDimitry Andric switch (Opc) {
5651d5ae102SDimitry Andric default:
5661d5ae102SDimitry Andric if (TII.isMUBUF(Opc))
5671d5ae102SDimitry Andric return AMDGPU::getMUBUFBaseOpcode(Opc);
568312c0ed1SDimitry Andric if (TII.isImage(Opc)) {
5691d5ae102SDimitry Andric const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
5701d5ae102SDimitry Andric assert(Info);
5711d5ae102SDimitry Andric return Info->BaseOpcode;
5721d5ae102SDimitry Andric }
573706b4fc4SDimitry Andric if (TII.isMTBUF(Opc))
574706b4fc4SDimitry Andric return AMDGPU::getMTBUFBaseOpcode(Opc);
5751d5ae102SDimitry Andric return -1;
5761d5ae102SDimitry Andric case AMDGPU::DS_READ_B32:
5771d5ae102SDimitry Andric case AMDGPU::DS_READ_B32_gfx9:
5781d5ae102SDimitry Andric case AMDGPU::DS_READ_B64:
5791d5ae102SDimitry Andric case AMDGPU::DS_READ_B64_gfx9:
5801d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B32:
5811d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B32_gfx9:
5821d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B64:
5831d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B64_gfx9:
5841d5ae102SDimitry Andric return Opc;
5851d5ae102SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
5861d5ae102SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
587312c0ed1SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
5881d5ae102SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589c0981da4SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
5901d5ae102SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593312c0ed1SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
596e3b55780SDimitry Andric return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORD_IMM:
598e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM:
599312c0ed1SDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM:
600e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM:
601e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM:
602ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
606e3b55780SDimitry Andric return AMDGPU::S_LOAD_DWORD_IMM;
607145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD:
608145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2:
609145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3:
610145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4:
611145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORD:
612145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX2:
613145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX3:
614145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX4:
615145449b1SDimitry Andric return AMDGPU::FLAT_LOAD_DWORD;
616145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
617145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
618145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
619145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
620145449b1SDimitry Andric return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
621145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD:
622145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2:
623145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3:
624145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4:
625145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORD:
626145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX2:
627145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX3:
628145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX4:
629145449b1SDimitry Andric return AMDGPU::FLAT_STORE_DWORD;
630145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
631145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
632145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
633145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
634145449b1SDimitry Andric return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
6351d5ae102SDimitry Andric }
6361d5ae102SDimitry Andric }
6371d5ae102SDimitry Andric
638145449b1SDimitry Andric // GLOBAL loads and stores are classified as FLAT initially. If both combined
639145449b1SDimitry Andric // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
640145449b1SDimitry Andric // If either or both instructions are non segment specific FLAT the resulting
641145449b1SDimitry Andric // combined operation will be FLAT, potentially promoting one of the GLOBAL
642145449b1SDimitry Andric // operations to FLAT.
643145449b1SDimitry Andric // For other instructions return the original unmodified class.
644145449b1SDimitry Andric InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)645145449b1SDimitry Andric SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
646145449b1SDimitry Andric const CombineInfo &Paired) {
647145449b1SDimitry Andric assert(CI.InstClass == Paired.InstClass);
648145449b1SDimitry Andric
649145449b1SDimitry Andric if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
650145449b1SDimitry Andric SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
651145449b1SDimitry Andric return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
652145449b1SDimitry Andric
653145449b1SDimitry Andric return CI.InstClass;
654145449b1SDimitry Andric }
655145449b1SDimitry Andric
getRegs(unsigned Opc,const SIInstrInfo & TII)656cfca06d7SDimitry Andric static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
657cfca06d7SDimitry Andric AddressRegs Result;
658cfca06d7SDimitry Andric
6591d5ae102SDimitry Andric if (TII.isMUBUF(Opc)) {
660cfca06d7SDimitry Andric if (AMDGPU::getMUBUFHasVAddr(Opc))
661cfca06d7SDimitry Andric Result.VAddr = true;
662cfca06d7SDimitry Andric if (AMDGPU::getMUBUFHasSrsrc(Opc))
663cfca06d7SDimitry Andric Result.SRsrc = true;
664cfca06d7SDimitry Andric if (AMDGPU::getMUBUFHasSoffset(Opc))
665cfca06d7SDimitry Andric Result.SOffset = true;
6661d5ae102SDimitry Andric
667cfca06d7SDimitry Andric return Result;
6681d5ae102SDimitry Andric }
6691d5ae102SDimitry Andric
670312c0ed1SDimitry Andric if (TII.isImage(Opc)) {
671cfca06d7SDimitry Andric int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
672cfca06d7SDimitry Andric if (VAddr0Idx >= 0) {
673312c0ed1SDimitry Andric int RsrcName =
674312c0ed1SDimitry Andric TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
675312c0ed1SDimitry Andric int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
676312c0ed1SDimitry Andric Result.NumVAddrs = RsrcIdx - VAddr0Idx;
677cfca06d7SDimitry Andric } else {
678cfca06d7SDimitry Andric Result.VAddr = true;
679cfca06d7SDimitry Andric }
680cfca06d7SDimitry Andric Result.SRsrc = true;
6811d5ae102SDimitry Andric const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
6821d5ae102SDimitry Andric if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
683cfca06d7SDimitry Andric Result.SSamp = true;
684706b4fc4SDimitry Andric
685cfca06d7SDimitry Andric return Result;
686706b4fc4SDimitry Andric }
687706b4fc4SDimitry Andric if (TII.isMTBUF(Opc)) {
688cfca06d7SDimitry Andric if (AMDGPU::getMTBUFHasVAddr(Opc))
689cfca06d7SDimitry Andric Result.VAddr = true;
690cfca06d7SDimitry Andric if (AMDGPU::getMTBUFHasSrsrc(Opc))
691cfca06d7SDimitry Andric Result.SRsrc = true;
692cfca06d7SDimitry Andric if (AMDGPU::getMTBUFHasSoffset(Opc))
693cfca06d7SDimitry Andric Result.SOffset = true;
694706b4fc4SDimitry Andric
695cfca06d7SDimitry Andric return Result;
6961d5ae102SDimitry Andric }
6971d5ae102SDimitry Andric
6981d5ae102SDimitry Andric switch (Opc) {
6991d5ae102SDimitry Andric default:
700cfca06d7SDimitry Andric return Result;
701e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
702e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
703312c0ed1SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705e3b55780SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
706e3b55780SDimitry Andric Result.SOffset = true;
707e3b55780SDimitry Andric [[fallthrough]];
7081d5ae102SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
7091d5ae102SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710312c0ed1SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
7111d5ae102SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712c0981da4SDimitry Andric case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
713e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORD_IMM:
714e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM:
715312c0ed1SDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM:
716e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM:
717e3b55780SDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM:
718ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721ac9a064cSDimitry Andric case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
722cfca06d7SDimitry Andric Result.SBase = true;
723cfca06d7SDimitry Andric return Result;
7241d5ae102SDimitry Andric case AMDGPU::DS_READ_B32:
7251d5ae102SDimitry Andric case AMDGPU::DS_READ_B64:
7261d5ae102SDimitry Andric case AMDGPU::DS_READ_B32_gfx9:
7271d5ae102SDimitry Andric case AMDGPU::DS_READ_B64_gfx9:
7281d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B32:
7291d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B64:
7301d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B32_gfx9:
7311d5ae102SDimitry Andric case AMDGPU::DS_WRITE_B64_gfx9:
732cfca06d7SDimitry Andric Result.Addr = true;
733cfca06d7SDimitry Andric return Result;
734145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
735145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
736145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
737145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
738145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
739145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
740145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
741145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
742145449b1SDimitry Andric Result.SAddr = true;
743e3b55780SDimitry Andric [[fallthrough]];
744145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORD:
745145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX2:
746145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX3:
747145449b1SDimitry Andric case AMDGPU::GLOBAL_LOAD_DWORDX4:
748145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORD:
749145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX2:
750145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX3:
751145449b1SDimitry Andric case AMDGPU::GLOBAL_STORE_DWORDX4:
752145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORD:
753145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX2:
754145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX3:
755145449b1SDimitry Andric case AMDGPU::FLAT_LOAD_DWORDX4:
756145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORD:
757145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX2:
758145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX3:
759145449b1SDimitry Andric case AMDGPU::FLAT_STORE_DWORDX4:
760145449b1SDimitry Andric Result.VAddr = true;
761145449b1SDimitry Andric return Result;
7621d5ae102SDimitry Andric }
7631d5ae102SDimitry Andric }
7641d5ae102SDimitry Andric
setMI(MachineBasicBlock::iterator MI,const SILoadStoreOptimizer & LSO)7651d5ae102SDimitry Andric void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
7666f8fc217SDimitry Andric const SILoadStoreOptimizer &LSO) {
7671d5ae102SDimitry Andric I = MI;
7681d5ae102SDimitry Andric unsigned Opc = MI->getOpcode();
7696f8fc217SDimitry Andric InstClass = getInstClass(Opc, *LSO.TII);
7701d5ae102SDimitry Andric
7711d5ae102SDimitry Andric if (InstClass == UNKNOWN)
7721d5ae102SDimitry Andric return;
7731d5ae102SDimitry Andric
7746f8fc217SDimitry Andric IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
7756f8fc217SDimitry Andric
7761d5ae102SDimitry Andric switch (InstClass) {
7771d5ae102SDimitry Andric case DS_READ:
7781d5ae102SDimitry Andric EltSize =
7791d5ae102SDimitry Andric (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
7801d5ae102SDimitry Andric : 4;
7811d5ae102SDimitry Andric break;
7821d5ae102SDimitry Andric case DS_WRITE:
7831d5ae102SDimitry Andric EltSize =
7841d5ae102SDimitry Andric (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
7851d5ae102SDimitry Andric : 4;
7861d5ae102SDimitry Andric break;
7871d5ae102SDimitry Andric case S_BUFFER_LOAD_IMM:
788e3b55780SDimitry Andric case S_BUFFER_LOAD_SGPR_IMM:
789e3b55780SDimitry Andric case S_LOAD_IMM:
7906f8fc217SDimitry Andric EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
7911d5ae102SDimitry Andric break;
7921d5ae102SDimitry Andric default:
7931d5ae102SDimitry Andric EltSize = 4;
7941d5ae102SDimitry Andric break;
7951d5ae102SDimitry Andric }
7961d5ae102SDimitry Andric
7971d5ae102SDimitry Andric if (InstClass == MIMG) {
7986f8fc217SDimitry Andric DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
799cfca06d7SDimitry Andric // Offset is not considered for MIMG instructions.
800cfca06d7SDimitry Andric Offset = 0;
8011d5ae102SDimitry Andric } else {
8021d5ae102SDimitry Andric int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
8037fa27ce4SDimitry Andric Offset = I->getOperand(OffsetIdx).getImm();
8041d5ae102SDimitry Andric }
8051d5ae102SDimitry Andric
806706b4fc4SDimitry Andric if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
8076f8fc217SDimitry Andric Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
808706b4fc4SDimitry Andric
8096f8fc217SDimitry Andric Width = getOpcodeWidth(*I, *LSO.TII);
8101d5ae102SDimitry Andric
8111d5ae102SDimitry Andric if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
812706b4fc4SDimitry Andric Offset &= 0xffff;
8131d5ae102SDimitry Andric } else if (InstClass != MIMG) {
8146f8fc217SDimitry Andric CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
8151d5ae102SDimitry Andric }
8161d5ae102SDimitry Andric
8176f8fc217SDimitry Andric AddressRegs Regs = getRegs(Opc, *LSO.TII);
818312c0ed1SDimitry Andric bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
819cfca06d7SDimitry Andric
8201d5ae102SDimitry Andric NumAddresses = 0;
821cfca06d7SDimitry Andric for (unsigned J = 0; J < Regs.NumVAddrs; J++)
822cfca06d7SDimitry Andric AddrIdx[NumAddresses++] =
823cfca06d7SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
824cfca06d7SDimitry Andric if (Regs.Addr)
825cfca06d7SDimitry Andric AddrIdx[NumAddresses++] =
826cfca06d7SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
827cfca06d7SDimitry Andric if (Regs.SBase)
828cfca06d7SDimitry Andric AddrIdx[NumAddresses++] =
829cfca06d7SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
830cfca06d7SDimitry Andric if (Regs.SRsrc)
831312c0ed1SDimitry Andric AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
832312c0ed1SDimitry Andric Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
833cfca06d7SDimitry Andric if (Regs.SOffset)
834cfca06d7SDimitry Andric AddrIdx[NumAddresses++] =
835cfca06d7SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
836145449b1SDimitry Andric if (Regs.SAddr)
837145449b1SDimitry Andric AddrIdx[NumAddresses++] =
838145449b1SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
839cfca06d7SDimitry Andric if (Regs.VAddr)
840cfca06d7SDimitry Andric AddrIdx[NumAddresses++] =
841cfca06d7SDimitry Andric AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
842cfca06d7SDimitry Andric if (Regs.SSamp)
843312c0ed1SDimitry Andric AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
844312c0ed1SDimitry Andric Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
845cfca06d7SDimitry Andric assert(NumAddresses <= MaxAddressRegs);
8461d5ae102SDimitry Andric
847cfca06d7SDimitry Andric for (unsigned J = 0; J < NumAddresses; J++)
848cfca06d7SDimitry Andric AddrReg[J] = &I->getOperand(AddrIdx[J]);
8491d5ae102SDimitry Andric }
8501d5ae102SDimitry Andric
85171d5a254SDimitry Andric } // end anonymous namespace.
85267c32a98SDimitry Andric
85367c32a98SDimitry Andric INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
854eb11fae6SDimitry Andric "SI Load Store Optimizer", false, false)
855b915e9e0SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
856d8e91e46SDimitry Andric INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
857d8e91e46SDimitry Andric false, false)
85867c32a98SDimitry Andric
85967c32a98SDimitry Andric char SILoadStoreOptimizer::ID = 0;
86067c32a98SDimitry Andric
86167c32a98SDimitry Andric char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
86267c32a98SDimitry Andric
createSILoadStoreOptimizerPass()863b5630dbaSDimitry Andric FunctionPass *llvm::createSILoadStoreOptimizerPass() {
864b5630dbaSDimitry Andric return new SILoadStoreOptimizer();
86567c32a98SDimitry Andric }
86667c32a98SDimitry Andric
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)867eb11fae6SDimitry Andric static void addDefsUsesToList(const MachineInstr &MI,
868cfca06d7SDimitry Andric DenseSet<Register> &RegDefs,
869145449b1SDimitry Andric DenseSet<Register> &RegUses) {
870145449b1SDimitry Andric for (const auto &Op : MI.operands()) {
871145449b1SDimitry Andric if (!Op.isReg())
872145449b1SDimitry Andric continue;
873eb11fae6SDimitry Andric if (Op.isDef())
874eb11fae6SDimitry Andric RegDefs.insert(Op.getReg());
875145449b1SDimitry Andric if (Op.readsReg())
876145449b1SDimitry Andric RegUses.insert(Op.getReg());
877b915e9e0SDimitry Andric }
878b915e9e0SDimitry Andric }
879b915e9e0SDimitry Andric
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const880145449b1SDimitry Andric bool SILoadStoreOptimizer::canSwapInstructions(
881145449b1SDimitry Andric const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
882145449b1SDimitry Andric const MachineInstr &A, const MachineInstr &B) const {
883145449b1SDimitry Andric if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
884145449b1SDimitry Andric (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
885b915e9e0SDimitry Andric return false;
886145449b1SDimitry Andric for (const auto &BOp : B.operands()) {
887145449b1SDimitry Andric if (!BOp.isReg())
888b915e9e0SDimitry Andric continue;
889145449b1SDimitry Andric if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
890145449b1SDimitry Andric return false;
891145449b1SDimitry Andric if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
892b915e9e0SDimitry Andric return false;
893b915e9e0SDimitry Andric }
894b915e9e0SDimitry Andric return true;
895b915e9e0SDimitry Andric }
896b915e9e0SDimitry Andric
897145449b1SDimitry Andric // Given that \p CI and \p Paired are adjacent memory operations produce a new
898145449b1SDimitry Andric // MMO for the combined operation with a new access size.
899145449b1SDimitry Andric MachineMemOperand *
combineKnownAdjacentMMOs(const CombineInfo & CI,const CombineInfo & Paired)900145449b1SDimitry Andric SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
901145449b1SDimitry Andric const CombineInfo &Paired) {
902145449b1SDimitry Andric const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
903145449b1SDimitry Andric const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
904145449b1SDimitry Andric
905ac9a064cSDimitry Andric unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
906145449b1SDimitry Andric
907145449b1SDimitry Andric // A base pointer for the combined operation is the same as the leading
908145449b1SDimitry Andric // operation's pointer.
909145449b1SDimitry Andric if (Paired < CI)
910145449b1SDimitry Andric std::swap(MMOa, MMOb);
911145449b1SDimitry Andric
912145449b1SDimitry Andric MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
913145449b1SDimitry Andric // If merging FLAT and GLOBAL set address space to FLAT.
914145449b1SDimitry Andric if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
915145449b1SDimitry Andric PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
916145449b1SDimitry Andric
917145449b1SDimitry Andric MachineFunction *MF = CI.I->getMF();
918145449b1SDimitry Andric return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
9191d5ae102SDimitry Andric }
9201d5ae102SDimitry Andric
dmasksCanBeCombined(const CombineInfo & CI,const SIInstrInfo & TII,const CombineInfo & Paired)921706b4fc4SDimitry Andric bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
922706b4fc4SDimitry Andric const SIInstrInfo &TII,
923706b4fc4SDimitry Andric const CombineInfo &Paired) {
9241d5ae102SDimitry Andric assert(CI.InstClass == MIMG);
9251d5ae102SDimitry Andric
9261d5ae102SDimitry Andric // Ignore instructions with tfe/lwe set.
9271d5ae102SDimitry Andric const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
9281d5ae102SDimitry Andric const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
9291d5ae102SDimitry Andric
9301d5ae102SDimitry Andric if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
9311d5ae102SDimitry Andric return false;
9321d5ae102SDimitry Andric
9331d5ae102SDimitry Andric // Check other optional immediate operands for equality.
934344a3780SDimitry Andric unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
935344a3780SDimitry Andric AMDGPU::OpName::unorm, AMDGPU::OpName::da,
936344a3780SDimitry Andric AMDGPU::OpName::r128, AMDGPU::OpName::a16};
9371d5ae102SDimitry Andric
9381d5ae102SDimitry Andric for (auto op : OperandsToMatch) {
9391d5ae102SDimitry Andric int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
940706b4fc4SDimitry Andric if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
9411d5ae102SDimitry Andric return false;
9421d5ae102SDimitry Andric if (Idx != -1 &&
943706b4fc4SDimitry Andric CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
9441d5ae102SDimitry Andric return false;
9451d5ae102SDimitry Andric }
9461d5ae102SDimitry Andric
9471d5ae102SDimitry Andric // Check DMask for overlaps.
948706b4fc4SDimitry Andric unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
949706b4fc4SDimitry Andric unsigned MinMask = std::min(CI.DMask, Paired.DMask);
9501d5ae102SDimitry Andric
951b1c73532SDimitry Andric if (!MaxMask)
952b1c73532SDimitry Andric return false;
953b1c73532SDimitry Andric
9547fa27ce4SDimitry Andric unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
9551d5ae102SDimitry Andric if ((1u << AllowedBitsForMin) <= MinMask)
9561d5ae102SDimitry Andric return false;
9571d5ae102SDimitry Andric
9581d5ae102SDimitry Andric return true;
9591d5ae102SDimitry Andric }
9601d5ae102SDimitry Andric
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)961706b4fc4SDimitry Andric static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
962706b4fc4SDimitry Andric unsigned ComponentCount,
963cfca06d7SDimitry Andric const GCNSubtarget &STI) {
964706b4fc4SDimitry Andric if (ComponentCount > 4)
965706b4fc4SDimitry Andric return 0;
966706b4fc4SDimitry Andric
967706b4fc4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
968706b4fc4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
969706b4fc4SDimitry Andric if (!OldFormatInfo)
970706b4fc4SDimitry Andric return 0;
971706b4fc4SDimitry Andric
972706b4fc4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
973706b4fc4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
974706b4fc4SDimitry Andric ComponentCount,
975706b4fc4SDimitry Andric OldFormatInfo->NumFormat, STI);
976706b4fc4SDimitry Andric
977706b4fc4SDimitry Andric if (!NewFormatInfo)
978706b4fc4SDimitry Andric return 0;
979706b4fc4SDimitry Andric
980706b4fc4SDimitry Andric assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981706b4fc4SDimitry Andric NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
982706b4fc4SDimitry Andric
983706b4fc4SDimitry Andric return NewFormatInfo->Format;
984706b4fc4SDimitry Andric }
985706b4fc4SDimitry Andric
986344a3780SDimitry Andric // Return the value in the inclusive range [Lo,Hi] that is aligned to the
987344a3780SDimitry Andric // highest power of two. Note that the result is well defined for all inputs
988344a3780SDimitry Andric // including corner cases like:
989344a3780SDimitry Andric // - if Lo == Hi, return that value
990344a3780SDimitry Andric // - if Lo == 0, return 0 (even though the "- 1" below underflows
991344a3780SDimitry Andric // - if Lo > Hi, return 0 (as if the range wrapped around)
mostAlignedValueInRange(uint32_t Lo,uint32_t Hi)992344a3780SDimitry Andric static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
9937fa27ce4SDimitry Andric return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
994344a3780SDimitry Andric }
995344a3780SDimitry Andric
offsetsCanBeCombined(CombineInfo & CI,const GCNSubtarget & STI,CombineInfo & Paired,bool Modify)996706b4fc4SDimitry Andric bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
997cfca06d7SDimitry Andric const GCNSubtarget &STI,
998cfca06d7SDimitry Andric CombineInfo &Paired,
999cfca06d7SDimitry Andric bool Modify) {
10001d5ae102SDimitry Andric assert(CI.InstClass != MIMG);
10011d5ae102SDimitry Andric
100267c32a98SDimitry Andric // XXX - Would the same offset be OK? Is there any reason this would happen or
100367c32a98SDimitry Andric // be useful?
1004706b4fc4SDimitry Andric if (CI.Offset == Paired.Offset)
100567c32a98SDimitry Andric return false;
100667c32a98SDimitry Andric
100767c32a98SDimitry Andric // This won't be valid if the offset isn't aligned.
1008706b4fc4SDimitry Andric if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
100967c32a98SDimitry Andric return false;
101067c32a98SDimitry Andric
1011706b4fc4SDimitry Andric if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1012706b4fc4SDimitry Andric
1013706b4fc4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1014706b4fc4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1015706b4fc4SDimitry Andric if (!Info0)
1016706b4fc4SDimitry Andric return false;
1017706b4fc4SDimitry Andric const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1018706b4fc4SDimitry Andric llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1019706b4fc4SDimitry Andric if (!Info1)
1020706b4fc4SDimitry Andric return false;
1021706b4fc4SDimitry Andric
1022706b4fc4SDimitry Andric if (Info0->BitsPerComp != Info1->BitsPerComp ||
1023706b4fc4SDimitry Andric Info0->NumFormat != Info1->NumFormat)
1024706b4fc4SDimitry Andric return false;
1025706b4fc4SDimitry Andric
1026706b4fc4SDimitry Andric // TODO: Should be possible to support more formats, but if format loads
1027706b4fc4SDimitry Andric // are not dword-aligned, the merged load might not be valid.
1028706b4fc4SDimitry Andric if (Info0->BitsPerComp != 32)
1029706b4fc4SDimitry Andric return false;
1030706b4fc4SDimitry Andric
1031706b4fc4SDimitry Andric if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1032706b4fc4SDimitry Andric return false;
1033706b4fc4SDimitry Andric }
1034706b4fc4SDimitry Andric
1035344a3780SDimitry Andric uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1036344a3780SDimitry Andric uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
103771d5a254SDimitry Andric CI.UseST64 = false;
103871d5a254SDimitry Andric CI.BaseOff = 0;
103967c32a98SDimitry Andric
1040344a3780SDimitry Andric // Handle all non-DS instructions.
1041d8e91e46SDimitry Andric if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
10427fa27ce4SDimitry Andric if (EltOffset0 + CI.Width != EltOffset1 &&
10437fa27ce4SDimitry Andric EltOffset1 + Paired.Width != EltOffset0)
10447fa27ce4SDimitry Andric return false;
10457fa27ce4SDimitry Andric if (CI.CPol != Paired.CPol)
10467fa27ce4SDimitry Andric return false;
1047312c0ed1SDimitry Andric if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1048312c0ed1SDimitry Andric CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1049312c0ed1SDimitry Andric // Reject cases like:
1050312c0ed1SDimitry Andric // dword + dwordx2 -> dwordx3
1051312c0ed1SDimitry Andric // dword + dwordx3 -> dwordx4
1052312c0ed1SDimitry Andric // If we tried to combine these cases, we would fail to extract a subreg
1053312c0ed1SDimitry Andric // for the result of the second load due to SGPR alignment requirements.
1054312c0ed1SDimitry Andric if (CI.Width != Paired.Width &&
1055312c0ed1SDimitry Andric (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1056312c0ed1SDimitry Andric return false;
1057312c0ed1SDimitry Andric }
10587fa27ce4SDimitry Andric return true;
1059044eb2f6SDimitry Andric }
1060044eb2f6SDimitry Andric
106167c32a98SDimitry Andric // If the offset in elements doesn't fit in 8-bits, we might be able to use
106267c32a98SDimitry Andric // the stride 64 versions.
106371d5a254SDimitry Andric if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
106471d5a254SDimitry Andric isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1065cfca06d7SDimitry Andric if (Modify) {
1066706b4fc4SDimitry Andric CI.Offset = EltOffset0 / 64;
1067706b4fc4SDimitry Andric Paired.Offset = EltOffset1 / 64;
106871d5a254SDimitry Andric CI.UseST64 = true;
1069cfca06d7SDimitry Andric }
107071d5a254SDimitry Andric return true;
107167c32a98SDimitry Andric }
107267c32a98SDimitry Andric
107371d5a254SDimitry Andric // Check if the new offsets fit in the reduced 8-bit range.
107471d5a254SDimitry Andric if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1075cfca06d7SDimitry Andric if (Modify) {
1076706b4fc4SDimitry Andric CI.Offset = EltOffset0;
1077706b4fc4SDimitry Andric Paired.Offset = EltOffset1;
1078cfca06d7SDimitry Andric }
107971d5a254SDimitry Andric return true;
108071d5a254SDimitry Andric }
108171d5a254SDimitry Andric
108271d5a254SDimitry Andric // Try to shift base address to decrease offsets.
1083344a3780SDimitry Andric uint32_t Min = std::min(EltOffset0, EltOffset1);
1084344a3780SDimitry Andric uint32_t Max = std::max(EltOffset0, EltOffset1);
108571d5a254SDimitry Andric
1086344a3780SDimitry Andric const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1087344a3780SDimitry Andric if (((Max - Min) & ~Mask) == 0) {
1088cfca06d7SDimitry Andric if (Modify) {
1089344a3780SDimitry Andric // From the range of values we could use for BaseOff, choose the one that
1090344a3780SDimitry Andric // is aligned to the highest power of two, to maximise the chance that
1091344a3780SDimitry Andric // the same offset can be reused for other load/store pairs.
1092344a3780SDimitry Andric uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1093344a3780SDimitry Andric // Copy the low bits of the offsets, so that when we adjust them by
1094344a3780SDimitry Andric // subtracting BaseOff they will be multiples of 64.
1095344a3780SDimitry Andric BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1096344a3780SDimitry Andric CI.BaseOff = BaseOff * CI.EltSize;
1097344a3780SDimitry Andric CI.Offset = (EltOffset0 - BaseOff) / 64;
1098344a3780SDimitry Andric Paired.Offset = (EltOffset1 - BaseOff) / 64;
109971d5a254SDimitry Andric CI.UseST64 = true;
1100cfca06d7SDimitry Andric }
110171d5a254SDimitry Andric return true;
110271d5a254SDimitry Andric }
110371d5a254SDimitry Andric
1104344a3780SDimitry Andric if (isUInt<8>(Max - Min)) {
1105cfca06d7SDimitry Andric if (Modify) {
1106344a3780SDimitry Andric // From the range of values we could use for BaseOff, choose the one that
1107344a3780SDimitry Andric // is aligned to the highest power of two, to maximise the chance that
1108344a3780SDimitry Andric // the same offset can be reused for other load/store pairs.
1109344a3780SDimitry Andric uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1110344a3780SDimitry Andric CI.BaseOff = BaseOff * CI.EltSize;
1111344a3780SDimitry Andric CI.Offset = EltOffset0 - BaseOff;
1112344a3780SDimitry Andric Paired.Offset = EltOffset1 - BaseOff;
1113cfca06d7SDimitry Andric }
111471d5a254SDimitry Andric return true;
111571d5a254SDimitry Andric }
111671d5a254SDimitry Andric
111771d5a254SDimitry Andric return false;
111871d5a254SDimitry Andric }
111971d5a254SDimitry Andric
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)1120d8e91e46SDimitry Andric bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121706b4fc4SDimitry Andric const CombineInfo &CI,
1122706b4fc4SDimitry Andric const CombineInfo &Paired) {
1123706b4fc4SDimitry Andric const unsigned Width = (CI.Width + Paired.Width);
1124d8e91e46SDimitry Andric switch (CI.InstClass) {
1125d8e91e46SDimitry Andric default:
1126d8e91e46SDimitry Andric return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1127d8e91e46SDimitry Andric case S_BUFFER_LOAD_IMM:
1128e3b55780SDimitry Andric case S_BUFFER_LOAD_SGPR_IMM:
1129e3b55780SDimitry Andric case S_LOAD_IMM:
1130d8e91e46SDimitry Andric switch (Width) {
1131d8e91e46SDimitry Andric default:
1132d8e91e46SDimitry Andric return false;
1133d8e91e46SDimitry Andric case 2:
1134d8e91e46SDimitry Andric case 4:
1135c0981da4SDimitry Andric case 8:
1136d8e91e46SDimitry Andric return true;
1137312c0ed1SDimitry Andric case 3:
1138312c0ed1SDimitry Andric return STM.hasScalarDwordx3Loads();
1139d8e91e46SDimitry Andric }
1140d8e91e46SDimitry Andric }
1141d8e91e46SDimitry Andric }
1142d8e91e46SDimitry Andric
1143344a3780SDimitry Andric const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1144344a3780SDimitry Andric SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145344a3780SDimitry Andric if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146344a3780SDimitry Andric return TRI->getRegClassForReg(*MRI, Dst->getReg());
1147344a3780SDimitry Andric }
1148344a3780SDimitry Andric if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149344a3780SDimitry Andric return TRI->getRegClassForReg(*MRI, Src->getReg());
1150344a3780SDimitry Andric }
1151344a3780SDimitry Andric if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152344a3780SDimitry Andric return TRI->getRegClassForReg(*MRI, Src->getReg());
1153344a3780SDimitry Andric }
1154344a3780SDimitry Andric if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155344a3780SDimitry Andric return TRI->getRegClassForReg(*MRI, Dst->getReg());
1156344a3780SDimitry Andric }
1157344a3780SDimitry Andric if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158344a3780SDimitry Andric return TRI->getRegClassForReg(*MRI, Src->getReg());
1159344a3780SDimitry Andric }
1160344a3780SDimitry Andric return nullptr;
1161344a3780SDimitry Andric }
1162344a3780SDimitry Andric
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
///
/// On success the returned CombineInfo identifies which of the two
/// instructions' positions the merged instruction should be inserted at
/// (loads are merged at CI, stores at Paired), and for DS instructions the
/// offsets in CI/Paired have been rewritten for the merged form.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // Same class but different subclass (e.g. different addressing variants)
  // cannot be merged.
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    // Walk backwards from Paired to CI, checking that Paired can be swapped
    // past every intervening instruction without changing semantics.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    // Walk forwards from CI to Paired with the symmetric legality check.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
121767c32a98SDimitry Andric
1218ac9a064cSDimitry Andric // Copy the merged load result from DestReg to the original dest regs of CI and
1219ac9a064cSDimitry Andric // Paired.
copyToDestRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName,Register DestReg) const1220ac9a064cSDimitry Andric void SILoadStoreOptimizer::copyToDestRegs(
1221ac9a064cSDimitry Andric CombineInfo &CI, CombineInfo &Paired,
1222ac9a064cSDimitry Andric MachineBasicBlock::iterator InsertBefore, int OpName,
1223ac9a064cSDimitry Andric Register DestReg) const {
1224ac9a064cSDimitry Andric MachineBasicBlock *MBB = CI.I->getParent();
1225ac9a064cSDimitry Andric DebugLoc DL = CI.I->getDebugLoc();
1226ac9a064cSDimitry Andric
1227ac9a064cSDimitry Andric auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1228ac9a064cSDimitry Andric
1229ac9a064cSDimitry Andric // Copy to the old destination registers.
1230ac9a064cSDimitry Andric const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1231ac9a064cSDimitry Andric auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232ac9a064cSDimitry Andric auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1233ac9a064cSDimitry Andric
1234ac9a064cSDimitry Andric // The constrained sload instructions in S_LOAD_IMM class will have
1235ac9a064cSDimitry Andric // `early-clobber` flag in the dst operand. Remove the flag before using the
1236ac9a064cSDimitry Andric // MOs in copies.
1237ac9a064cSDimitry Andric Dest0->setIsEarlyClobber(false);
1238ac9a064cSDimitry Andric Dest1->setIsEarlyClobber(false);
1239ac9a064cSDimitry Andric
1240ac9a064cSDimitry Andric BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1241ac9a064cSDimitry Andric .add(*Dest0) // Copy to same destination including flags and sub reg.
1242ac9a064cSDimitry Andric .addReg(DestReg, 0, SubRegIdx0);
1243ac9a064cSDimitry Andric BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1244ac9a064cSDimitry Andric .add(*Dest1)
1245ac9a064cSDimitry Andric .addReg(DestReg, RegState::Kill, SubRegIdx1);
1246ac9a064cSDimitry Andric }
1247ac9a064cSDimitry Andric
1248ac9a064cSDimitry Andric // Return a register for the source of the merged store after copying the
1249ac9a064cSDimitry Andric // original source regs of CI and Paired into it.
1250ac9a064cSDimitry Andric Register
copyFromSrcRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName) const1251ac9a064cSDimitry Andric SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1252ac9a064cSDimitry Andric MachineBasicBlock::iterator InsertBefore,
1253ac9a064cSDimitry Andric int OpName) const {
1254ac9a064cSDimitry Andric MachineBasicBlock *MBB = CI.I->getParent();
1255ac9a064cSDimitry Andric DebugLoc DL = CI.I->getDebugLoc();
1256ac9a064cSDimitry Andric
1257ac9a064cSDimitry Andric auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1258ac9a064cSDimitry Andric
1259ac9a064cSDimitry Andric // Copy to the new source register.
1260ac9a064cSDimitry Andric const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1261ac9a064cSDimitry Andric Register SrcReg = MRI->createVirtualRegister(SuperRC);
1262ac9a064cSDimitry Andric
1263ac9a064cSDimitry Andric const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264ac9a064cSDimitry Andric const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1265ac9a064cSDimitry Andric
1266ac9a064cSDimitry Andric BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1267ac9a064cSDimitry Andric .add(*Src0)
1268ac9a064cSDimitry Andric .addImm(SubRegIdx0)
1269ac9a064cSDimitry Andric .add(*Src1)
1270ac9a064cSDimitry Andric .addImm(SubRegIdx1);
1271ac9a064cSDimitry Andric
1272ac9a064cSDimitry Andric return SrcReg;
1273ac9a064cSDimitry Andric }
1274ac9a064cSDimitry Andric
read2Opcode(unsigned EltSize) const1275044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1276044eb2f6SDimitry Andric if (STM->ldsRequiresM0Init())
1277044eb2f6SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1278044eb2f6SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1279044eb2f6SDimitry Andric }
1280044eb2f6SDimitry Andric
read2ST64Opcode(unsigned EltSize) const1281044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1282044eb2f6SDimitry Andric if (STM->ldsRequiresM0Init())
1283044eb2f6SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1284044eb2f6SDimitry Andric
1285d8e91e46SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1286d8e91e46SDimitry Andric : AMDGPU::DS_READ2ST64_B64_gfx9;
1287044eb2f6SDimitry Andric }
1288044eb2f6SDimitry Andric
// Merge the two DS reads CI and Paired into a single ds_read2 (or
// ds_read2st64 when CI.UseST64 is set), inserted at InsertBefore, then copy
// the halves of the merged result back to the original destination registers
// and erase the originals. Returns an iterator to the new instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  // read2 encodes the smaller offset first; offsets were already rewritten
  // by offsetsCanBeCombined (divided by EltSize, and by 64 for ST64 forms).
  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // A base offset was factored out of both offsets; materialize it and add
    // it to the address so the reduced offsets fit the encoding.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    // The add wrote a full register; no subregister index applies anymore.
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
134767c32a98SDimitry Andric
write2Opcode(unsigned EltSize) const1348044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1349044eb2f6SDimitry Andric if (STM->ldsRequiresM0Init())
1350044eb2f6SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1351d8e91e46SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1352d8e91e46SDimitry Andric : AMDGPU::DS_WRITE2_B64_gfx9;
1353044eb2f6SDimitry Andric }
1354044eb2f6SDimitry Andric
write2ST64Opcode(unsigned EltSize) const1355044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1356044eb2f6SDimitry Andric if (STM->ldsRequiresM0Init())
1357d8e91e46SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1358d8e91e46SDimitry Andric : AMDGPU::DS_WRITE2ST64_B64;
1359044eb2f6SDimitry Andric
1360d8e91e46SDimitry Andric return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1361d8e91e46SDimitry Andric : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1362044eb2f6SDimitry Andric }
1363044eb2f6SDimitry Andric
// Merge the two DS writes CI and Paired into a single ds_write2 (or
// ds_write2st64 when CI.UseST64 is set), inserted at InsertBefore, and erase
// the originals. Returns an iterator to the new instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    // The data operands must be swapped along with the offsets.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // A base offset was factored out of both offsets; materialize it and add
    // it to the address so the reduced offsets fit the encoding.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    // The add wrote a full register; no subregister index applies anymore.
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
142967c32a98SDimitry Andric
// Merge the two MIMG instructions CI and Paired into one instruction with
// the union of their dmasks, inserted at InsertBefore. The halves of the
// merged result are copied back to the original destination registers and
// the originals are erased. Returns an iterator to the new instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Clone all of CI's operands (skipping operand 0, the old vdata dest),
  // substituting the merged dmask at the dmask operand position.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14651d5ae102SDimitry Andric
// Merge the two scalar memory loads CI and Paired (S_LOAD_IMM /
// S_BUFFER_LOAD_IMM / S_BUFFER_LOAD_SGPR_IMM) into a single wider load at
// InsertBefore, copy the halves back to the original destination registers,
// and erase the originals. Returns an iterator to the new instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  // The merged load reads from the lower of the two offsets.
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  // Only the SGPR+IMM form carries an soffset operand.
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14971d5ae102SDimitry Andric
// Merge the two buffer loads CI and Paired into a single wider buffer load
// at InsertBefore, copy the halves back to the original destination
// registers, and erase the originals. Returns an iterator to the new
// instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  // The merged load reads from the lower of the two offsets.
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Only some buffer opcodes have a vaddr operand.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1538044eb2f6SDimitry Andric
// Merge the two typed-buffer loads CI and Paired into a single wider tbuffer
// load at InsertBefore, using a format with the combined component count.
// The halves of the result are copied back to the original destination
// registers and the originals are erased. Returns an iterator to the new
// instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  // The merged load reads from the lower of the two offsets.
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Only some buffer opcodes have a vaddr operand.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // Same base format, but with the component count of the merged access.
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1583706b4fc4SDimitry Andric
// Merge the two adjacent tbuffer stores described by CI and Paired into a
// single wider tbuffer store emitted at InsertBefore. Both original
// instructions are erased; returns the merged instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  // Gather both stored values into one wide source register.
  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // Widen the buffer format to cover the combined component count.
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1624145449b1SDimitry Andric
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1625145449b1SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1626145449b1SDimitry Andric CombineInfo &CI, CombineInfo &Paired,
1627145449b1SDimitry Andric MachineBasicBlock::iterator InsertBefore) {
1628145449b1SDimitry Andric MachineBasicBlock *MBB = CI.I->getParent();
1629145449b1SDimitry Andric DebugLoc DL = CI.I->getDebugLoc();
1630145449b1SDimitry Andric
1631145449b1SDimitry Andric const unsigned Opcode = getNewOpcode(CI, Paired);
1632145449b1SDimitry Andric
1633145449b1SDimitry Andric const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1634145449b1SDimitry Andric Register DestReg = MRI->createVirtualRegister(SuperRC);
1635145449b1SDimitry Andric
1636145449b1SDimitry Andric auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1637145449b1SDimitry Andric
1638145449b1SDimitry Andric if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1639145449b1SDimitry Andric MIB.add(*SAddr);
1640145449b1SDimitry Andric
1641145449b1SDimitry Andric MachineInstr *New =
1642145449b1SDimitry Andric MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1643145449b1SDimitry Andric .addImm(std::min(CI.Offset, Paired.Offset))
1644145449b1SDimitry Andric .addImm(CI.CPol)
1645145449b1SDimitry Andric .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1646145449b1SDimitry Andric
1647ac9a064cSDimitry Andric copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1648145449b1SDimitry Andric
1649145449b1SDimitry Andric CI.I->eraseFromParent();
1650145449b1SDimitry Andric Paired.I->eraseFromParent();
1651145449b1SDimitry Andric return New;
1652145449b1SDimitry Andric }
1653145449b1SDimitry Andric
mergeFlatStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1654145449b1SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1655145449b1SDimitry Andric CombineInfo &CI, CombineInfo &Paired,
1656145449b1SDimitry Andric MachineBasicBlock::iterator InsertBefore) {
1657145449b1SDimitry Andric MachineBasicBlock *MBB = CI.I->getParent();
1658145449b1SDimitry Andric DebugLoc DL = CI.I->getDebugLoc();
1659145449b1SDimitry Andric
1660145449b1SDimitry Andric const unsigned Opcode = getNewOpcode(CI, Paired);
1661145449b1SDimitry Andric
1662ac9a064cSDimitry Andric Register SrcReg =
1663ac9a064cSDimitry Andric copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1664145449b1SDimitry Andric
1665145449b1SDimitry Andric auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1666145449b1SDimitry Andric .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1667145449b1SDimitry Andric .addReg(SrcReg, RegState::Kill);
1668145449b1SDimitry Andric
1669145449b1SDimitry Andric if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1670145449b1SDimitry Andric MIB.add(*SAddr);
1671145449b1SDimitry Andric
1672145449b1SDimitry Andric MachineInstr *New =
1673145449b1SDimitry Andric MIB.addImm(std::min(CI.Offset, Paired.Offset))
1674145449b1SDimitry Andric .addImm(CI.CPol)
1675145449b1SDimitry Andric .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1676706b4fc4SDimitry Andric
1677706b4fc4SDimitry Andric CI.I->eraseFromParent();
1678706b4fc4SDimitry Andric Paired.I->eraseFromParent();
1679706b4fc4SDimitry Andric return New;
1680706b4fc4SDimitry Andric }
1681706b4fc4SDimitry Andric
// Select the opcode for the merged instruction formed from CI and Paired.
// Returns 0 when no wider encoding exists for the combined width, in which
// case the caller abandons the combine.
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  // Combined width; for MIMG this must equal the number of set dmask bits.
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    // Plain MUBUF loads/stores are handled via the generated opcode tables.
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    const MachineMemOperand *MMO = *CI.I->memoperands_begin();
    bool NeedsConstrainedOpc =
        STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}
1820044eb2f6SDimitry Andric
1821d8e91e46SDimitry Andric std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1822c0981da4SDimitry Andric SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823c0981da4SDimitry Andric const CombineInfo &Paired) {
1824e3b55780SDimitry Andric assert((CI.InstClass != MIMG ||
1825e3b55780SDimitry Andric ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1826145449b1SDimitry Andric CI.Width + Paired.Width)) &&
18271d5ae102SDimitry Andric "No overlaps");
18281d5ae102SDimitry Andric
1829c0981da4SDimitry Andric unsigned Idx0;
1830c0981da4SDimitry Andric unsigned Idx1;
1831c0981da4SDimitry Andric
18326f8fc217SDimitry Andric static const unsigned Idxs[5][4] = {
18331d5ae102SDimitry Andric {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
18346f8fc217SDimitry Andric {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
18356f8fc217SDimitry Andric {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
18366f8fc217SDimitry Andric {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
18376f8fc217SDimitry Andric {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
18381d5ae102SDimitry Andric };
18391d5ae102SDimitry Andric
18406f8fc217SDimitry Andric assert(CI.Width >= 1 && CI.Width <= 4);
18416f8fc217SDimitry Andric assert(Paired.Width >= 1 && Paired.Width <= 4);
18421d5ae102SDimitry Andric
1843145449b1SDimitry Andric if (Paired < CI) {
1844706b4fc4SDimitry Andric Idx1 = Idxs[0][Paired.Width - 1];
1845706b4fc4SDimitry Andric Idx0 = Idxs[Paired.Width][CI.Width - 1];
1846d8e91e46SDimitry Andric } else {
1847706b4fc4SDimitry Andric Idx0 = Idxs[0][CI.Width - 1];
1848706b4fc4SDimitry Andric Idx1 = Idxs[CI.Width][Paired.Width - 1];
1849d8e91e46SDimitry Andric }
18501d5ae102SDimitry Andric
1851ac9a064cSDimitry Andric return {Idx0, Idx1};
1852d8e91e46SDimitry Andric }
1853d8e91e46SDimitry Andric
1854d8e91e46SDimitry Andric const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired) const1855706b4fc4SDimitry Andric SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856ac9a064cSDimitry Andric const CombineInfo &Paired) const {
1857e3b55780SDimitry Andric if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858e3b55780SDimitry Andric CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859706b4fc4SDimitry Andric switch (CI.Width + Paired.Width) {
1860d8e91e46SDimitry Andric default:
1861d8e91e46SDimitry Andric return nullptr;
1862d8e91e46SDimitry Andric case 2:
1863d8e91e46SDimitry Andric return &AMDGPU::SReg_64_XEXECRegClass;
1864312c0ed1SDimitry Andric case 3:
1865312c0ed1SDimitry Andric return &AMDGPU::SGPR_96RegClass;
1866d8e91e46SDimitry Andric case 4:
18671d5ae102SDimitry Andric return &AMDGPU::SGPR_128RegClass;
1868d8e91e46SDimitry Andric case 8:
1869cfca06d7SDimitry Andric return &AMDGPU::SGPR_256RegClass;
1870d8e91e46SDimitry Andric case 16:
1871cfca06d7SDimitry Andric return &AMDGPU::SGPR_512RegClass;
1872d8e91e46SDimitry Andric }
1873d8e91e46SDimitry Andric }
1874344a3780SDimitry Andric
1875344a3780SDimitry Andric unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1876f65dcba8SDimitry Andric return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877344a3780SDimitry Andric ? TRI->getAGPRClassForBitWidth(BitWidth)
1878344a3780SDimitry Andric : TRI->getVGPRClassForBitWidth(BitWidth);
1879d8e91e46SDimitry Andric }
1880d8e91e46SDimitry Andric
// Merge the two adjacent MUBUF stores described by CI and Paired into a
// single wider buffer store emitted at InsertBefore. Both original
// instructions are erased; returns the merged instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  // Gather both stored values into one wide source register.
  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1918044eb2f6SDimitry Andric
1919d8e91e46SDimitry Andric MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const19201d5ae102SDimitry Andric SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1921d8e91e46SDimitry Andric APInt V(32, Val, true);
1922d8e91e46SDimitry Andric if (TII->isInlineConstant(V))
1923d8e91e46SDimitry Andric return MachineOperand::CreateImm(Val);
1924d8e91e46SDimitry Andric
19251d5ae102SDimitry Andric Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1926d8e91e46SDimitry Andric MachineInstr *Mov =
1927d8e91e46SDimitry Andric BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1928d8e91e46SDimitry Andric TII->get(AMDGPU::S_MOV_B32), Reg)
1929d8e91e46SDimitry Andric .addImm(Val);
1930d8e91e46SDimitry Andric (void)Mov;
1931d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " "; Mov->dump());
1932d8e91e46SDimitry Andric return MachineOperand::CreateReg(Reg, false);
1933d8e91e46SDimitry Andric }
1934d8e91e46SDimitry Andric
// Compute base address using Addr and return the final register.
// Materializes Addr.Base + Addr.Offset as a 64-bit VGPR value via a 32-bit
// add / add-with-carry pair followed by a REG_SEQUENCE, inserted before MI.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  // Split the 64-bit constant offset into two 32-bit halves, each either an
  // inline immediate or a materialized SGPR.
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  // Low half: the add defines CarryReg, consumed by the high-half ADDC.
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  // High half: add-with-carry; its own carry-out is never used.
  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  // Recombine the two 32-bit halves into the final 64-bit base register.
  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
1992d8e91e46SDimitry Andric
1993d8e91e46SDimitry Andric // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const1994d8e91e46SDimitry Andric void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1995cfca06d7SDimitry Andric Register NewBase,
19961d5ae102SDimitry Andric int32_t NewOffset) const {
1997706b4fc4SDimitry Andric auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998706b4fc4SDimitry Andric Base->setReg(NewBase);
1999706b4fc4SDimitry Andric Base->setIsKill(false);
2000d8e91e46SDimitry Andric TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2001d8e91e46SDimitry Andric }
2002d8e91e46SDimitry Andric
2003e3b55780SDimitry Andric std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const20041d5ae102SDimitry Andric SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2005d8e91e46SDimitry Andric if (Op.isImm())
2006d8e91e46SDimitry Andric return Op.getImm();
2007d8e91e46SDimitry Andric
2008d8e91e46SDimitry Andric if (!Op.isReg())
2009e3b55780SDimitry Andric return std::nullopt;
2010d8e91e46SDimitry Andric
2011d8e91e46SDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2012d8e91e46SDimitry Andric if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2013d8e91e46SDimitry Andric !Def->getOperand(1).isImm())
2014e3b55780SDimitry Andric return std::nullopt;
2015d8e91e46SDimitry Andric
2016d8e91e46SDimitry Andric return Def->getOperand(1).getImm();
2017d8e91e46SDimitry Andric }
2018d8e91e46SDimitry Andric
// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
// On a failed match, Addr is left untouched (Addr.Offset stays 0, which the
// caller treats as "no constant offset found").
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  // Match the REG_SEQUENCE combining the two 32-bit halves (2 value/subreg
  // operand pairs plus the def => exactly 5 operands).
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  // One source of the low add must be the constant (either operand order);
  // the other becomes the low half of the base.
  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  // Canonicalize so that Src1 holds the immediate of the high add.
  if (Src0->isImm())
    std::swap(Src0, Src1);

  // Require exactly one immediate: Src1 an immediate and Src0 a register.
  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  // Combine the two 32-bit immediates into the full 64-bit constant offset.
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
2081d8e91e46SDimitry Andric
promoteConstantOffsetToImm(MachineInstr & MI,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList) const2082d8e91e46SDimitry Andric bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2083d8e91e46SDimitry Andric MachineInstr &MI,
2084d8e91e46SDimitry Andric MemInfoMap &Visited,
20851d5ae102SDimitry Andric SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2086d8e91e46SDimitry Andric
2087ac9a064cSDimitry Andric if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2088d8e91e46SDimitry Andric return false;
2089d8e91e46SDimitry Andric
2090ac9a064cSDimitry Andric // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2091ac9a064cSDimitry Andric if (SIInstrInfo::isFLATScratch(MI))
20921d5ae102SDimitry Andric return false;
20931d5ae102SDimitry Andric
2094ac9a064cSDimitry Andric unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2095ac9a064cSDimitry Andric : AMDGPUAS::FLAT_ADDRESS;
2096d8e91e46SDimitry Andric
2097d8e91e46SDimitry Andric if (AnchorList.count(&MI))
2098d8e91e46SDimitry Andric return false;
2099d8e91e46SDimitry Andric
2100d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2101d8e91e46SDimitry Andric
2102d8e91e46SDimitry Andric if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2103d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2104d8e91e46SDimitry Andric return false;
2105d8e91e46SDimitry Andric }
2106d8e91e46SDimitry Andric
2107d8e91e46SDimitry Andric // Step1: Find the base-registers and a 64bit constant offset.
2108d8e91e46SDimitry Andric MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2109d8e91e46SDimitry Andric MemAddress MAddr;
21107fa27ce4SDimitry Andric if (!Visited.contains(&MI)) {
2111d8e91e46SDimitry Andric processBaseWithConstOffset(Base, MAddr);
2112d8e91e46SDimitry Andric Visited[&MI] = MAddr;
2113d8e91e46SDimitry Andric } else
2114d8e91e46SDimitry Andric MAddr = Visited[&MI];
2115d8e91e46SDimitry Andric
2116d8e91e46SDimitry Andric if (MAddr.Offset == 0) {
2117d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2118d8e91e46SDimitry Andric " constant offsets that can be promoted.\n";);
2119d8e91e46SDimitry Andric return false;
2120d8e91e46SDimitry Andric }
2121d8e91e46SDimitry Andric
2122d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2123d8e91e46SDimitry Andric << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2124d8e91e46SDimitry Andric
2125d8e91e46SDimitry Andric // Step2: Traverse through MI's basic block and find an anchor(that has the
2126d8e91e46SDimitry Andric // same base-registers) with the highest 13bit distance from MI's offset.
2127d8e91e46SDimitry Andric // E.g. (64bit loads)
2128d8e91e46SDimitry Andric // bb:
2129d8e91e46SDimitry Andric // addr1 = &a + 4096; load1 = load(addr1, 0)
2130d8e91e46SDimitry Andric // addr2 = &a + 6144; load2 = load(addr2, 0)
2131d8e91e46SDimitry Andric // addr3 = &a + 8192; load3 = load(addr3, 0)
2132d8e91e46SDimitry Andric // addr4 = &a + 10240; load4 = load(addr4, 0)
2133d8e91e46SDimitry Andric // addr5 = &a + 12288; load5 = load(addr5, 0)
2134d8e91e46SDimitry Andric //
2135d8e91e46SDimitry Andric // Starting from the first load, the optimization will try to find a new base
2136d8e91e46SDimitry Andric // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2137d8e91e46SDimitry Andric // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2138d8e91e46SDimitry Andric // as the new-base(anchor) because of the maximum distance which can
2139145449b1SDimitry Andric // accommodate more intermediate bases presumably.
2140d8e91e46SDimitry Andric //
2141d8e91e46SDimitry Andric // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2142d8e91e46SDimitry Andric // (&a + 8192) for load1, load2, load4.
2143d8e91e46SDimitry Andric // addr = &a + 8192
2144d8e91e46SDimitry Andric // load1 = load(addr, -4096)
2145d8e91e46SDimitry Andric // load2 = load(addr, -2048)
2146d8e91e46SDimitry Andric // load3 = load(addr, 0)
2147d8e91e46SDimitry Andric // load4 = load(addr, 2048)
2148d8e91e46SDimitry Andric // addr5 = &a + 12288; load5 = load(addr5, 0)
2149d8e91e46SDimitry Andric //
2150d8e91e46SDimitry Andric MachineInstr *AnchorInst = nullptr;
2151d8e91e46SDimitry Andric MemAddress AnchorAddr;
2152d8e91e46SDimitry Andric uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2153d8e91e46SDimitry Andric SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2154d8e91e46SDimitry Andric
2155d8e91e46SDimitry Andric MachineBasicBlock *MBB = MI.getParent();
2156d8e91e46SDimitry Andric MachineBasicBlock::iterator E = MBB->end();
2157d8e91e46SDimitry Andric MachineBasicBlock::iterator MBBI = MI.getIterator();
2158d8e91e46SDimitry Andric ++MBBI;
2159d8e91e46SDimitry Andric const SITargetLowering *TLI =
2160d8e91e46SDimitry Andric static_cast<const SITargetLowering *>(STM->getTargetLowering());
2161d8e91e46SDimitry Andric
2162d8e91e46SDimitry Andric for ( ; MBBI != E; ++MBBI) {
2163d8e91e46SDimitry Andric MachineInstr &MINext = *MBBI;
2164d8e91e46SDimitry Andric // TODO: Support finding an anchor(with same base) from store addresses or
2165d8e91e46SDimitry Andric // any other load addresses where the opcodes are different.
2166d8e91e46SDimitry Andric if (MINext.getOpcode() != MI.getOpcode() ||
2167d8e91e46SDimitry Andric TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2168d8e91e46SDimitry Andric continue;
2169d8e91e46SDimitry Andric
2170d8e91e46SDimitry Andric const MachineOperand &BaseNext =
2171d8e91e46SDimitry Andric *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2172d8e91e46SDimitry Andric MemAddress MAddrNext;
21737fa27ce4SDimitry Andric if (!Visited.contains(&MINext)) {
2174d8e91e46SDimitry Andric processBaseWithConstOffset(BaseNext, MAddrNext);
2175d8e91e46SDimitry Andric Visited[&MINext] = MAddrNext;
2176d8e91e46SDimitry Andric } else
2177d8e91e46SDimitry Andric MAddrNext = Visited[&MINext];
2178d8e91e46SDimitry Andric
2179d8e91e46SDimitry Andric if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2180d8e91e46SDimitry Andric MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2181d8e91e46SDimitry Andric MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2182d8e91e46SDimitry Andric MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2183d8e91e46SDimitry Andric continue;
2184d8e91e46SDimitry Andric
2185ac9a064cSDimitry Andric InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2186d8e91e46SDimitry Andric
2187d8e91e46SDimitry Andric int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2188d8e91e46SDimitry Andric TargetLoweringBase::AddrMode AM;
2189d8e91e46SDimitry Andric AM.HasBaseReg = true;
2190d8e91e46SDimitry Andric AM.BaseOffs = Dist;
2191ac9a064cSDimitry Andric if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2192d8e91e46SDimitry Andric (uint32_t)std::abs(Dist) > MaxDist) {
2193d8e91e46SDimitry Andric MaxDist = std::abs(Dist);
2194d8e91e46SDimitry Andric
2195d8e91e46SDimitry Andric AnchorAddr = MAddrNext;
2196d8e91e46SDimitry Andric AnchorInst = &MINext;
2197d8e91e46SDimitry Andric }
2198d8e91e46SDimitry Andric }
2199d8e91e46SDimitry Andric
2200d8e91e46SDimitry Andric if (AnchorInst) {
2201d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2202d8e91e46SDimitry Andric AnchorInst->dump());
2203d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2204d8e91e46SDimitry Andric << AnchorAddr.Offset << "\n\n");
2205d8e91e46SDimitry Andric
2206d8e91e46SDimitry Andric // Instead of moving up, just re-compute anchor-instruction's base address.
2207cfca06d7SDimitry Andric Register Base = computeBase(MI, AnchorAddr);
2208d8e91e46SDimitry Andric
2209d8e91e46SDimitry Andric updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2210d8e91e46SDimitry Andric LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2211d8e91e46SDimitry Andric
2212ac9a064cSDimitry Andric for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2213d8e91e46SDimitry Andric TargetLoweringBase::AddrMode AM;
2214d8e91e46SDimitry Andric AM.HasBaseReg = true;
2215ac9a064cSDimitry Andric AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2216d8e91e46SDimitry Andric
2217ac9a064cSDimitry Andric if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2218ac9a064cSDimitry Andric LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2219ac9a064cSDimitry Andric OtherMI->dump());
2220ac9a064cSDimitry Andric updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221ac9a064cSDimitry Andric LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2222d8e91e46SDimitry Andric }
2223d8e91e46SDimitry Andric }
2224d8e91e46SDimitry Andric AnchorList.insert(AnchorInst);
2225d8e91e46SDimitry Andric return true;
2226d8e91e46SDimitry Andric }
2227d8e91e46SDimitry Andric
2228d8e91e46SDimitry Andric return false;
2229d8e91e46SDimitry Andric }
2230d8e91e46SDimitry Andric
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const22311d5ae102SDimitry Andric void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
22321d5ae102SDimitry Andric std::list<std::list<CombineInfo> > &MergeableInsts) const {
22331d5ae102SDimitry Andric for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2234706b4fc4SDimitry Andric if (AddrList.front().InstClass == CI.InstClass &&
22356f8fc217SDimitry Andric AddrList.front().IsAGPR == CI.IsAGPR &&
2236e3b55780SDimitry Andric AddrList.front().hasSameBaseAddress(CI)) {
22371d5ae102SDimitry Andric AddrList.emplace_back(CI);
22381d5ae102SDimitry Andric return;
22391d5ae102SDimitry Andric }
22401d5ae102SDimitry Andric }
224167c32a98SDimitry Andric
22421d5ae102SDimitry Andric // Base address not found, so add a new list.
22431d5ae102SDimitry Andric MergeableInsts.emplace_back(1, CI);
22441d5ae102SDimitry Andric }
22451d5ae102SDimitry Andric
2246cfca06d7SDimitry Andric std::pair<MachineBasicBlock::iterator, bool>
collectMergeableInsts(MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList,std::list<std::list<CombineInfo>> & MergeableInsts) const2247cfca06d7SDimitry Andric SILoadStoreOptimizer::collectMergeableInsts(
2248cfca06d7SDimitry Andric MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2249cfca06d7SDimitry Andric MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
22501d5ae102SDimitry Andric std::list<std::list<CombineInfo>> &MergeableInsts) const {
22511d5ae102SDimitry Andric bool Modified = false;
2252d8e91e46SDimitry Andric
22531d5ae102SDimitry Andric // Sort potential mergeable instructions into lists. One list per base address.
2254cfca06d7SDimitry Andric unsigned Order = 0;
2255cfca06d7SDimitry Andric MachineBasicBlock::iterator BlockI = Begin;
2256cfca06d7SDimitry Andric for (; BlockI != End; ++BlockI) {
2257cfca06d7SDimitry Andric MachineInstr &MI = *BlockI;
2258cfca06d7SDimitry Andric
22591d5ae102SDimitry Andric // We run this before checking if an address is mergeable, because it can produce
22601d5ae102SDimitry Andric // better code even if the instructions aren't mergeable.
2261d8e91e46SDimitry Andric if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2262d8e91e46SDimitry Andric Modified = true;
2263d8e91e46SDimitry Andric
2264ecbca9f5SDimitry Andric // Treat volatile accesses, ordered accesses and unmodeled side effects as
2265ecbca9f5SDimitry Andric // barriers. We can look after this barrier for separate merges.
2266ecbca9f5SDimitry Andric if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2267ecbca9f5SDimitry Andric LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2268cfca06d7SDimitry Andric
2269cfca06d7SDimitry Andric // Search will resume after this instruction in a separate merge list.
2270cfca06d7SDimitry Andric ++BlockI;
2271cfca06d7SDimitry Andric break;
2272cfca06d7SDimitry Andric }
2273cfca06d7SDimitry Andric
22741d5ae102SDimitry Andric const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
22751d5ae102SDimitry Andric if (InstClass == UNKNOWN)
22761d5ae102SDimitry Andric continue;
22771d5ae102SDimitry Andric
22786f8fc217SDimitry Andric // Do not merge VMEM buffer instructions with "swizzled" bit set.
22796f8fc217SDimitry Andric int Swizzled =
22806f8fc217SDimitry Andric AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
22816f8fc217SDimitry Andric if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
22826f8fc217SDimitry Andric continue;
22836f8fc217SDimitry Andric
22841d5ae102SDimitry Andric CombineInfo CI;
22856f8fc217SDimitry Andric CI.setMI(MI, *this);
2286cfca06d7SDimitry Andric CI.Order = Order++;
22871d5ae102SDimitry Andric
22881d5ae102SDimitry Andric if (!CI.hasMergeableAddress(*MRI))
22891d5ae102SDimitry Andric continue;
22901d5ae102SDimitry Andric
22916f8fc217SDimitry Andric if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
22926f8fc217SDimitry Andric // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
22936f8fc217SDimitry Andric // operands. However we are reporting that ds_write2 shall have
22946f8fc217SDimitry Andric // only VGPR data so that machine copy propagation does not
22956f8fc217SDimitry Andric // create an illegal instruction with a VGPR and AGPR sources.
22966f8fc217SDimitry Andric // Consequenctially if we create such instruction the verifier
22976f8fc217SDimitry Andric // will complain.
22986f8fc217SDimitry Andric continue;
22996f8fc217SDimitry Andric }
23006f8fc217SDimitry Andric
2301cfca06d7SDimitry Andric LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2302cfca06d7SDimitry Andric
23031d5ae102SDimitry Andric addInstToMergeableList(CI, MergeableInsts);
23041d5ae102SDimitry Andric }
2305cfca06d7SDimitry Andric
2306cfca06d7SDimitry Andric // At this point we have lists of Mergeable instructions.
2307cfca06d7SDimitry Andric //
2308cfca06d7SDimitry Andric // Part 2: Sort lists by offset and then for each CombineInfo object in the
2309cfca06d7SDimitry Andric // list try to find an instruction that can be merged with I. If an instruction
2310cfca06d7SDimitry Andric // is found, it is stored in the Paired field. If no instructions are found, then
2311cfca06d7SDimitry Andric // the CombineInfo object is deleted from the list.
2312cfca06d7SDimitry Andric
2313cfca06d7SDimitry Andric for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2314cfca06d7SDimitry Andric E = MergeableInsts.end(); I != E;) {
2315cfca06d7SDimitry Andric
2316cfca06d7SDimitry Andric std::list<CombineInfo> &MergeList = *I;
2317cfca06d7SDimitry Andric if (MergeList.size() <= 1) {
2318cfca06d7SDimitry Andric // This means we have found only one instruction with a given address
2319cfca06d7SDimitry Andric // that can be merged, and we need at least 2 instructions to do a merge,
2320cfca06d7SDimitry Andric // so this list can be discarded.
2321cfca06d7SDimitry Andric I = MergeableInsts.erase(I);
2322cfca06d7SDimitry Andric continue;
2323cfca06d7SDimitry Andric }
2324cfca06d7SDimitry Andric
2325cfca06d7SDimitry Andric // Sort the lists by offsets, this way mergeable instructions will be
2326cfca06d7SDimitry Andric // adjacent to each other in the list, which will make it easier to find
2327cfca06d7SDimitry Andric // matches.
2328cfca06d7SDimitry Andric MergeList.sort(
2329c0981da4SDimitry Andric [] (const CombineInfo &A, const CombineInfo &B) {
2330cfca06d7SDimitry Andric return A.Offset < B.Offset;
2331cfca06d7SDimitry Andric });
2332cfca06d7SDimitry Andric ++I;
2333cfca06d7SDimitry Andric }
2334cfca06d7SDimitry Andric
2335ac9a064cSDimitry Andric return {BlockI, Modified};
23361d5ae102SDimitry Andric }
23371d5ae102SDimitry Andric
23381d5ae102SDimitry Andric // Scan through looking for adjacent LDS operations with constant offsets from
23391d5ae102SDimitry Andric // the same base register. We rely on the scheduler to do the hard work of
23401d5ae102SDimitry Andric // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)23411d5ae102SDimitry Andric bool SILoadStoreOptimizer::optimizeBlock(
23421d5ae102SDimitry Andric std::list<std::list<CombineInfo> > &MergeableInsts) {
23431d5ae102SDimitry Andric bool Modified = false;
23441d5ae102SDimitry Andric
2345cfca06d7SDimitry Andric for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2346cfca06d7SDimitry Andric E = MergeableInsts.end(); I != E;) {
2347cfca06d7SDimitry Andric std::list<CombineInfo> &MergeList = *I;
23481d5ae102SDimitry Andric
23491d5ae102SDimitry Andric bool OptimizeListAgain = false;
23501d5ae102SDimitry Andric if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2351cfca06d7SDimitry Andric // We weren't able to make any changes, so delete the list so we don't
23521d5ae102SDimitry Andric // process the same instructions the next time we try to optimize this
23531d5ae102SDimitry Andric // block.
2354cfca06d7SDimitry Andric I = MergeableInsts.erase(I);
235567c32a98SDimitry Andric continue;
235667c32a98SDimitry Andric }
235767c32a98SDimitry Andric
2358cfca06d7SDimitry Andric Modified = true;
2359cfca06d7SDimitry Andric
23601d5ae102SDimitry Andric // We made changes, but also determined that there were no more optimization
23611d5ae102SDimitry Andric // opportunities, so we don't need to reprocess the list
2362cfca06d7SDimitry Andric if (!OptimizeListAgain) {
2363cfca06d7SDimitry Andric I = MergeableInsts.erase(I);
2364cfca06d7SDimitry Andric continue;
2365cfca06d7SDimitry Andric }
2366cfca06d7SDimitry Andric OptimizeAgain = true;
23671d5ae102SDimitry Andric }
23681d5ae102SDimitry Andric return Modified;
23691d5ae102SDimitry Andric }
23701d5ae102SDimitry Andric
// Walk a list of CombineInfos that all share the same base address (sorted by
// offset) and try to merge adjacent pairs into a single wider access.
// Returns true if at least one pair was merged. OptimizeListAgain is set when
// a merged access is still narrower than the class's maximum width, i.e. the
// result could potentially be merged again on a later pass.
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  // Consider consecutive (offset-adjacent) elements I and Next as a candidate
  // pair; Next is recomputed from I after each iteration since the list is
  // mutated below.
  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    // Order First/Second by program order (CombineInfo::Order), not by
    // offset, so CI below is always the earlier instruction.
    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    // Where designates the CombineInfo whose position (Where->I) and Order
    // the merged instruction takes over; nullptr means this pair cannot be
    // merged.
    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    // Emit the class-specific merged instruction. For the variable-width
    // classes, request another optimization round while the combined width
    // is still below the class maximum (8 dwords for SMEM, 4 otherwise).
    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    // Reuse CI (the surviving list element) to describe the merged
    // instruction, inheriting the insertion point's program order.
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    // If the swap above made Second alias I, move I to the surviving element
    // before erasing Second so the loop iterator does not dangle.
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
246167c32a98SDimitry Andric
runOnMachineFunction(MachineFunction & MF)246267c32a98SDimitry Andric bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2463044eb2f6SDimitry Andric if (skipFunction(MF.getFunction()))
246401095a5dSDimitry Andric return false;
246501095a5dSDimitry Andric
2466eb11fae6SDimitry Andric STM = &MF.getSubtarget<GCNSubtarget>();
2467044eb2f6SDimitry Andric if (!STM->loadStoreOptEnabled())
246801095a5dSDimitry Andric return false;
246901095a5dSDimitry Andric
2470044eb2f6SDimitry Andric TII = STM->getInstrInfo();
247101095a5dSDimitry Andric TRI = &TII->getRegisterInfo();
247201095a5dSDimitry Andric
247367c32a98SDimitry Andric MRI = &MF.getRegInfo();
2474b915e9e0SDimitry Andric AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
247567c32a98SDimitry Andric
2476eb11fae6SDimitry Andric LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
247767c32a98SDimitry Andric
247867c32a98SDimitry Andric bool Modified = false;
247967c32a98SDimitry Andric
2480cfca06d7SDimitry Andric // Contains the list of instructions for which constant offsets are being
2481cfca06d7SDimitry Andric // promoted to the IMM. This is tracked for an entire block at time.
2482cfca06d7SDimitry Andric SmallPtrSet<MachineInstr *, 4> AnchorList;
2483cfca06d7SDimitry Andric MemInfoMap Visited;
24841d5ae102SDimitry Andric
2485044eb2f6SDimitry Andric for (MachineBasicBlock &MBB : MF) {
2486cfca06d7SDimitry Andric MachineBasicBlock::iterator SectionEnd;
2487cfca06d7SDimitry Andric for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2488cfca06d7SDimitry Andric I = SectionEnd) {
2489cfca06d7SDimitry Andric bool CollectModified;
24901d5ae102SDimitry Andric std::list<std::list<CombineInfo>> MergeableInsts;
2491cfca06d7SDimitry Andric
2492cfca06d7SDimitry Andric // First pass: Collect list of all instructions we know how to merge in a
2493cfca06d7SDimitry Andric // subset of the block.
2494cfca06d7SDimitry Andric std::tie(SectionEnd, CollectModified) =
2495cfca06d7SDimitry Andric collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2496cfca06d7SDimitry Andric
2497cfca06d7SDimitry Andric Modified |= CollectModified;
2498cfca06d7SDimitry Andric
2499d8e91e46SDimitry Andric do {
2500d8e91e46SDimitry Andric OptimizeAgain = false;
25011d5ae102SDimitry Andric Modified |= optimizeBlock(MergeableInsts);
2502d8e91e46SDimitry Andric } while (OptimizeAgain);
2503044eb2f6SDimitry Andric }
2504044eb2f6SDimitry Andric
2505cfca06d7SDimitry Andric Visited.clear();
2506cfca06d7SDimitry Andric AnchorList.clear();
2507cfca06d7SDimitry Andric }
2508cfca06d7SDimitry Andric
250967c32a98SDimitry Andric return Modified;
251067c32a98SDimitry Andric }
2511