xref: /src/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1044eb2f6SDimitry Andric //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
267c32a98SDimitry Andric //
3e6d15924SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e6d15924SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5e6d15924SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
667c32a98SDimitry Andric //
767c32a98SDimitry Andric //===----------------------------------------------------------------------===//
867c32a98SDimitry Andric //
967c32a98SDimitry Andric // This pass tries to fuse DS instructions with close by immediate offsets.
1067c32a98SDimitry Andric // This will fuse operations such as
1167c32a98SDimitry Andric //  ds_read_b32 v0, v2 offset:16
1267c32a98SDimitry Andric //  ds_read_b32 v1, v2 offset:32
1367c32a98SDimitry Andric // ==>
1467c32a98SDimitry Andric //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
1567c32a98SDimitry Andric //
16044eb2f6SDimitry Andric // The same is done for certain SMEM and VMEM opcodes, e.g.:
17044eb2f6SDimitry Andric //  s_buffer_load_dword s4, s[0:3], 4
18044eb2f6SDimitry Andric //  s_buffer_load_dword s5, s[0:3], 8
19044eb2f6SDimitry Andric // ==>
20044eb2f6SDimitry Andric //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21044eb2f6SDimitry Andric //
22d8e91e46SDimitry Andric // This pass also tries to promote constant offset to the immediate by
23d8e91e46SDimitry Andric // adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
25d8e91e46SDimitry Andric // to the immediate.
26d8e91e46SDimitry Andric // E.g.
27d8e91e46SDimitry Andric //  s_movk_i32 s0, 0x1800
28d8e91e46SDimitry Andric //  v_add_co_u32_e32 v0, vcc, s0, v2
29d8e91e46SDimitry Andric //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30d8e91e46SDimitry Andric //
31d8e91e46SDimitry Andric //  s_movk_i32 s0, 0x1000
32d8e91e46SDimitry Andric //  v_add_co_u32_e32 v5, vcc, s0, v2
33d8e91e46SDimitry Andric //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34d8e91e46SDimitry Andric //  global_load_dwordx2 v[5:6], v[5:6], off
35d8e91e46SDimitry Andric //  global_load_dwordx2 v[0:1], v[0:1], off
36d8e91e46SDimitry Andric // =>
37d8e91e46SDimitry Andric //  s_movk_i32 s0, 0x1000
38d8e91e46SDimitry Andric //  v_add_co_u32_e32 v5, vcc, s0, v2
39d8e91e46SDimitry Andric //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40d8e91e46SDimitry Andric //  global_load_dwordx2 v[5:6], v[5:6], off
41d8e91e46SDimitry Andric //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
4267c32a98SDimitry Andric //
4367c32a98SDimitry Andric // Future improvements:
4467c32a98SDimitry Andric //
451d5ae102SDimitry Andric // - This is currently missing stores of constants because loading
4667c32a98SDimitry Andric //   the constant into the data register is placed between the stores, although
4767c32a98SDimitry Andric //   this is arguably a scheduling problem.
4867c32a98SDimitry Andric //
4967c32a98SDimitry Andric // - Live interval recomputing seems inefficient. This currently only matches
5067c32a98SDimitry Andric //   one pair, and recomputes live intervals and moves on to the next pair. It
5101095a5dSDimitry Andric //   would be better to compute a list of all merges that need to occur.
5267c32a98SDimitry Andric //
5367c32a98SDimitry Andric // - With a list of instructions to process, we can also merge more. If a
5467c32a98SDimitry Andric //   cluster of loads have offsets that are too large to fit in the 8-bit
5567c32a98SDimitry Andric //   offsets, but are close enough to fit in the 8 bits, we can add to the base
5667c32a98SDimitry Andric //   pointer and use the new reduced offsets.
5767c32a98SDimitry Andric //
5867c32a98SDimitry Andric //===----------------------------------------------------------------------===//
5967c32a98SDimitry Andric 
6067c32a98SDimitry Andric #include "AMDGPU.h"
61b60736ecSDimitry Andric #include "GCNSubtarget.h"
62d8e91e46SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
6371d5a254SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
6467c32a98SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
65706b4fc4SDimitry Andric #include "llvm/InitializePasses.h"
6667c32a98SDimitry Andric 
6767c32a98SDimitry Andric using namespace llvm;
6867c32a98SDimitry Andric 
6967c32a98SDimitry Andric #define DEBUG_TYPE "si-load-store-opt"
7067c32a98SDimitry Andric 
7167c32a98SDimitry Andric namespace {
// Kinds of memory instructions this pass knows how to merge. Two candidate
// instructions are classified before pairing; UNKNOWN marks opcodes the pass
// will not touch.
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};
9271d5a254SDimitry Andric 
// Describes which address operands a given instruction class carries.
// NumVAddrs counts vaddr components (image instructions may have several);
// each bool flags the presence of the correspondingly named address operand.
struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};
103d8e91e46SDimitry Andric 
// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
// This bounds the per-instruction address-operand arrays in CombineInfo.
const unsigned MaxAddressRegs = 12 + 1 + 1;
106cfca06d7SDimitry Andric 
class SILoadStoreOptimizer : public MachineFunctionPass {
  /// Everything the pass records about one candidate memory instruction when
  /// deciding whether and how to merge it with another.
  struct CombineInfo {
    MachineBasicBlock::iterator I; // The candidate instruction.
    unsigned EltSize;              // Element size used for offset math.
    unsigned Offset;               // Immediate offset of the access.
    unsigned Width;                // Access width (see getOpcodeWidth).
    unsigned Format;               // MTBUF format, where applicable.
    unsigned BaseOff;
    unsigned DMask;                // Image dmask, where applicable.
    InstClassEnum InstClass;
    unsigned CPol = 0;             // Cache-policy bits carried by the opcode.
    bool IsAGPR;
    bool UseST64;
    // Operand indices / operands of the instruction's address components;
    // bounded by MaxAddressRegs, populated for NumAddresses entries.
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;                // Position within the merge list.

    /// Returns true if CI addresses the same base as this entry: same number
    /// of address components, and each component is either an equal immediate
    /// or the identical register (including subregister).
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        // If either side is an immediate, both must be immediates with the
        // same value. (The isImm() mismatch check short-circuits before
        // getImm() is touched on a non-immediate operand.)
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
         return false;
        }
      }
      return true;
    }

    /// Returns true if every address component of this instruction is a kind
    /// the merger can reason about (immediate, or a virtual/null register
    /// with more than one use).
    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical reg
        // addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    /// Populates this CombineInfo from MI (defined out of line).
    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order: images sort by dmask, everything else by
    // immediate offset.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  /// A 64-bit base address split into its low/high 32-bit register halves.
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  /// A decomposed memory address: register base plus constant offset.
  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  // Target/function context, initialized in runOnMachineFunction.
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain; // Set when a merge produces new merge opportunities.

  /// Whether A and B can be reordered without changing observable behavior,
  /// given A's register defs/uses.
  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  // Helpers used by the merge* routines to wire the merged instruction's
  // data register to the original destinations/sources.
  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  // One merge routine per instruction class; each returns an iterator to the
  // newly created merged instruction.
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  // Support for promoting constant offsets into the instruction immediate by
  // rewriting the base (see the file header comment).
  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
      .set(MachineFunctionProperties::Property::IsSSA);
  }
};
32567c32a98SDimitry Andric 
getOpcodeWidth(const MachineInstr & MI,const SIInstrInfo & TII)3261d5ae102SDimitry Andric static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
3271d5ae102SDimitry Andric   const unsigned Opc = MI.getOpcode();
3281d5ae102SDimitry Andric 
3291d5ae102SDimitry Andric   if (TII.isMUBUF(Opc)) {
3301d5ae102SDimitry Andric     // FIXME: Handle d16 correctly
3311d5ae102SDimitry Andric     return AMDGPU::getMUBUFElements(Opc);
3321d5ae102SDimitry Andric   }
333312c0ed1SDimitry Andric   if (TII.isImage(MI)) {
3341d5ae102SDimitry Andric     uint64_t DMaskImm =
3351d5ae102SDimitry Andric         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
336e3b55780SDimitry Andric     return llvm::popcount(DMaskImm);
3371d5ae102SDimitry Andric   }
338706b4fc4SDimitry Andric   if (TII.isMTBUF(Opc)) {
339706b4fc4SDimitry Andric     return AMDGPU::getMTBUFElements(Opc);
340706b4fc4SDimitry Andric   }
3411d5ae102SDimitry Andric 
3421d5ae102SDimitry Andric   switch (Opc) {
3431d5ae102SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
346145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
347145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
349145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
350145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
351145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
3521d5ae102SDimitry Andric     return 1;
3531d5ae102SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
356ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
358145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
359145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
360145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
361145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
362145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
3631d5ae102SDimitry Andric     return 2;
364312c0ed1SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365312c0ed1SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
366312c0ed1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
367ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
369145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
370145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
371145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
372145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
373145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
374145449b1SDimitry Andric     return 3;
3751d5ae102SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
377e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
378ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
380145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
381145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
382145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
383145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
384145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
3851d5ae102SDimitry Andric     return 4;
386c0981da4SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
388e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
389ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
390c0981da4SDimitry Andric     return 8;
391ac9a064cSDimitry Andric   case AMDGPU::DS_READ_B32:
392ac9a064cSDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
393ac9a064cSDimitry Andric   case AMDGPU::DS_WRITE_B32:
394344a3780SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
395344a3780SDimitry Andric     return 1;
396ac9a064cSDimitry Andric   case AMDGPU::DS_READ_B64:
397ac9a064cSDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
398ac9a064cSDimitry Andric   case AMDGPU::DS_WRITE_B64:
399344a3780SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
400344a3780SDimitry Andric     return 2;
4011d5ae102SDimitry Andric   default:
4021d5ae102SDimitry Andric     return 0;
4031d5ae102SDimitry Andric   }
4041d5ae102SDimitry Andric }
4051d5ae102SDimitry Andric 
4061d5ae102SDimitry Andric /// Maps instruction opcode to enum InstClassEnum.
getInstClass(unsigned Opc,const SIInstrInfo & TII)4071d5ae102SDimitry Andric static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
4081d5ae102SDimitry Andric   switch (Opc) {
4091d5ae102SDimitry Andric   default:
4101d5ae102SDimitry Andric     if (TII.isMUBUF(Opc)) {
4111d5ae102SDimitry Andric       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
4121d5ae102SDimitry Andric       default:
4131d5ae102SDimitry Andric         return UNKNOWN;
414ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
415ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
416ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
417ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
4181d5ae102SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
4191d5ae102SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
4201d5ae102SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
4211d5ae102SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
422ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
423ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
424ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
425ac9a064cSDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
426312c0ed1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
427312c0ed1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
428312c0ed1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
429312c0ed1SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
4301d5ae102SDimitry Andric         return BUFFER_LOAD;
431ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
432ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
433ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
434ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
4351d5ae102SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
4361d5ae102SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
4371d5ae102SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
4381d5ae102SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
439ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
440ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
441ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
442ac9a064cSDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
443312c0ed1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
444312c0ed1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
445312c0ed1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
446312c0ed1SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
4471d5ae102SDimitry Andric         return BUFFER_STORE;
4481d5ae102SDimitry Andric       }
4491d5ae102SDimitry Andric     }
450312c0ed1SDimitry Andric     if (TII.isImage(Opc)) {
4511d5ae102SDimitry Andric       // Ignore instructions encoded without vaddr.
452e3b55780SDimitry Andric       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
453e3b55780SDimitry Andric           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
4541d5ae102SDimitry Andric         return UNKNOWN;
455c0981da4SDimitry Andric       // Ignore BVH instructions
456c0981da4SDimitry Andric       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
457c0981da4SDimitry Andric         return UNKNOWN;
4581d5ae102SDimitry Andric       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
459706b4fc4SDimitry Andric       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
460706b4fc4SDimitry Andric           TII.isGather4(Opc))
4611d5ae102SDimitry Andric         return UNKNOWN;
4621d5ae102SDimitry Andric       return MIMG;
4631d5ae102SDimitry Andric     }
464706b4fc4SDimitry Andric     if (TII.isMTBUF(Opc)) {
465706b4fc4SDimitry Andric       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
466706b4fc4SDimitry Andric       default:
467706b4fc4SDimitry Andric         return UNKNOWN;
468312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
469312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
470312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
471312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
472706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
473706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
474706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
475706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
476312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
477312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
478312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
479312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
480312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
481312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
482312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
483312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
484706b4fc4SDimitry Andric         return TBUFFER_LOAD;
485706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
486706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
487706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
488706b4fc4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
489312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
490312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
491312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
492312c0ed1SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
493706b4fc4SDimitry Andric         return TBUFFER_STORE;
494706b4fc4SDimitry Andric       }
495706b4fc4SDimitry Andric     }
4961d5ae102SDimitry Andric     return UNKNOWN;
4971d5ae102SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
4981d5ae102SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
499312c0ed1SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
5001d5ae102SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501c0981da4SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
5021d5ae102SDimitry Andric     return S_BUFFER_LOAD_IMM;
503e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505312c0ed1SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507e3b55780SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
508e3b55780SDimitry Andric     return S_BUFFER_LOAD_SGPR_IMM;
509e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
510e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
511312c0ed1SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
512e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
513e3b55780SDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
514ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517ac9a064cSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
518e3b55780SDimitry Andric     return S_LOAD_IMM;
5191d5ae102SDimitry Andric   case AMDGPU::DS_READ_B32:
5201d5ae102SDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
5211d5ae102SDimitry Andric   case AMDGPU::DS_READ_B64:
5221d5ae102SDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
5231d5ae102SDimitry Andric     return DS_READ;
5241d5ae102SDimitry Andric   case AMDGPU::DS_WRITE_B32:
5251d5ae102SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
5261d5ae102SDimitry Andric   case AMDGPU::DS_WRITE_B64:
5271d5ae102SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
5281d5ae102SDimitry Andric     return DS_WRITE;
529145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
530145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
531145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
532145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
533145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
534145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
535145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
536145449b1SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
537145449b1SDimitry Andric     return FLAT_LOAD;
538145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
539145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
540145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
541145449b1SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
542145449b1SDimitry Andric     return GLOBAL_LOAD_SADDR;
543145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
544145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
545145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
546145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
547145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
548145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
549145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
550145449b1SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
551145449b1SDimitry Andric     return FLAT_STORE;
552145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
553145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
554145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
555145449b1SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
556145449b1SDimitry Andric     return GLOBAL_STORE_SADDR;
5571d5ae102SDimitry Andric   }
5581d5ae102SDimitry Andric }
5591d5ae102SDimitry Andric 
5601d5ae102SDimitry Andric /// Determines instruction subclass from opcode. Only instructions
561145449b1SDimitry Andric /// of the same subclass can be merged together. The merged instruction may have
562145449b1SDimitry Andric /// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    // For buffer and image instructions the subclass is the
    // target-independent base opcode, so addressing-mode variants (e.g.
    // _OFFEN vs _OFFSET, _exact) of the same operation share a subclass.
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  // Each DS opcode is its own subclass: b32 and b64 forms (and the gfx9
  // variants) are never merged with one another.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  // All widths of an SMEM load family map to the single-dword opcode, so
  // loads of different widths from the same family can be paired.
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  // The early-clobber (_ec) variants share the subclass of the plain forms.
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  // GLOBAL and non-segment FLAT accesses share the FLAT subclass, so a
  // GLOBAL access may be merged with a FLAT one (the result is FLAT).
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}
6371d5ae102SDimitry Andric 
638145449b1SDimitry Andric // GLOBAL loads and stores are classified as FLAT initially. If both combined
639145449b1SDimitry Andric // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
640145449b1SDimitry Andric // If either or both instructions are non segment specific FLAT the resulting
641145449b1SDimitry Andric // combined operation will be FLAT, potentially promoting one of the GLOBAL
642145449b1SDimitry Andric // operations to FLAT.
643145449b1SDimitry Andric // For other instructions return the original unmodified class.
644145449b1SDimitry Andric InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)645145449b1SDimitry Andric SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
646145449b1SDimitry Andric                                          const CombineInfo &Paired) {
647145449b1SDimitry Andric   assert(CI.InstClass == Paired.InstClass);
648145449b1SDimitry Andric 
649145449b1SDimitry Andric   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
650145449b1SDimitry Andric       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
651145449b1SDimitry Andric     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
652145449b1SDimitry Andric 
653145449b1SDimitry Andric   return CI.InstClass;
654145449b1SDimitry Andric }
655145449b1SDimitry Andric 
/// Determine which address operands opcode \p Opc carries. The returned
/// AddressRegs flags (plus NumVAddrs for image instructions) drive which
/// operand indices CombineInfo::setMI records for address comparison.
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    // Image instructions may carry a variable number of vaddr operands; they
    // occupy the contiguous operand slots between vaddr0 and the rsrc/srsrc
    // operand.
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    // Sampling instructions additionally carry a sampler descriptor.
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    // Opcodes with no recognized address operands.
    return Result;
  // The _SGPR_IMM forms have an soffset register in addition to the sbase
  // shared with the plain _IMM forms below.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  // LDS accesses address through a single VGPR operand.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  // The _SADDR forms have a scalar base address in addition to the vaddr
  // shared with the plain forms below.
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
7641d5ae102SDimitry Andric 
/// Initialize this CombineInfo from machine instruction \p MI: classify the
/// instruction and cache the fields (element size, offset, format, width,
/// cache policy, address operand indices) that the pairing logic compares.
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  // Unmergeable instruction; leave the remaining fields untouched.
  if (InstClass == UNKNOWN)
    return;

  // Record whether the data operand lives in an AGPR register class.
  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
   EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
   break;
  case DS_WRITE:
    EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    // SMEM offsets are expressed in subtarget-dependent units; convert a
    // 4-byte element accordingly.
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    // Keep only the 16-bit DS offset field.
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  // VIMAGE/VSAMPLE use the 'rsrc'/'samp' operand names rather than
  // 'srsrc'/'ssamp'.
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  // Collect the operand indices of every address component, in a fixed order
  // so two CombineInfos can be compared position by position.
  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  // Cache pointers to the operands themselves for fast comparison later.
  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
8501d5ae102SDimitry Andric 
85171d5a254SDimitry Andric } // end anonymous namespace.
85267c32a98SDimitry Andric 
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
// The pass consults alias analysis (see canSwapInstructions) when deciding
// whether two memory instructions may be reordered past each other.
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

// Externally visible pass identity handle.
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
86267c32a98SDimitry Andric 
// Factory function returning a fresh instance of this pass.
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
86667c32a98SDimitry Andric 
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)867eb11fae6SDimitry Andric static void addDefsUsesToList(const MachineInstr &MI,
868cfca06d7SDimitry Andric                               DenseSet<Register> &RegDefs,
869145449b1SDimitry Andric                               DenseSet<Register> &RegUses) {
870145449b1SDimitry Andric   for (const auto &Op : MI.operands()) {
871145449b1SDimitry Andric     if (!Op.isReg())
872145449b1SDimitry Andric       continue;
873eb11fae6SDimitry Andric     if (Op.isDef())
874eb11fae6SDimitry Andric       RegDefs.insert(Op.getReg());
875145449b1SDimitry Andric     if (Op.readsReg())
876145449b1SDimitry Andric       RegUses.insert(Op.getReg());
877b915e9e0SDimitry Andric   }
878b915e9e0SDimitry Andric }
879b915e9e0SDimitry Andric 
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const880145449b1SDimitry Andric bool SILoadStoreOptimizer::canSwapInstructions(
881145449b1SDimitry Andric     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
882145449b1SDimitry Andric     const MachineInstr &A, const MachineInstr &B) const {
883145449b1SDimitry Andric   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
884145449b1SDimitry Andric       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
885b915e9e0SDimitry Andric     return false;
886145449b1SDimitry Andric   for (const auto &BOp : B.operands()) {
887145449b1SDimitry Andric     if (!BOp.isReg())
888b915e9e0SDimitry Andric       continue;
889145449b1SDimitry Andric     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
890145449b1SDimitry Andric       return false;
891145449b1SDimitry Andric     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
892b915e9e0SDimitry Andric       return false;
893b915e9e0SDimitry Andric   }
894b915e9e0SDimitry Andric   return true;
895b915e9e0SDimitry Andric }
896b915e9e0SDimitry Andric 
// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  // The merged access covers the bytes of both original accesses.
  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  // Clone the leading MMO with the combined pointer info and size.
  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}
9201d5ae102SDimitry Andric 
dmasksCanBeCombined(const CombineInfo & CI,const SIInstrInfo & TII,const CombineInfo & Paired)921706b4fc4SDimitry Andric bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
922706b4fc4SDimitry Andric                                                const SIInstrInfo &TII,
923706b4fc4SDimitry Andric                                                const CombineInfo &Paired) {
9241d5ae102SDimitry Andric   assert(CI.InstClass == MIMG);
9251d5ae102SDimitry Andric 
9261d5ae102SDimitry Andric   // Ignore instructions with tfe/lwe set.
9271d5ae102SDimitry Andric   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
9281d5ae102SDimitry Andric   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
9291d5ae102SDimitry Andric 
9301d5ae102SDimitry Andric   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
9311d5ae102SDimitry Andric     return false;
9321d5ae102SDimitry Andric 
9331d5ae102SDimitry Andric   // Check other optional immediate operands for equality.
934344a3780SDimitry Andric   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
935344a3780SDimitry Andric                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
936344a3780SDimitry Andric                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
9371d5ae102SDimitry Andric 
9381d5ae102SDimitry Andric   for (auto op : OperandsToMatch) {
9391d5ae102SDimitry Andric     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
940706b4fc4SDimitry Andric     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
9411d5ae102SDimitry Andric       return false;
9421d5ae102SDimitry Andric     if (Idx != -1 &&
943706b4fc4SDimitry Andric         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
9441d5ae102SDimitry Andric       return false;
9451d5ae102SDimitry Andric   }
9461d5ae102SDimitry Andric 
9471d5ae102SDimitry Andric   // Check DMask for overlaps.
948706b4fc4SDimitry Andric   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
949706b4fc4SDimitry Andric   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
9501d5ae102SDimitry Andric 
951b1c73532SDimitry Andric   if (!MaxMask)
952b1c73532SDimitry Andric     return false;
953b1c73532SDimitry Andric 
9547fa27ce4SDimitry Andric   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
9551d5ae102SDimitry Andric   if ((1u << AllowedBitsForMin) <= MinMask)
9561d5ae102SDimitry Andric     return false;
9571d5ae102SDimitry Andric 
9581d5ae102SDimitry Andric   return true;
9591d5ae102SDimitry Andric }
9601d5ae102SDimitry Andric 
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)961706b4fc4SDimitry Andric static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
962706b4fc4SDimitry Andric                                        unsigned ComponentCount,
963cfca06d7SDimitry Andric                                        const GCNSubtarget &STI) {
964706b4fc4SDimitry Andric   if (ComponentCount > 4)
965706b4fc4SDimitry Andric     return 0;
966706b4fc4SDimitry Andric 
967706b4fc4SDimitry Andric   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
968706b4fc4SDimitry Andric       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
969706b4fc4SDimitry Andric   if (!OldFormatInfo)
970706b4fc4SDimitry Andric     return 0;
971706b4fc4SDimitry Andric 
972706b4fc4SDimitry Andric   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
973706b4fc4SDimitry Andric       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
974706b4fc4SDimitry Andric                                            ComponentCount,
975706b4fc4SDimitry Andric                                            OldFormatInfo->NumFormat, STI);
976706b4fc4SDimitry Andric 
977706b4fc4SDimitry Andric   if (!NewFormatInfo)
978706b4fc4SDimitry Andric     return 0;
979706b4fc4SDimitry Andric 
980706b4fc4SDimitry Andric   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
981706b4fc4SDimitry Andric          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
982706b4fc4SDimitry Andric 
983706b4fc4SDimitry Andric   return NewFormatInfo->Format;
984706b4fc4SDimitry Andric }
985706b4fc4SDimitry Andric 
986344a3780SDimitry Andric // Return the value in the inclusive range [Lo,Hi] that is aligned to the
987344a3780SDimitry Andric // highest power of two. Note that the result is well defined for all inputs
988344a3780SDimitry Andric // including corner cases like:
989344a3780SDimitry Andric // - if Lo == Hi, return that value
990344a3780SDimitry Andric // - if Lo == 0, return 0 (even though the "- 1" below underflows
991344a3780SDimitry Andric // - if Lo > Hi, return 0 (as if the range wrapped around)
mostAlignedValueInRange(uint32_t Lo,uint32_t Hi)992344a3780SDimitry Andric static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
9937fa27ce4SDimitry Andric   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
994344a3780SDimitry Andric }
995344a3780SDimitry Andric 
// Decide whether CI and Paired access memory that can be covered by one
// merged instruction. For non-DS instruction classes this is a pure query.
// For DS_READ/DS_WRITE, when Modify is true the offsets in CI/Paired are
// rewritten into the encoding the merged read2/write2 will use (possibly
// selecting the ST64 variants and/or storing a new base in CI.BaseOff).
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  // MIMG instructions are combined on dmask, not offsets; handled elsewhere.
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    // Typed buffer accesses can only merge when both instructions use the
    // same bits-per-component and number format, and a format with the
    // combined component count actually exists for this subtarget.
    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  // Work in units of elements from here on.
  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    // The two accesses must be immediately adjacent (in either order).
    if (EltOffset0 + CI.Width != EltOffset1 &&
            EltOffset1 + Paired.Width != EltOffset0)
      return false;
    // Cache-policy bits must match, or merging would change the policy
    // applied to one of the original accesses.
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // DS instructions from here on. Try the encodings in order of preference.

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  // ST64 with a rebased address: the offsets' difference must be expressible
  // as an 8-bit count of 64-element strides.
  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Plain encoding with a rebased address: the offsets just need to be
  // within 8 bits of each other.
  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}
111971d5a254SDimitry Andric 
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)1120d8e91e46SDimitry Andric bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121706b4fc4SDimitry Andric                                      const CombineInfo &CI,
1122706b4fc4SDimitry Andric                                      const CombineInfo &Paired) {
1123706b4fc4SDimitry Andric   const unsigned Width = (CI.Width + Paired.Width);
1124d8e91e46SDimitry Andric   switch (CI.InstClass) {
1125d8e91e46SDimitry Andric   default:
1126d8e91e46SDimitry Andric     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1127d8e91e46SDimitry Andric   case S_BUFFER_LOAD_IMM:
1128e3b55780SDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
1129e3b55780SDimitry Andric   case S_LOAD_IMM:
1130d8e91e46SDimitry Andric     switch (Width) {
1131d8e91e46SDimitry Andric     default:
1132d8e91e46SDimitry Andric       return false;
1133d8e91e46SDimitry Andric     case 2:
1134d8e91e46SDimitry Andric     case 4:
1135c0981da4SDimitry Andric     case 8:
1136d8e91e46SDimitry Andric       return true;
1137312c0ed1SDimitry Andric     case 3:
1138312c0ed1SDimitry Andric       return STM.hasScalarDwordx3Loads();
1139d8e91e46SDimitry Andric     }
1140d8e91e46SDimitry Andric   }
1141d8e91e46SDimitry Andric }
1142d8e91e46SDimitry Andric 
1143344a3780SDimitry Andric const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1144344a3780SDimitry Andric SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145344a3780SDimitry Andric   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146344a3780SDimitry Andric     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1147344a3780SDimitry Andric   }
1148344a3780SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149344a3780SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1150344a3780SDimitry Andric   }
1151344a3780SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152344a3780SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1153344a3780SDimitry Andric   }
1154344a3780SDimitry Andric   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155344a3780SDimitry Andric     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1156344a3780SDimitry Andric   }
1157344a3780SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158344a3780SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1159344a3780SDimitry Andric   }
1160344a3780SDimitry Andric   return nullptr;
1161344a3780SDimitry Andric }
1162344a3780SDimitry Andric 
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // Only instructions of the same subclass can be merged together.
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  // Decide where the merged instruction will go: loads are hoisted up to CI,
  // stores are sunk down to Paired. Either way, the instruction being moved
  // must be swappable with every instruction strictly between the pair.
  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    // Walk backwards from just above Paired down to (but excluding) CI.
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    // Walk forwards from just below CI up to (but excluding) Paired.
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction.  This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
121767c32a98SDimitry Andric 
// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  // Subregister indices selecting each original value within DestReg.
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      // Kill DestReg on the last use of the merged value.
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}
1247ac9a064cSDimitry Andric 
1248ac9a064cSDimitry Andric // Return a register for the source of the merged store after copying the
1249ac9a064cSDimitry Andric // original source regs of CI and Paired into it.
1250ac9a064cSDimitry Andric Register
copyFromSrcRegs(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore,int OpName) const1251ac9a064cSDimitry Andric SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1252ac9a064cSDimitry Andric                                       MachineBasicBlock::iterator InsertBefore,
1253ac9a064cSDimitry Andric                                       int OpName) const {
1254ac9a064cSDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1255ac9a064cSDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1256ac9a064cSDimitry Andric 
1257ac9a064cSDimitry Andric   auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1258ac9a064cSDimitry Andric 
1259ac9a064cSDimitry Andric   // Copy to the new source register.
1260ac9a064cSDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1261ac9a064cSDimitry Andric   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1262ac9a064cSDimitry Andric 
1263ac9a064cSDimitry Andric   const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264ac9a064cSDimitry Andric   const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1265ac9a064cSDimitry Andric 
1266ac9a064cSDimitry Andric   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1267ac9a064cSDimitry Andric       .add(*Src0)
1268ac9a064cSDimitry Andric       .addImm(SubRegIdx0)
1269ac9a064cSDimitry Andric       .add(*Src1)
1270ac9a064cSDimitry Andric       .addImm(SubRegIdx1);
1271ac9a064cSDimitry Andric 
1272ac9a064cSDimitry Andric   return SrcReg;
1273ac9a064cSDimitry Andric }
1274ac9a064cSDimitry Andric 
read2Opcode(unsigned EltSize) const1275044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1276044eb2f6SDimitry Andric   if (STM->ldsRequiresM0Init())
1277044eb2f6SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1278044eb2f6SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1279044eb2f6SDimitry Andric }
1280044eb2f6SDimitry Andric 
read2ST64Opcode(unsigned EltSize) const1281044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1282044eb2f6SDimitry Andric   if (STM->ldsRequiresM0Init())
1283044eb2f6SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1284044eb2f6SDimitry Andric 
1285d8e91e46SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1286d8e91e46SDimitry Andric                         : AMDGPU::DS_READ2ST64_B64_gfx9;
1287044eb2f6SDimitry Andric }
1288044eb2f6SDimitry Andric 
// Merge the two DS reads in CI and Paired into a single ds_read2 (or
// ds_read2st64) at InsertBefore, then copy the halves of the merged result
// back to the original destination registers and erase the originals.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  // Offsets were already rewritten by offsetsCanBeCombined(Modify=true);
  // the merged instruction encodes the smaller one first.
  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // A base offset was factored out of the immediates; materialize it and
    // add it to the address so the reduced offsets remain correct.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
134767c32a98SDimitry Andric 
write2Opcode(unsigned EltSize) const1348044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1349044eb2f6SDimitry Andric   if (STM->ldsRequiresM0Init())
1350044eb2f6SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1351d8e91e46SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1352d8e91e46SDimitry Andric                         : AMDGPU::DS_WRITE2_B64_gfx9;
1353044eb2f6SDimitry Andric }
1354044eb2f6SDimitry Andric 
write2ST64Opcode(unsigned EltSize) const1355044eb2f6SDimitry Andric unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1356044eb2f6SDimitry Andric   if (STM->ldsRequiresM0Init())
1357d8e91e46SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1358d8e91e46SDimitry Andric                           : AMDGPU::DS_WRITE2ST64_B64;
1359044eb2f6SDimitry Andric 
1360d8e91e46SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1361d8e91e46SDimitry Andric                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1362044eb2f6SDimitry Andric }
1363044eb2f6SDimitry Andric 
// Merge the two DS writes in CI and Paired into a single ds_write2 (or
// ds_write2st64) at InsertBefore and erase the originals.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  // Offsets were already rewritten by offsetsCanBeCombined(Modify=true).
  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // A base offset was factored out of the immediates; materialize it and
    // add it to the address so the reduced offsets remain correct.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
142967c32a98SDimitry Andric 
// Merge the two MIMG instructions in CI and Paired into one instruction with
// the union of their dmasks, copy the result halves back to the original
// destination registers, and erase the originals.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Clone CI's operands (skipping operand 0, the old dest), substituting the
  // merged dmask in place of CI's dmask operand.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14651d5ae102SDimitry Andric 
// Merge the two scalar memory loads in CI and Paired into one wider load
// using the smaller of the two offsets, copy the result halves back to the
// original destination registers, and erase the originals.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  // Only the SGPR+IMM form carries an soffset operand.
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14971d5ae102SDimitry Andric 
mergeBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1498cfca06d7SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1499cfca06d7SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
1500145449b1SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
15011d5ae102SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
15021d5ae102SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
15031d5ae102SDimitry Andric 
1504706b4fc4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
15051d5ae102SDimitry Andric 
1506706b4fc4SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
15071d5ae102SDimitry Andric 
15081d5ae102SDimitry Andric   // Copy to the new source register.
15091d5ae102SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
1510706b4fc4SDimitry Andric   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
15111d5ae102SDimitry Andric 
1512145449b1SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
15131d5ae102SDimitry Andric 
1514cfca06d7SDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
15151d5ae102SDimitry Andric 
1516cfca06d7SDimitry Andric   if (Regs.VAddr)
15171d5ae102SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
15181d5ae102SDimitry Andric 
15191d5ae102SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
15201d5ae102SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
15211d5ae102SDimitry Andric   // will return true if this is the case.
1522706b4fc4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
15231d5ae102SDimitry Andric 
15241d5ae102SDimitry Andric   MachineInstr *New =
15251d5ae102SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
15261d5ae102SDimitry Andric         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
15271d5ae102SDimitry Andric         .addImm(MergedOffset) // offset
1528344a3780SDimitry Andric         .addImm(CI.CPol)      // cpol
15291d5ae102SDimitry Andric         .addImm(0)            // swz
1530145449b1SDimitry Andric         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
15311d5ae102SDimitry Andric 
1532ac9a064cSDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
15331d5ae102SDimitry Andric 
15341d5ae102SDimitry Andric   CI.I->eraseFromParent();
1535706b4fc4SDimitry Andric   Paired.I->eraseFromParent();
15361d5ae102SDimitry Andric   return New;
1537044eb2f6SDimitry Andric }
1538044eb2f6SDimitry Andric 
mergeTBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1539cfca06d7SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1540cfca06d7SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
1541145449b1SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1542706b4fc4SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1543706b4fc4SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1544706b4fc4SDimitry Andric 
1545706b4fc4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1546706b4fc4SDimitry Andric 
1547706b4fc4SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1548706b4fc4SDimitry Andric 
1549706b4fc4SDimitry Andric   // Copy to the new source register.
1550706b4fc4SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
1551706b4fc4SDimitry Andric   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1552706b4fc4SDimitry Andric 
1553145449b1SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1554706b4fc4SDimitry Andric 
1555cfca06d7SDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1556706b4fc4SDimitry Andric 
1557cfca06d7SDimitry Andric   if (Regs.VAddr)
1558706b4fc4SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1559706b4fc4SDimitry Andric 
1560706b4fc4SDimitry Andric   unsigned JoinedFormat =
1561cfca06d7SDimitry Andric       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1562706b4fc4SDimitry Andric 
1563706b4fc4SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
1564706b4fc4SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
1565706b4fc4SDimitry Andric   // will return true if this is the case.
1566706b4fc4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1567706b4fc4SDimitry Andric 
1568706b4fc4SDimitry Andric   MachineInstr *New =
1569706b4fc4SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570706b4fc4SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571706b4fc4SDimitry Andric           .addImm(MergedOffset) // offset
1572706b4fc4SDimitry Andric           .addImm(JoinedFormat) // format
1573344a3780SDimitry Andric           .addImm(CI.CPol)      // cpol
1574706b4fc4SDimitry Andric           .addImm(0)            // swz
1575145449b1SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1576706b4fc4SDimitry Andric 
1577ac9a064cSDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1578706b4fc4SDimitry Andric 
1579706b4fc4SDimitry Andric   CI.I->eraseFromParent();
1580706b4fc4SDimitry Andric   Paired.I->eraseFromParent();
1581706b4fc4SDimitry Andric   return New;
1582706b4fc4SDimitry Andric }
1583706b4fc4SDimitry Andric 
mergeTBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1584cfca06d7SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1585cfca06d7SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
1586145449b1SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1587706b4fc4SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1588706b4fc4SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1589706b4fc4SDimitry Andric 
1590706b4fc4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1591706b4fc4SDimitry Andric 
1592ac9a064cSDimitry Andric   Register SrcReg =
1593ac9a064cSDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1594706b4fc4SDimitry Andric 
1595145449b1SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1596706b4fc4SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
1597706b4fc4SDimitry Andric 
1598cfca06d7SDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1599706b4fc4SDimitry Andric 
1600cfca06d7SDimitry Andric   if (Regs.VAddr)
1601706b4fc4SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1602706b4fc4SDimitry Andric 
1603706b4fc4SDimitry Andric   unsigned JoinedFormat =
1604cfca06d7SDimitry Andric       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1605706b4fc4SDimitry Andric 
1606706b4fc4SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
1607706b4fc4SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
1608706b4fc4SDimitry Andric   // will return true if this is the case.
1609706b4fc4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1610706b4fc4SDimitry Andric 
1611706b4fc4SDimitry Andric   MachineInstr *New =
1612706b4fc4SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613706b4fc4SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1614706b4fc4SDimitry Andric           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1615706b4fc4SDimitry Andric           .addImm(JoinedFormat)                     // format
1616344a3780SDimitry Andric           .addImm(CI.CPol)                          // cpol
1617706b4fc4SDimitry Andric           .addImm(0)                                // swz
1618145449b1SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1619706b4fc4SDimitry Andric 
1620145449b1SDimitry Andric   CI.I->eraseFromParent();
1621145449b1SDimitry Andric   Paired.I->eraseFromParent();
1622145449b1SDimitry Andric   return New;
1623145449b1SDimitry Andric }
1624145449b1SDimitry Andric 
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1625145449b1SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1626145449b1SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
1627145449b1SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1628145449b1SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1629145449b1SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1630145449b1SDimitry Andric 
1631145449b1SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1632145449b1SDimitry Andric 
1633145449b1SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1634145449b1SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
1635145449b1SDimitry Andric 
1636145449b1SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1637145449b1SDimitry Andric 
1638145449b1SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1639145449b1SDimitry Andric     MIB.add(*SAddr);
1640145449b1SDimitry Andric 
1641145449b1SDimitry Andric   MachineInstr *New =
1642145449b1SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1643145449b1SDimitry Andric        .addImm(std::min(CI.Offset, Paired.Offset))
1644145449b1SDimitry Andric        .addImm(CI.CPol)
1645145449b1SDimitry Andric        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1646145449b1SDimitry Andric 
1647ac9a064cSDimitry Andric   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1648145449b1SDimitry Andric 
1649145449b1SDimitry Andric   CI.I->eraseFromParent();
1650145449b1SDimitry Andric   Paired.I->eraseFromParent();
1651145449b1SDimitry Andric   return New;
1652145449b1SDimitry Andric }
1653145449b1SDimitry Andric 
mergeFlatStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1654145449b1SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1655145449b1SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
1656145449b1SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1657145449b1SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1658145449b1SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1659145449b1SDimitry Andric 
1660145449b1SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1661145449b1SDimitry Andric 
1662ac9a064cSDimitry Andric   Register SrcReg =
1663ac9a064cSDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1664145449b1SDimitry Andric 
1665145449b1SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1666145449b1SDimitry Andric                  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1667145449b1SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
1668145449b1SDimitry Andric 
1669145449b1SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1670145449b1SDimitry Andric     MIB.add(*SAddr);
1671145449b1SDimitry Andric 
1672145449b1SDimitry Andric   MachineInstr *New =
1673145449b1SDimitry Andric     MIB.addImm(std::min(CI.Offset, Paired.Offset))
1674145449b1SDimitry Andric        .addImm(CI.CPol)
1675145449b1SDimitry Andric        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1676706b4fc4SDimitry Andric 
1677706b4fc4SDimitry Andric   CI.I->eraseFromParent();
1678706b4fc4SDimitry Andric   Paired.I->eraseFromParent();
1679706b4fc4SDimitry Andric   return New;
1680706b4fc4SDimitry Andric }
1681706b4fc4SDimitry Andric 
// Map the pair (CI, Paired) to the opcode of the single wider instruction
// that will replace both. A return value of 0 means the combined width is
// not supported for this instruction class and the pair cannot be merged.
getNewOpcode(const CombineInfo & CI,const CombineInfo & Paired)1682706b4fc4SDimitry Andric unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1683706b4fc4SDimitry Andric                                             const CombineInfo &Paired) {
1684706b4fc4SDimitry Andric   const unsigned Width = CI.Width + Paired.Width;
1685044eb2f6SDimitry Andric 
1686145449b1SDimitry Andric   switch (getCommonInstClass(CI, Paired)) {
  // BUFFER_LOAD/BUFFER_STORE fall into the default case and use the
  // generated MUBUF opcode tables rather than an explicit switch.
1687d8e91e46SDimitry Andric   default:
16881d5ae102SDimitry Andric     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
16891d5ae102SDimitry Andric     // FIXME: Handle d16 correctly
16901d5ae102SDimitry Andric     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
16911d5ae102SDimitry Andric                                   Width);
1692706b4fc4SDimitry Andric   case TBUFFER_LOAD:
1693706b4fc4SDimitry Andric   case TBUFFER_STORE:
1694706b4fc4SDimitry Andric     return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1695706b4fc4SDimitry Andric                                   Width);
1696706b4fc4SDimitry Andric 
1697d8e91e46SDimitry Andric   case UNKNOWN:
1698d8e91e46SDimitry Andric     llvm_unreachable("Unknown instruction class");
1699d8e91e46SDimitry Andric   case S_BUFFER_LOAD_IMM:
1700d8e91e46SDimitry Andric     switch (Width) {
1701d8e91e46SDimitry Andric     default:
1702044eb2f6SDimitry Andric       return 0;
1703d8e91e46SDimitry Andric     case 2:
1704d8e91e46SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1705312c0ed1SDimitry Andric     case 3:
1706312c0ed1SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1707d8e91e46SDimitry Andric     case 4:
1708d8e91e46SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1709c0981da4SDimitry Andric     case 8:
1710c0981da4SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1711d8e91e46SDimitry Andric     }
1712e3b55780SDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
1713e3b55780SDimitry Andric     switch (Width) {
1714e3b55780SDimitry Andric     default:
1715e3b55780SDimitry Andric       return 0;
1716e3b55780SDimitry Andric     case 2:
17177fa27ce4SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1718312c0ed1SDimitry Andric     case 3:
1719312c0ed1SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1720e3b55780SDimitry Andric     case 4:
17217fa27ce4SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1722e3b55780SDimitry Andric     case 8:
17237fa27ce4SDimitry Andric       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1724e3b55780SDimitry Andric     }
1725ac9a064cSDimitry Andric   case S_LOAD_IMM: {
1726ac9a064cSDimitry Andric     // If XNACK is enabled, use the constrained opcodes when the first load is
1727ac9a064cSDimitry Andric     // under-aligned.
1728ac9a064cSDimitry Andric     const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729ac9a064cSDimitry Andric     bool NeedsConstrainedOpc =
1730ac9a064cSDimitry Andric         STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
1731e3b55780SDimitry Andric     switch (Width) {
1732e3b55780SDimitry Andric     default:
1733e3b55780SDimitry Andric       return 0;
1734e3b55780SDimitry Andric     case 2:
1735ac9a064cSDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736ac9a064cSDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX2_IMM;
1737312c0ed1SDimitry Andric     case 3:
1738ac9a064cSDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739ac9a064cSDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX3_IMM;
1740e3b55780SDimitry Andric     case 4:
1741ac9a064cSDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742ac9a064cSDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX4_IMM;
1743e3b55780SDimitry Andric     case 8:
1744ac9a064cSDimitry Andric       return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745ac9a064cSDimitry Andric                                  : AMDGPU::S_LOAD_DWORDX8_IMM;
1746ac9a064cSDimitry Andric     }
1747e3b55780SDimitry Andric   }
  // FLAT/GLOBAL classes merge to at most 4 dwords (no DWORDX8 forms).
1748145449b1SDimitry Andric   case GLOBAL_LOAD:
1749145449b1SDimitry Andric     switch (Width) {
1750145449b1SDimitry Andric     default:
1751145449b1SDimitry Andric       return 0;
1752145449b1SDimitry Andric     case 2:
1753145449b1SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX2;
1754145449b1SDimitry Andric     case 3:
1755145449b1SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX3;
1756145449b1SDimitry Andric     case 4:
1757145449b1SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX4;
1758145449b1SDimitry Andric     }
1759145449b1SDimitry Andric   case GLOBAL_LOAD_SADDR:
1760145449b1SDimitry Andric     switch (Width) {
1761145449b1SDimitry Andric     default:
1762145449b1SDimitry Andric       return 0;
1763145449b1SDimitry Andric     case 2:
1764145449b1SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1765145449b1SDimitry Andric     case 3:
1766145449b1SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1767145449b1SDimitry Andric     case 4:
1768145449b1SDimitry Andric       return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1769145449b1SDimitry Andric     }
1770145449b1SDimitry Andric   case GLOBAL_STORE:
1771145449b1SDimitry Andric     switch (Width) {
1772145449b1SDimitry Andric     default:
1773145449b1SDimitry Andric       return 0;
1774145449b1SDimitry Andric     case 2:
1775145449b1SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX2;
1776145449b1SDimitry Andric     case 3:
1777145449b1SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX3;
1778145449b1SDimitry Andric     case 4:
1779145449b1SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX4;
1780145449b1SDimitry Andric     }
1781145449b1SDimitry Andric   case GLOBAL_STORE_SADDR:
1782145449b1SDimitry Andric     switch (Width) {
1783145449b1SDimitry Andric     default:
1784145449b1SDimitry Andric       return 0;
1785145449b1SDimitry Andric     case 2:
1786145449b1SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1787145449b1SDimitry Andric     case 3:
1788145449b1SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1789145449b1SDimitry Andric     case 4:
1790145449b1SDimitry Andric       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1791145449b1SDimitry Andric     }
1792145449b1SDimitry Andric   case FLAT_LOAD:
1793145449b1SDimitry Andric     switch (Width) {
1794145449b1SDimitry Andric     default:
1795145449b1SDimitry Andric       return 0;
1796145449b1SDimitry Andric     case 2:
1797145449b1SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX2;
1798145449b1SDimitry Andric     case 3:
1799145449b1SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX3;
1800145449b1SDimitry Andric     case 4:
1801145449b1SDimitry Andric       return AMDGPU::FLAT_LOAD_DWORDX4;
1802145449b1SDimitry Andric     }
1803145449b1SDimitry Andric   case FLAT_STORE:
1804145449b1SDimitry Andric     switch (Width) {
1805145449b1SDimitry Andric     default:
1806145449b1SDimitry Andric       return 0;
1807145449b1SDimitry Andric     case 2:
1808145449b1SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX2;
1809145449b1SDimitry Andric     case 3:
1810145449b1SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX3;
1811145449b1SDimitry Andric     case 4:
1812145449b1SDimitry Andric       return AMDGPU::FLAT_STORE_DWORDX4;
1813145449b1SDimitry Andric     }
  // For image ops the merged width must equal the combined dmask popcount,
  // i.e. the two dmasks must not overlap.
18141d5ae102SDimitry Andric   case MIMG:
1815e3b55780SDimitry Andric     assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1816c0981da4SDimitry Andric            "No overlaps");
18171d5ae102SDimitry Andric     return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1818d8e91e46SDimitry Andric   }
1819044eb2f6SDimitry Andric }
1820044eb2f6SDimitry Andric 
1821d8e91e46SDimitry Andric std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1822c0981da4SDimitry Andric SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823c0981da4SDimitry Andric                                     const CombineInfo &Paired) {
1824e3b55780SDimitry Andric   assert((CI.InstClass != MIMG ||
1825e3b55780SDimitry Andric           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1826145449b1SDimitry Andric            CI.Width + Paired.Width)) &&
18271d5ae102SDimitry Andric          "No overlaps");
18281d5ae102SDimitry Andric 
1829c0981da4SDimitry Andric   unsigned Idx0;
1830c0981da4SDimitry Andric   unsigned Idx1;
1831c0981da4SDimitry Andric 
18326f8fc217SDimitry Andric   static const unsigned Idxs[5][4] = {
18331d5ae102SDimitry Andric       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
18346f8fc217SDimitry Andric       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
18356f8fc217SDimitry Andric       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
18366f8fc217SDimitry Andric       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
18376f8fc217SDimitry Andric       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
18381d5ae102SDimitry Andric   };
18391d5ae102SDimitry Andric 
18406f8fc217SDimitry Andric   assert(CI.Width >= 1 && CI.Width <= 4);
18416f8fc217SDimitry Andric   assert(Paired.Width >= 1 && Paired.Width <= 4);
18421d5ae102SDimitry Andric 
1843145449b1SDimitry Andric   if (Paired < CI) {
1844706b4fc4SDimitry Andric     Idx1 = Idxs[0][Paired.Width - 1];
1845706b4fc4SDimitry Andric     Idx0 = Idxs[Paired.Width][CI.Width - 1];
1846d8e91e46SDimitry Andric   } else {
1847706b4fc4SDimitry Andric     Idx0 = Idxs[0][CI.Width - 1];
1848706b4fc4SDimitry Andric     Idx1 = Idxs[CI.Width][Paired.Width - 1];
1849d8e91e46SDimitry Andric   }
18501d5ae102SDimitry Andric 
1851ac9a064cSDimitry Andric   return {Idx0, Idx1};
1852d8e91e46SDimitry Andric }
1853d8e91e46SDimitry Andric 
1854d8e91e46SDimitry Andric const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired) const1855706b4fc4SDimitry Andric SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856ac9a064cSDimitry Andric                                              const CombineInfo &Paired) const {
1857e3b55780SDimitry Andric   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858e3b55780SDimitry Andric       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859706b4fc4SDimitry Andric     switch (CI.Width + Paired.Width) {
1860d8e91e46SDimitry Andric     default:
1861d8e91e46SDimitry Andric       return nullptr;
1862d8e91e46SDimitry Andric     case 2:
1863d8e91e46SDimitry Andric       return &AMDGPU::SReg_64_XEXECRegClass;
1864312c0ed1SDimitry Andric     case 3:
1865312c0ed1SDimitry Andric       return &AMDGPU::SGPR_96RegClass;
1866d8e91e46SDimitry Andric     case 4:
18671d5ae102SDimitry Andric       return &AMDGPU::SGPR_128RegClass;
1868d8e91e46SDimitry Andric     case 8:
1869cfca06d7SDimitry Andric       return &AMDGPU::SGPR_256RegClass;
1870d8e91e46SDimitry Andric     case 16:
1871cfca06d7SDimitry Andric       return &AMDGPU::SGPR_512RegClass;
1872d8e91e46SDimitry Andric     }
1873d8e91e46SDimitry Andric   }
1874344a3780SDimitry Andric 
1875344a3780SDimitry Andric   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1876f65dcba8SDimitry Andric   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1877344a3780SDimitry Andric              ? TRI->getAGPRClassForBitWidth(BitWidth)
1878344a3780SDimitry Andric              : TRI->getVGPRClassForBitWidth(BitWidth);
1879d8e91e46SDimitry Andric }
1880d8e91e46SDimitry Andric 
mergeBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)1881cfca06d7SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1882cfca06d7SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
1883145449b1SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1884044eb2f6SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1885044eb2f6SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1886044eb2f6SDimitry Andric 
1887706b4fc4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1888044eb2f6SDimitry Andric 
1889ac9a064cSDimitry Andric   Register SrcReg =
1890ac9a064cSDimitry Andric       copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1891044eb2f6SDimitry Andric 
1892145449b1SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1893044eb2f6SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
1894044eb2f6SDimitry Andric 
1895cfca06d7SDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1896d8e91e46SDimitry Andric 
1897cfca06d7SDimitry Andric   if (Regs.VAddr)
1898044eb2f6SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1899044eb2f6SDimitry Andric 
19001d5ae102SDimitry Andric 
19011d5ae102SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
19021d5ae102SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
19031d5ae102SDimitry Andric   // will return true if this is the case.
1904706b4fc4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
19051d5ae102SDimitry Andric 
19061d5ae102SDimitry Andric   MachineInstr *New =
1907044eb2f6SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1908044eb2f6SDimitry Andric         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1909706b4fc4SDimitry Andric         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1910344a3780SDimitry Andric         .addImm(CI.CPol)      // cpol
19111d5ae102SDimitry Andric         .addImm(0)            // swz
1912145449b1SDimitry Andric         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1913044eb2f6SDimitry Andric 
1914044eb2f6SDimitry Andric   CI.I->eraseFromParent();
1915706b4fc4SDimitry Andric   Paired.I->eraseFromParent();
19161d5ae102SDimitry Andric   return New;
1917044eb2f6SDimitry Andric }
1918044eb2f6SDimitry Andric 
1919d8e91e46SDimitry Andric MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const19201d5ae102SDimitry Andric SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1921d8e91e46SDimitry Andric   APInt V(32, Val, true);
1922d8e91e46SDimitry Andric   if (TII->isInlineConstant(V))
1923d8e91e46SDimitry Andric     return MachineOperand::CreateImm(Val);
1924d8e91e46SDimitry Andric 
19251d5ae102SDimitry Andric   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1926d8e91e46SDimitry Andric   MachineInstr *Mov =
1927d8e91e46SDimitry Andric   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1928d8e91e46SDimitry Andric           TII->get(AMDGPU::S_MOV_B32), Reg)
1929d8e91e46SDimitry Andric     .addImm(Val);
1930d8e91e46SDimitry Andric   (void)Mov;
1931d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1932d8e91e46SDimitry Andric   return MachineOperand::CreateReg(Reg, false);
1933d8e91e46SDimitry Andric }
1934d8e91e46SDimitry Andric 
1935d8e91e46SDimitry Andric // Compute base address using Addr and return the final register.
// Emits a 64-bit add (lo/hi halves plus carry) of Addr.Offset onto the
// Addr.Base register pair, inserted before MI, and returns the 64-bit VGPR
// pair holding the re-based address.
computeBase(MachineInstr & MI,const MemAddress & Addr) const1936cfca06d7SDimitry Andric Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
19371d5ae102SDimitry Andric                                            const MemAddress &Addr) const {
1938d8e91e46SDimitry Andric   MachineBasicBlock *MBB = MI.getParent();
1939d8e91e46SDimitry Andric   MachineBasicBlock::iterator MBBI = MI.getIterator();
1940d8e91e46SDimitry Andric   DebugLoc DL = MI.getDebugLoc();
1941d8e91e46SDimitry Andric 
1942d8e91e46SDimitry Andric   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1943d8e91e46SDimitry Andric           Addr.Base.LoSubReg) &&
1944d8e91e46SDimitry Andric          "Expected 32-bit Base-Register-Low!!");
1945d8e91e46SDimitry Andric 
1946d8e91e46SDimitry Andric   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1947d8e91e46SDimitry Andric           Addr.Base.HiSubReg) &&
1948d8e91e46SDimitry Andric          "Expected 32-bit Base-Register-Hi!!");
1949d8e91e46SDimitry Andric 
1950d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  // Each 32-bit half of the offset becomes either an inline immediate or a
  // materialized SGPR (see createRegOrImm).
1951d8e91e46SDimitry Andric   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1952d8e91e46SDimitry Andric   MachineOperand OffsetHi =
1953d8e91e46SDimitry Andric     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1954e6d15924SDimitry Andric 
1955e6d15924SDimitry Andric   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
19561d5ae102SDimitry Andric   Register CarryReg = MRI->createVirtualRegister(CarryRC);
19571d5ae102SDimitry Andric   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1958d8e91e46SDimitry Andric 
19591d5ae102SDimitry Andric   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
19601d5ae102SDimitry Andric   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  // Low half: V_ADD_CO_U32 produces the carry consumed by the high half.
1961d8e91e46SDimitry Andric   MachineInstr *LoHalf =
1962b60736ecSDimitry Andric     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1963d8e91e46SDimitry Andric       .addReg(CarryReg, RegState::Define)
1964d8e91e46SDimitry Andric       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1965e6d15924SDimitry Andric       .add(OffsetLo)
1966e6d15924SDimitry Andric       .addImm(0); // clamp bit
1967d8e91e46SDimitry Andric   (void)LoHalf;
1968d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1969d8e91e46SDimitry Andric 
  // High half: V_ADDC_U32 adds the carry; its own carry-out is dead.
1970d8e91e46SDimitry Andric   MachineInstr *HiHalf =
1971d8e91e46SDimitry Andric   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1972d8e91e46SDimitry Andric     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1973d8e91e46SDimitry Andric     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1974d8e91e46SDimitry Andric     .add(OffsetHi)
1975e6d15924SDimitry Andric     .addReg(CarryReg, RegState::Kill)
1976e6d15924SDimitry Andric     .addImm(0); // clamp bit
1977d8e91e46SDimitry Andric   (void)HiHalf;
1978d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1979d8e91e46SDimitry Andric 
  // Re-pack the two 32-bit halves into a 64-bit VGPR pair.
1980344a3780SDimitry Andric   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1981d8e91e46SDimitry Andric   MachineInstr *FullBase =
1982d8e91e46SDimitry Andric     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1983d8e91e46SDimitry Andric       .addReg(DestSub0)
1984d8e91e46SDimitry Andric       .addImm(AMDGPU::sub0)
1985d8e91e46SDimitry Andric       .addReg(DestSub1)
1986d8e91e46SDimitry Andric       .addImm(AMDGPU::sub1);
1987d8e91e46SDimitry Andric   (void)FullBase;
1988d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1989d8e91e46SDimitry Andric 
1990d8e91e46SDimitry Andric   return FullDestReg;
1991d8e91e46SDimitry Andric }
1992d8e91e46SDimitry Andric 
1993d8e91e46SDimitry Andric // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const1994d8e91e46SDimitry Andric void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1995cfca06d7SDimitry Andric                                                Register NewBase,
19961d5ae102SDimitry Andric                                                int32_t NewOffset) const {
1997706b4fc4SDimitry Andric   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998706b4fc4SDimitry Andric   Base->setReg(NewBase);
1999706b4fc4SDimitry Andric   Base->setIsKill(false);
2000d8e91e46SDimitry Andric   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2001d8e91e46SDimitry Andric }
2002d8e91e46SDimitry Andric 
2003e3b55780SDimitry Andric std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const20041d5ae102SDimitry Andric SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2005d8e91e46SDimitry Andric   if (Op.isImm())
2006d8e91e46SDimitry Andric     return Op.getImm();
2007d8e91e46SDimitry Andric 
2008d8e91e46SDimitry Andric   if (!Op.isReg())
2009e3b55780SDimitry Andric     return std::nullopt;
2010d8e91e46SDimitry Andric 
2011d8e91e46SDimitry Andric   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2012d8e91e46SDimitry Andric   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2013d8e91e46SDimitry Andric       !Def->getOperand(1).isImm())
2014e3b55780SDimitry Andric     return std::nullopt;
2015d8e91e46SDimitry Andric 
2016d8e91e46SDimitry Andric   return Def->getOperand(1).getImm();
2017d8e91e46SDimitry Andric }
2018d8e91e46SDimitry Andric 
// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
// On any mismatch, Addr is left untouched (caller detects failure via
// Addr.Offset == 0).
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  // The base must be produced by a REG_SEQUENCE of exactly two halves:
  // dest + 2 x (reg, subreg-index) = 5 operands.
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  // Expect a 64-bit add expanded into a carry-out low add and a carry-in
  // high add.
  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  // Exactly one side of the low add must be a constant (immediate or
  // S_MOV_B32); the other side is the low half of the base register.
  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  // Canonicalize so that Src1 is the immediate operand, if any.
  if (Src0->isImm())
    std::swap(Src0, Src1);

  // The high add must have exactly one immediate operand.
  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  // Recombine the two 32-bit halves into the 64-bit constant offset.
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
2081d8e91e46SDimitry Andric 
promoteConstantOffsetToImm(MachineInstr & MI,MemInfoMap & Visited,SmallPtrSet<MachineInstr *,4> & AnchorList) const2082d8e91e46SDimitry Andric bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2083d8e91e46SDimitry Andric     MachineInstr &MI,
2084d8e91e46SDimitry Andric     MemInfoMap &Visited,
20851d5ae102SDimitry Andric     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2086d8e91e46SDimitry Andric 
2087ac9a064cSDimitry Andric   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2088d8e91e46SDimitry Andric     return false;
2089d8e91e46SDimitry Andric 
2090ac9a064cSDimitry Andric   // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2091ac9a064cSDimitry Andric   if (SIInstrInfo::isFLATScratch(MI))
20921d5ae102SDimitry Andric     return false;
20931d5ae102SDimitry Andric 
2094ac9a064cSDimitry Andric   unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2095ac9a064cSDimitry Andric                                               : AMDGPUAS::FLAT_ADDRESS;
2096d8e91e46SDimitry Andric 
2097d8e91e46SDimitry Andric   if (AnchorList.count(&MI))
2098d8e91e46SDimitry Andric     return false;
2099d8e91e46SDimitry Andric 
2100d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2101d8e91e46SDimitry Andric 
2102d8e91e46SDimitry Andric   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2103d8e91e46SDimitry Andric     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
2104d8e91e46SDimitry Andric     return false;
2105d8e91e46SDimitry Andric   }
2106d8e91e46SDimitry Andric 
2107d8e91e46SDimitry Andric   // Step1: Find the base-registers and a 64bit constant offset.
2108d8e91e46SDimitry Andric   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2109d8e91e46SDimitry Andric   MemAddress MAddr;
21107fa27ce4SDimitry Andric   if (!Visited.contains(&MI)) {
2111d8e91e46SDimitry Andric     processBaseWithConstOffset(Base, MAddr);
2112d8e91e46SDimitry Andric     Visited[&MI] = MAddr;
2113d8e91e46SDimitry Andric   } else
2114d8e91e46SDimitry Andric     MAddr = Visited[&MI];
2115d8e91e46SDimitry Andric 
2116d8e91e46SDimitry Andric   if (MAddr.Offset == 0) {
2117d8e91e46SDimitry Andric     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
2118d8e91e46SDimitry Andric                          " constant offsets that can be promoted.\n";);
2119d8e91e46SDimitry Andric     return false;
2120d8e91e46SDimitry Andric   }
2121d8e91e46SDimitry Andric 
2122d8e91e46SDimitry Andric   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
2123d8e91e46SDimitry Andric              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2124d8e91e46SDimitry Andric 
2125d8e91e46SDimitry Andric   // Step2: Traverse through MI's basic block and find an anchor(that has the
2126d8e91e46SDimitry Andric   // same base-registers) with the highest 13bit distance from MI's offset.
2127d8e91e46SDimitry Andric   // E.g. (64bit loads)
2128d8e91e46SDimitry Andric   // bb:
2129d8e91e46SDimitry Andric   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
2130d8e91e46SDimitry Andric   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
2131d8e91e46SDimitry Andric   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
2132d8e91e46SDimitry Andric   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
2133d8e91e46SDimitry Andric   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2134d8e91e46SDimitry Andric   //
2135d8e91e46SDimitry Andric   // Starting from the first load, the optimization will try to find a new base
2136d8e91e46SDimitry Andric   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
2137d8e91e46SDimitry Andric   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
2138d8e91e46SDimitry Andric   // as the new-base(anchor) because of the maximum distance which can
2139145449b1SDimitry Andric   // accommodate more intermediate bases presumably.
2140d8e91e46SDimitry Andric   //
2141d8e91e46SDimitry Andric   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2142d8e91e46SDimitry Andric   // (&a + 8192) for load1, load2, load4.
2143d8e91e46SDimitry Andric   //   addr = &a + 8192
2144d8e91e46SDimitry Andric   //   load1 = load(addr,       -4096)
2145d8e91e46SDimitry Andric   //   load2 = load(addr,       -2048)
2146d8e91e46SDimitry Andric   //   load3 = load(addr,       0)
2147d8e91e46SDimitry Andric   //   load4 = load(addr,       2048)
2148d8e91e46SDimitry Andric   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
2149d8e91e46SDimitry Andric   //
2150d8e91e46SDimitry Andric   MachineInstr *AnchorInst = nullptr;
2151d8e91e46SDimitry Andric   MemAddress AnchorAddr;
2152d8e91e46SDimitry Andric   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2153d8e91e46SDimitry Andric   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2154d8e91e46SDimitry Andric 
2155d8e91e46SDimitry Andric   MachineBasicBlock *MBB = MI.getParent();
2156d8e91e46SDimitry Andric   MachineBasicBlock::iterator E = MBB->end();
2157d8e91e46SDimitry Andric   MachineBasicBlock::iterator MBBI = MI.getIterator();
2158d8e91e46SDimitry Andric   ++MBBI;
2159d8e91e46SDimitry Andric   const SITargetLowering *TLI =
2160d8e91e46SDimitry Andric     static_cast<const SITargetLowering *>(STM->getTargetLowering());
2161d8e91e46SDimitry Andric 
2162d8e91e46SDimitry Andric   for ( ; MBBI != E; ++MBBI) {
2163d8e91e46SDimitry Andric     MachineInstr &MINext = *MBBI;
2164d8e91e46SDimitry Andric     // TODO: Support finding an anchor(with same base) from store addresses or
2165d8e91e46SDimitry Andric     // any other load addresses where the opcodes are different.
2166d8e91e46SDimitry Andric     if (MINext.getOpcode() != MI.getOpcode() ||
2167d8e91e46SDimitry Andric         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2168d8e91e46SDimitry Andric       continue;
2169d8e91e46SDimitry Andric 
2170d8e91e46SDimitry Andric     const MachineOperand &BaseNext =
2171d8e91e46SDimitry Andric       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2172d8e91e46SDimitry Andric     MemAddress MAddrNext;
21737fa27ce4SDimitry Andric     if (!Visited.contains(&MINext)) {
2174d8e91e46SDimitry Andric       processBaseWithConstOffset(BaseNext, MAddrNext);
2175d8e91e46SDimitry Andric       Visited[&MINext] = MAddrNext;
2176d8e91e46SDimitry Andric     } else
2177d8e91e46SDimitry Andric       MAddrNext = Visited[&MINext];
2178d8e91e46SDimitry Andric 
2179d8e91e46SDimitry Andric     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2180d8e91e46SDimitry Andric         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2181d8e91e46SDimitry Andric         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2182d8e91e46SDimitry Andric         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2183d8e91e46SDimitry Andric       continue;
2184d8e91e46SDimitry Andric 
2185ac9a064cSDimitry Andric     InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2186d8e91e46SDimitry Andric 
2187d8e91e46SDimitry Andric     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2188d8e91e46SDimitry Andric     TargetLoweringBase::AddrMode AM;
2189d8e91e46SDimitry Andric     AM.HasBaseReg = true;
2190d8e91e46SDimitry Andric     AM.BaseOffs = Dist;
2191ac9a064cSDimitry Andric     if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2192d8e91e46SDimitry Andric         (uint32_t)std::abs(Dist) > MaxDist) {
2193d8e91e46SDimitry Andric       MaxDist = std::abs(Dist);
2194d8e91e46SDimitry Andric 
2195d8e91e46SDimitry Andric       AnchorAddr = MAddrNext;
2196d8e91e46SDimitry Andric       AnchorInst = &MINext;
2197d8e91e46SDimitry Andric     }
2198d8e91e46SDimitry Andric   }
2199d8e91e46SDimitry Andric 
2200d8e91e46SDimitry Andric   if (AnchorInst) {
2201d8e91e46SDimitry Andric     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
2202d8e91e46SDimitry Andric                AnchorInst->dump());
2203d8e91e46SDimitry Andric     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
2204d8e91e46SDimitry Andric                <<  AnchorAddr.Offset << "\n\n");
2205d8e91e46SDimitry Andric 
2206d8e91e46SDimitry Andric     // Instead of moving up, just re-compute anchor-instruction's base address.
2207cfca06d7SDimitry Andric     Register Base = computeBase(MI, AnchorAddr);
2208d8e91e46SDimitry Andric 
2209d8e91e46SDimitry Andric     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2210d8e91e46SDimitry Andric     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
2211d8e91e46SDimitry Andric 
2212ac9a064cSDimitry Andric     for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2213d8e91e46SDimitry Andric       TargetLoweringBase::AddrMode AM;
2214d8e91e46SDimitry Andric       AM.HasBaseReg = true;
2215ac9a064cSDimitry Andric       AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2216d8e91e46SDimitry Andric 
2217ac9a064cSDimitry Andric       if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2218ac9a064cSDimitry Andric         LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
2219ac9a064cSDimitry Andric                    OtherMI->dump());
2220ac9a064cSDimitry Andric         updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
2221ac9a064cSDimitry Andric         LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
2222d8e91e46SDimitry Andric       }
2223d8e91e46SDimitry Andric     }
2224d8e91e46SDimitry Andric     AnchorList.insert(AnchorInst);
2225d8e91e46SDimitry Andric     return true;
2226d8e91e46SDimitry Andric   }
2227d8e91e46SDimitry Andric 
2228d8e91e46SDimitry Andric   return false;
2229d8e91e46SDimitry Andric }
2230d8e91e46SDimitry Andric 
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const22311d5ae102SDimitry Andric void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
22321d5ae102SDimitry Andric                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
22331d5ae102SDimitry Andric   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2234706b4fc4SDimitry Andric     if (AddrList.front().InstClass == CI.InstClass &&
22356f8fc217SDimitry Andric         AddrList.front().IsAGPR == CI.IsAGPR &&
2236e3b55780SDimitry Andric         AddrList.front().hasSameBaseAddress(CI)) {
22371d5ae102SDimitry Andric       AddrList.emplace_back(CI);
22381d5ae102SDimitry Andric       return;
22391d5ae102SDimitry Andric     }
22401d5ae102SDimitry Andric   }
224167c32a98SDimitry Andric 
22421d5ae102SDimitry Andric   // Base address not found, so add a new list.
22431d5ae102SDimitry Andric   MergeableInsts.emplace_back(1, CI);
22441d5ae102SDimitry Andric }
22451d5ae102SDimitry Andric 
// Scan [Begin, End) collecting instructions that are candidates for merging,
// grouped into per-base-address lists. Stops early at the first memory
// barrier and returns the iterator just past it (so the caller can resume a
// separate merge section), plus whether any code was already modified by
// constant-offset promotion.
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists.  One list per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can produce
    // better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    // Order records the original program position so later merging can place
    // the combined instruction correctly.
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with a VGPR and AGPR sources.
      //        Consequenctially if we create such instruction the verifier
      //        will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of Mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I.  If an instruction
  // is found, it is stored in the Paired field.  If no instructions are found, then
  // the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return {BlockI, Modified};
}
23371d5ae102SDimitry Andric 
23381d5ae102SDimitry Andric // Scan through looking for adjacent LDS operations with constant offsets from
23391d5ae102SDimitry Andric // the same base register. We rely on the scheduler to do the hard work of
23401d5ae102SDimitry Andric // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)23411d5ae102SDimitry Andric bool SILoadStoreOptimizer::optimizeBlock(
23421d5ae102SDimitry Andric                        std::list<std::list<CombineInfo> > &MergeableInsts) {
23431d5ae102SDimitry Andric   bool Modified = false;
23441d5ae102SDimitry Andric 
2345cfca06d7SDimitry Andric   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2346cfca06d7SDimitry Andric                                                    E = MergeableInsts.end(); I != E;) {
2347cfca06d7SDimitry Andric     std::list<CombineInfo> &MergeList = *I;
23481d5ae102SDimitry Andric 
23491d5ae102SDimitry Andric     bool OptimizeListAgain = false;
23501d5ae102SDimitry Andric     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2351cfca06d7SDimitry Andric       // We weren't able to make any changes, so delete the list so we don't
23521d5ae102SDimitry Andric       // process the same instructions the next time we try to optimize this
23531d5ae102SDimitry Andric       // block.
2354cfca06d7SDimitry Andric       I = MergeableInsts.erase(I);
235567c32a98SDimitry Andric       continue;
235667c32a98SDimitry Andric     }
235767c32a98SDimitry Andric 
2358cfca06d7SDimitry Andric     Modified = true;
2359cfca06d7SDimitry Andric 
23601d5ae102SDimitry Andric     // We made changes, but also determined that there were no more optimization
23611d5ae102SDimitry Andric     // opportunities, so we don't need to reprocess the list
2362cfca06d7SDimitry Andric     if (!OptimizeListAgain) {
2363cfca06d7SDimitry Andric       I = MergeableInsts.erase(I);
2364cfca06d7SDimitry Andric       continue;
2365cfca06d7SDimitry Andric     }
2366cfca06d7SDimitry Andric     OptimizeAgain = true;
23671d5ae102SDimitry Andric   }
23681d5ae102SDimitry Andric   return Modified;
23691d5ae102SDimitry Andric }
23701d5ae102SDimitry Andric 
// Walk an offset-sorted list of instructions sharing a base address and merge
// adjacent pairs where legal, dispatching to the per-class merge routine.
// Sets OptimizeListAgain when a merged result may itself be merged further.
// Returns true if any pair was merged.
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    // Keep the pair in original program order (Order is the position recorded
    // at collection time), regardless of the offset-sorted list order.
    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    // Where identifies the insertion point for the merged instruction;
    // nullptr means this pair cannot be merged.
    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      // SMEM can widen up to dwordx8; anything narrower may merge again.
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    // CI now represents the merged instruction; Paired's entry is removed.
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    // If the pair was swapped above, I currently aliases the element about to
    // be erased; advance it first to keep the iterator valid.
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
246167c32a98SDimitry Andric 
runOnMachineFunction(MachineFunction & MF)246267c32a98SDimitry Andric bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2463044eb2f6SDimitry Andric   if (skipFunction(MF.getFunction()))
246401095a5dSDimitry Andric     return false;
246501095a5dSDimitry Andric 
2466eb11fae6SDimitry Andric   STM = &MF.getSubtarget<GCNSubtarget>();
2467044eb2f6SDimitry Andric   if (!STM->loadStoreOptEnabled())
246801095a5dSDimitry Andric     return false;
246901095a5dSDimitry Andric 
2470044eb2f6SDimitry Andric   TII = STM->getInstrInfo();
247101095a5dSDimitry Andric   TRI = &TII->getRegisterInfo();
247201095a5dSDimitry Andric 
247367c32a98SDimitry Andric   MRI = &MF.getRegInfo();
2474b915e9e0SDimitry Andric   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
247567c32a98SDimitry Andric 
2476eb11fae6SDimitry Andric   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
247767c32a98SDimitry Andric 
247867c32a98SDimitry Andric   bool Modified = false;
247967c32a98SDimitry Andric 
2480cfca06d7SDimitry Andric   // Contains the list of instructions for which constant offsets are being
2481cfca06d7SDimitry Andric   // promoted to the IMM. This is tracked for an entire block at time.
2482cfca06d7SDimitry Andric   SmallPtrSet<MachineInstr *, 4> AnchorList;
2483cfca06d7SDimitry Andric   MemInfoMap Visited;
24841d5ae102SDimitry Andric 
2485044eb2f6SDimitry Andric   for (MachineBasicBlock &MBB : MF) {
2486cfca06d7SDimitry Andric     MachineBasicBlock::iterator SectionEnd;
2487cfca06d7SDimitry Andric     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2488cfca06d7SDimitry Andric          I = SectionEnd) {
2489cfca06d7SDimitry Andric       bool CollectModified;
24901d5ae102SDimitry Andric       std::list<std::list<CombineInfo>> MergeableInsts;
2491cfca06d7SDimitry Andric 
2492cfca06d7SDimitry Andric       // First pass: Collect list of all instructions we know how to merge in a
2493cfca06d7SDimitry Andric       // subset of the block.
2494cfca06d7SDimitry Andric       std::tie(SectionEnd, CollectModified) =
2495cfca06d7SDimitry Andric           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2496cfca06d7SDimitry Andric 
2497cfca06d7SDimitry Andric       Modified |= CollectModified;
2498cfca06d7SDimitry Andric 
2499d8e91e46SDimitry Andric       do {
2500d8e91e46SDimitry Andric         OptimizeAgain = false;
25011d5ae102SDimitry Andric         Modified |= optimizeBlock(MergeableInsts);
2502d8e91e46SDimitry Andric       } while (OptimizeAgain);
2503044eb2f6SDimitry Andric     }
2504044eb2f6SDimitry Andric 
2505cfca06d7SDimitry Andric     Visited.clear();
2506cfca06d7SDimitry Andric     AnchorList.clear();
2507cfca06d7SDimitry Andric   }
2508cfca06d7SDimitry Andric 
250967c32a98SDimitry Andric   return Modified;
251067c32a98SDimitry Andric }
2511