//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
/// instructions that produce single-use VGPR values. If the value is forwarded
/// to the consumer instruction prior to VGPR writeback, the hardware can
/// then skip (kill) the VGPR write.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGenSearchableTables.inc"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
31b1c73532SDimitry Andric #include "llvm/CodeGen/MachineOperand.h" 32b1c73532SDimitry Andric #include "llvm/CodeGen/Register.h" 33b1c73532SDimitry Andric #include "llvm/IR/DebugLoc.h" 34b1c73532SDimitry Andric #include "llvm/MC/MCRegister.h" 35ac9a064cSDimitry Andric #include "llvm/MC/MCRegisterInfo.h" 36b1c73532SDimitry Andric #include "llvm/Pass.h" 37ac9a064cSDimitry Andric #include <array> 38b1c73532SDimitry Andric 39b1c73532SDimitry Andric using namespace llvm; 40b1c73532SDimitry Andric 41b1c73532SDimitry Andric #define DEBUG_TYPE "amdgpu-insert-single-use-vdst" 42b1c73532SDimitry Andric 43b1c73532SDimitry Andric namespace { 44b1c73532SDimitry Andric class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { 45b1c73532SDimitry Andric private: 46b1c73532SDimitry Andric const SIInstrInfo *SII; 47ac9a064cSDimitry Andric class SingleUseInstruction { 48ac9a064cSDimitry Andric private: 49ac9a064cSDimitry Andric static const unsigned MaxSkipRange = 0b111; 50ac9a064cSDimitry Andric static const unsigned MaxNumberOfSkipRegions = 2; 51ac9a064cSDimitry Andric 52ac9a064cSDimitry Andric unsigned LastEncodedPositionEnd; 53ac9a064cSDimitry Andric MachineInstr *ProducerInstr; 54ac9a064cSDimitry Andric 55ac9a064cSDimitry Andric std::array<unsigned, MaxNumberOfSkipRegions + 1> SingleUseRegions; 56ac9a064cSDimitry Andric SmallVector<unsigned, MaxNumberOfSkipRegions> SkipRegions; 57ac9a064cSDimitry Andric 58ac9a064cSDimitry Andric // Adds a skip region into the instruction. 
skip(const unsigned ProducerPosition)59ac9a064cSDimitry Andric void skip(const unsigned ProducerPosition) { 60ac9a064cSDimitry Andric while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { 61ac9a064cSDimitry Andric SkipRegions.push_back(MaxSkipRange); 62ac9a064cSDimitry Andric LastEncodedPositionEnd += MaxSkipRange; 63ac9a064cSDimitry Andric } 64ac9a064cSDimitry Andric SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd); 65ac9a064cSDimitry Andric LastEncodedPositionEnd = ProducerPosition; 66ac9a064cSDimitry Andric } 67ac9a064cSDimitry Andric currentRegionHasSpace()68ac9a064cSDimitry Andric bool currentRegionHasSpace() { 69ac9a064cSDimitry Andric const auto Region = SkipRegions.size(); 70ac9a064cSDimitry Andric // The first region has an extra bit of encoding space. 71ac9a064cSDimitry Andric return SingleUseRegions[Region] < 72ac9a064cSDimitry Andric ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); 73ac9a064cSDimitry Andric } 74ac9a064cSDimitry Andric encodeImm()75ac9a064cSDimitry Andric unsigned encodeImm() { 76ac9a064cSDimitry Andric // Handle the first Single Use Region separately as it has an extra bit 77ac9a064cSDimitry Andric // of encoding space. 
78ac9a064cSDimitry Andric unsigned Imm = SingleUseRegions[SkipRegions.size()]; 79ac9a064cSDimitry Andric unsigned ShiftAmount = 4; 80ac9a064cSDimitry Andric for (unsigned i = SkipRegions.size(); i > 0; i--) { 81ac9a064cSDimitry Andric Imm |= SkipRegions[i - 1] << ShiftAmount; 82ac9a064cSDimitry Andric ShiftAmount += 3; 83ac9a064cSDimitry Andric Imm |= SingleUseRegions[i - 1] << ShiftAmount; 84ac9a064cSDimitry Andric ShiftAmount += 3; 85ac9a064cSDimitry Andric } 86ac9a064cSDimitry Andric return Imm; 87ac9a064cSDimitry Andric } 88ac9a064cSDimitry Andric 89ac9a064cSDimitry Andric public: SingleUseInstruction(const unsigned ProducerPosition,MachineInstr * Producer)90ac9a064cSDimitry Andric SingleUseInstruction(const unsigned ProducerPosition, 91ac9a064cSDimitry Andric MachineInstr *Producer) 92ac9a064cSDimitry Andric : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), 93ac9a064cSDimitry Andric SingleUseRegions({1, 0, 0}) {} 94ac9a064cSDimitry Andric 95ac9a064cSDimitry Andric // Returns false if adding a new single use producer failed. This happens 96ac9a064cSDimitry Andric // because it could not be encoded, either because there is no room to 97ac9a064cSDimitry Andric // encode another single use producer region or that this single use 98ac9a064cSDimitry Andric // producer is too far away to encode the amount of instructions to skip. tryAddProducer(const unsigned ProducerPosition,MachineInstr * MI)99ac9a064cSDimitry Andric bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { 100ac9a064cSDimitry Andric // Producer is too far away to encode into this instruction or another 101ac9a064cSDimitry Andric // skip region is needed and SkipRegions.size() = 2 so there's no room for 102ac9a064cSDimitry Andric // another skip region, therefore a new instruction is needed. 
103ac9a064cSDimitry Andric if (LastEncodedPositionEnd + 104ac9a064cSDimitry Andric (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) < 105ac9a064cSDimitry Andric ProducerPosition) 106ac9a064cSDimitry Andric return false; 107ac9a064cSDimitry Andric 108ac9a064cSDimitry Andric // If a skip region is needed. 109ac9a064cSDimitry Andric if (LastEncodedPositionEnd != ProducerPosition || 110ac9a064cSDimitry Andric !currentRegionHasSpace()) { 111ac9a064cSDimitry Andric // If the current region is out of space therefore a skip region would 112ac9a064cSDimitry Andric // be needed, but there is no room for another skip region. 113ac9a064cSDimitry Andric if (SkipRegions.size() == MaxNumberOfSkipRegions) 114ac9a064cSDimitry Andric return false; 115ac9a064cSDimitry Andric skip(ProducerPosition); 116ac9a064cSDimitry Andric } 117ac9a064cSDimitry Andric 118ac9a064cSDimitry Andric SingleUseRegions[SkipRegions.size()]++; 119ac9a064cSDimitry Andric LastEncodedPositionEnd = ProducerPosition + 1; 120ac9a064cSDimitry Andric ProducerInstr = MI; 121ac9a064cSDimitry Andric return true; 122ac9a064cSDimitry Andric } 123ac9a064cSDimitry Andric emit(const SIInstrInfo * SII)124ac9a064cSDimitry Andric auto emit(const SIInstrInfo *SII) { 125ac9a064cSDimitry Andric return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(), 126ac9a064cSDimitry Andric SII->get(AMDGPU::S_SINGLEUSE_VDST)) 127ac9a064cSDimitry Andric .addImm(encodeImm()); 128ac9a064cSDimitry Andric } 129ac9a064cSDimitry Andric }; 130b1c73532SDimitry Andric 131b1c73532SDimitry Andric public: 132b1c73532SDimitry Andric static char ID; 133b1c73532SDimitry Andric AMDGPUInsertSingleUseVDST()134b1c73532SDimitry Andric AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {} 135b1c73532SDimitry Andric insertSingleUseInstructions(ArrayRef<std::pair<unsigned,MachineInstr * >> SingleUseProducers) const136ac9a064cSDimitry Andric void insertSingleUseInstructions( 137ac9a064cSDimitry Andric ArrayRef<std::pair<unsigned, 
MachineInstr *>> SingleUseProducers) const { 138ac9a064cSDimitry Andric SmallVector<SingleUseInstruction> Instructions; 139ac9a064cSDimitry Andric 140ac9a064cSDimitry Andric for (auto &[Position, MI] : SingleUseProducers) { 141ac9a064cSDimitry Andric // Encode this position into the last single use instruction if possible. 142ac9a064cSDimitry Andric if (Instructions.empty() || 143ac9a064cSDimitry Andric !Instructions.back().tryAddProducer(Position, MI)) { 144ac9a064cSDimitry Andric // If not, add a new instruction. 145ac9a064cSDimitry Andric Instructions.push_back(SingleUseInstruction(Position, MI)); 146ac9a064cSDimitry Andric } 147ac9a064cSDimitry Andric } 148ac9a064cSDimitry Andric 149ac9a064cSDimitry Andric for (auto &Instruction : Instructions) 150ac9a064cSDimitry Andric Instruction.emit(SII); 151b1c73532SDimitry Andric } 152b1c73532SDimitry Andric runOnMachineFunction(MachineFunction & MF)153b1c73532SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override { 154b1c73532SDimitry Andric const auto &ST = MF.getSubtarget<GCNSubtarget>(); 155b1c73532SDimitry Andric if (!ST.hasVGPRSingleUseHintInsts()) 156b1c73532SDimitry Andric return false; 157b1c73532SDimitry Andric 158b1c73532SDimitry Andric SII = ST.getInstrInfo(); 159b1c73532SDimitry Andric const auto *TRI = &SII->getRegisterInfo(); 160b1c73532SDimitry Andric bool InstructionEmitted = false; 161b1c73532SDimitry Andric 162b1c73532SDimitry Andric for (MachineBasicBlock &MBB : MF) { 163ac9a064cSDimitry Andric DenseMap<MCRegUnit, unsigned> RegisterUseCount; 164b1c73532SDimitry Andric 165b1c73532SDimitry Andric // Handle boundaries at the end of basic block separately to avoid 166b1c73532SDimitry Andric // false positives. If they are live at the end of a basic block then 167b1c73532SDimitry Andric // assume it has more uses later on. 
168ac9a064cSDimitry Andric for (const auto &Liveout : MBB.liveouts()) { 169ac9a064cSDimitry Andric for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid(); 170ac9a064cSDimitry Andric ++Units) { 171ac9a064cSDimitry Andric const auto [Unit, Mask] = *Units; 172ac9a064cSDimitry Andric if ((Mask & Liveout.LaneMask).any()) 173ac9a064cSDimitry Andric RegisterUseCount[Unit] = 2; 174ac9a064cSDimitry Andric } 175ac9a064cSDimitry Andric } 176b1c73532SDimitry Andric 177ac9a064cSDimitry Andric SmallVector<std::pair<unsigned, MachineInstr *>> 178ac9a064cSDimitry Andric SingleUseProducerPositions; 179ac9a064cSDimitry Andric 180ac9a064cSDimitry Andric unsigned VALUInstrCount = 0; 181b1c73532SDimitry Andric for (MachineInstr &MI : reverse(MBB.instrs())) { 182b1c73532SDimitry Andric // All registers in all operands need to be single use for an 183b1c73532SDimitry Andric // instruction to be marked as a single use producer. 184b1c73532SDimitry Andric bool AllProducerOperandsAreSingleUse = true; 185b1c73532SDimitry Andric 186ac9a064cSDimitry Andric // Gather a list of Registers used before updating use counts to avoid 187ac9a064cSDimitry Andric // double counting registers that appear multiple times in a single 188ac9a064cSDimitry Andric // MachineInstr. 189ac9a064cSDimitry Andric SmallVector<MCRegUnit> RegistersUsed; 190ac9a064cSDimitry Andric 191ac9a064cSDimitry Andric for (const auto &Operand : MI.all_defs()) { 192ac9a064cSDimitry Andric const auto Reg = Operand.getReg(); 193ac9a064cSDimitry Andric 194ac9a064cSDimitry Andric const auto RegUnits = TRI->regunits(Reg); 195ac9a064cSDimitry Andric if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) { 196ac9a064cSDimitry Andric return RegisterUseCount[Unit] > 1; 197ac9a064cSDimitry Andric })) 198ac9a064cSDimitry Andric AllProducerOperandsAreSingleUse = false; 199ac9a064cSDimitry Andric 200ac9a064cSDimitry Andric // Reset uses count when a register is no longer live. 
201ac9a064cSDimitry Andric for (const MCRegUnit Unit : RegUnits) 202ac9a064cSDimitry Andric RegisterUseCount.erase(Unit); 203ac9a064cSDimitry Andric } 204ac9a064cSDimitry Andric 205ac9a064cSDimitry Andric for (const auto &Operand : MI.all_uses()) { 206b1c73532SDimitry Andric const auto Reg = Operand.getReg(); 207b1c73532SDimitry Andric 208b1c73532SDimitry Andric // Count the number of times each register is read. 209ac9a064cSDimitry Andric for (const MCRegUnit Unit : TRI->regunits(Reg)) { 210ac9a064cSDimitry Andric if (!is_contained(RegistersUsed, Unit)) 211ac9a064cSDimitry Andric RegistersUsed.push_back(Unit); 212ac9a064cSDimitry Andric } 213ac9a064cSDimitry Andric } 214ac9a064cSDimitry Andric for (const MCRegUnit Unit : RegistersUsed) 215ac9a064cSDimitry Andric RegisterUseCount[Unit]++; 216b1c73532SDimitry Andric 217b1c73532SDimitry Andric // Do not attempt to optimise across exec mask changes. 218ac9a064cSDimitry Andric if (MI.modifiesRegister(AMDGPU::EXEC, TRI) || 219ac9a064cSDimitry Andric AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) { 220b1c73532SDimitry Andric for (auto &UsedReg : RegisterUseCount) 221b1c73532SDimitry Andric UsedReg.second = 2; 222b1c73532SDimitry Andric } 223b1c73532SDimitry Andric 224ac9a064cSDimitry Andric if (!SIInstrInfo::isVALU(MI) || 225ac9a064cSDimitry Andric AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode())) 226b1c73532SDimitry Andric continue; 227ac9a064cSDimitry Andric if (AllProducerOperandsAreSingleUse) { 228ac9a064cSDimitry Andric SingleUseProducerPositions.push_back({VALUInstrCount, &MI}); 229b1c73532SDimitry Andric InstructionEmitted = true; 230b1c73532SDimitry Andric } 231ac9a064cSDimitry Andric VALUInstrCount++; 232b1c73532SDimitry Andric } 233ac9a064cSDimitry Andric insertSingleUseInstructions(SingleUseProducerPositions); 234b1c73532SDimitry Andric } 235b1c73532SDimitry Andric return InstructionEmitted; 236b1c73532SDimitry Andric } 237b1c73532SDimitry Andric }; 238b1c73532SDimitry Andric } // namespace 

// Storage for the pass identifier that the pass constructor hands to
// MachineFunctionPass.
char AMDGPUInsertSingleUseVDST::ID = 0;

// Makes the pass identifier reachable from outside this file's anonymous
// namespace.
char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;

// Registers the pass under the DEBUG_TYPE name with a human-readable
// description.
// NOTE(review): the two trailing `false` arguments are presumably the
// "CFG only" and "is analysis" flags of INITIALIZE_PASS — confirm against
// llvm/PassSupport.h.
INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
                "AMDGPU Insert SingleUseVDST", false, false)