1ac9a064cSDimitry Andric //===- AMDGPUSplitModule.cpp ----------------------------------------------===//
2ac9a064cSDimitry Andric //
3ac9a064cSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4ac9a064cSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5ac9a064cSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6ac9a064cSDimitry Andric //
7ac9a064cSDimitry Andric //===----------------------------------------------------------------------===//
8ac9a064cSDimitry Andric //
9ac9a064cSDimitry Andric /// \file Implements a module splitting algorithm designed to support the
10ac9a064cSDimitry Andric /// FullLTO --lto-partitions option for parallel codegen. This is completely
11ac9a064cSDimitry Andric /// different from the common SplitModule pass, as this system is designed with
12ac9a064cSDimitry Andric /// AMDGPU in mind.
13ac9a064cSDimitry Andric ///
14ac9a064cSDimitry Andric /// The basic idea of this module splitting implementation is the same as
15ac9a064cSDimitry Andric /// SplitModule: load-balance the module's functions across a set of N
16ac9a064cSDimitry Andric /// partitions to allow parallel codegen. However, it does it very
17ac9a064cSDimitry Andric /// differently than the target-agnostic variant:
18ac9a064cSDimitry Andric /// - The module has "split roots", which are kernels in the vast
19ac9a064cSDimitry Andric ///   majority of cases.
20ac9a064cSDimitry Andric /// - Each root has a set of dependencies, and when a root and its
21ac9a064cSDimitry Andric ///   dependencies are considered "big", we try to put it in a partition where
22ac9a064cSDimitry Andric /// most dependencies are already imported, to avoid duplicating large
23ac9a064cSDimitry Andric /// amounts of code.
24ac9a064cSDimitry Andric /// - There's special care for indirect calls in order to ensure
25ac9a064cSDimitry Andric /// AMDGPUResourceUsageAnalysis can work correctly.
26ac9a064cSDimitry Andric ///
27ac9a064cSDimitry Andric /// This file also includes a more elaborate logging system to enable
28ac9a064cSDimitry Andric /// users to easily generate logs that (if desired) do not include any value
29ac9a064cSDimitry Andric /// names, in order to not leak information about the source file.
30ac9a064cSDimitry Andric /// Such logs are very helpful to understand and fix potential issues with
31ac9a064cSDimitry Andric /// module splitting.
32ac9a064cSDimitry Andric
33ac9a064cSDimitry Andric #include "AMDGPUSplitModule.h"
34ac9a064cSDimitry Andric #include "AMDGPUTargetMachine.h"
35ac9a064cSDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
36ac9a064cSDimitry Andric #include "llvm/ADT/DenseMap.h"
37ac9a064cSDimitry Andric #include "llvm/ADT/SmallVector.h"
38ac9a064cSDimitry Andric #include "llvm/ADT/StringExtras.h"
39ac9a064cSDimitry Andric #include "llvm/ADT/StringRef.h"
40ac9a064cSDimitry Andric #include "llvm/Analysis/CallGraph.h"
41ac9a064cSDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
42ac9a064cSDimitry Andric #include "llvm/IR/Function.h"
43ac9a064cSDimitry Andric #include "llvm/IR/Instruction.h"
44ac9a064cSDimitry Andric #include "llvm/IR/Module.h"
45ac9a064cSDimitry Andric #include "llvm/IR/User.h"
46ac9a064cSDimitry Andric #include "llvm/IR/Value.h"
47ac9a064cSDimitry Andric #include "llvm/Support/Casting.h"
48ac9a064cSDimitry Andric #include "llvm/Support/Debug.h"
49ac9a064cSDimitry Andric #include "llvm/Support/FileSystem.h"
50ac9a064cSDimitry Andric #include "llvm/Support/Path.h"
51ac9a064cSDimitry Andric #include "llvm/Support/Process.h"
52ac9a064cSDimitry Andric #include "llvm/Support/SHA256.h"
53ac9a064cSDimitry Andric #include "llvm/Support/Threading.h"
54ac9a064cSDimitry Andric #include "llvm/Support/raw_ostream.h"
55ac9a064cSDimitry Andric #include "llvm/Transforms/Utils/Cloning.h"
56ac9a064cSDimitry Andric #include <algorithm>
57ac9a064cSDimitry Andric #include <cassert>
58ac9a064cSDimitry Andric #include <iterator>
59ac9a064cSDimitry Andric #include <memory>
60ac9a064cSDimitry Andric #include <utility>
61ac9a064cSDimitry Andric #include <vector>
62ac9a064cSDimitry Andric
63ac9a064cSDimitry Andric using namespace llvm;
64ac9a064cSDimitry Andric
65ac9a064cSDimitry Andric #define DEBUG_TYPE "amdgpu-split-module"
66ac9a064cSDimitry Andric
67ac9a064cSDimitry Andric namespace {
68ac9a064cSDimitry Andric
69ac9a064cSDimitry Andric static cl::opt<float> LargeFnFactor(
70ac9a064cSDimitry Andric "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
71ac9a064cSDimitry Andric cl::Hidden,
72ac9a064cSDimitry Andric cl::desc(
73ac9a064cSDimitry Andric "consider a function as large and needing special treatment when the "
74ac9a064cSDimitry Andric "cost of importing it into a partition"
75ac9a064cSDimitry Andric "exceeds the average cost of a partition by this factor; e;g. 2.0 "
76ac9a064cSDimitry Andric "means if the function and its dependencies is 2 times bigger than "
77ac9a064cSDimitry Andric "an average partition; 0 disables large functions handling entirely"));
78ac9a064cSDimitry Andric
/// Minimum fraction of shared (non-kernel) dependencies two "large" functions
/// must have before they are merged into the same partition to share code.
static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
    cl::Hidden,
    cl::desc(
        "defines how much overlap between two large function's dependencies "
        "is needed to put them in the same partition"));
85ac9a064cSDimitry Andric
/// When set, globals with local linkage are left as-is instead of being
/// externalized, which may cause them to be duplicated across partitions.
static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variable with local linkage; "
             "may cause globals to be duplicated which increases binary size"));

/// Directory where splitting log files are written. Takes priority over the
/// AMD_SPLIT_MODULE_LOG_DIR environment variable (see SplitModuleLogger).
static cl::opt<std::string>
    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
              cl::desc("output directory for AMDGPU module splitting logs"));

/// When set, value names are hashed before being printed to logs. Takes
/// priority over the AMD_SPLIT_MODULE_LOG_PRIVATE environment variable.
static cl::opt<bool>
    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
               cl::desc("hash value names before printing them in the AMDGPU "
                        "module splitting logs"));
99ac9a064cSDimitry Andric
// Integer cost unit, shared with TargetTransformInfo's InstructionCost.
using CostType = InstructionCost::CostType;
// Index of a partition in the output vector (0 .. NumParts - 1).
using PartitionID = unsigned;
// Abstract getter so this code doesn't depend on how TTI is obtained.
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
103ac9a064cSDimitry Andric
/// \returns true if \p F uses an AMDGPU entry-function calling convention
/// (i.e. it is a kernel-like entry point, and thus a splitting root).
static bool isEntryPoint(const Function *F) {
  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
}
107ac9a064cSDimitry Andric
getName(const Value & V)108ac9a064cSDimitry Andric static std::string getName(const Value &V) {
109ac9a064cSDimitry Andric static bool HideNames;
110ac9a064cSDimitry Andric
111ac9a064cSDimitry Andric static llvm::once_flag HideNameInitFlag;
112ac9a064cSDimitry Andric llvm::call_once(HideNameInitFlag, [&]() {
113ac9a064cSDimitry Andric if (LogPrivate.getNumOccurrences())
114ac9a064cSDimitry Andric HideNames = LogPrivate;
115ac9a064cSDimitry Andric else {
116ac9a064cSDimitry Andric const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
117ac9a064cSDimitry Andric HideNames = (EV.value_or("0") != "0");
118ac9a064cSDimitry Andric }
119ac9a064cSDimitry Andric });
120ac9a064cSDimitry Andric
121ac9a064cSDimitry Andric if (!HideNames)
122ac9a064cSDimitry Andric return V.getName().str();
123ac9a064cSDimitry Andric return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
124ac9a064cSDimitry Andric /*LowerCase=*/true);
125ac9a064cSDimitry Andric }
126ac9a064cSDimitry Andric
127ac9a064cSDimitry Andric /// Main logging helper.
128ac9a064cSDimitry Andric ///
129ac9a064cSDimitry Andric /// Logging can be configured by the following environment variable.
130ac9a064cSDimitry Andric /// AMD_SPLIT_MODULE_LOG_DIR=<filepath>
131ac9a064cSDimitry Andric /// If set, uses <filepath> as the directory to write logfiles to
132ac9a064cSDimitry Andric /// each time module splitting is used.
133ac9a064cSDimitry Andric /// AMD_SPLIT_MODULE_LOG_PRIVATE
134ac9a064cSDimitry Andric /// If set to anything other than zero, all names are hidden.
135ac9a064cSDimitry Andric ///
136ac9a064cSDimitry Andric /// Both environment variables have corresponding CL options which
137ac9a064cSDimitry Andric /// take priority over them.
138ac9a064cSDimitry Andric ///
139ac9a064cSDimitry Andric /// Any output printed to the log files is also printed to dbgs() when -debug is
140ac9a064cSDimitry Andric /// used and LLVM_DEBUG is defined.
141ac9a064cSDimitry Andric ///
142ac9a064cSDimitry Andric /// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
143ac9a064cSDimitry Andric /// cannot be removed from the code (by building without debug). This probably
144ac9a064cSDimitry Andric /// has a small performance cost because if some computation/formatting is
145ac9a064cSDimitry Andric /// needed for logging purposes, it may be done every time only to be ignored
146ac9a064cSDimitry Andric /// by the logger.
147ac9a064cSDimitry Andric ///
148ac9a064cSDimitry Andric /// As this pass only runs once and is not doing anything computationally
149ac9a064cSDimitry Andric /// expensive, this is likely a reasonable trade-off.
150ac9a064cSDimitry Andric ///
151ac9a064cSDimitry Andric /// If some computation should really be avoided when unused, users of the class
152ac9a064cSDimitry Andric /// can check whether any logging will occur by using the bool operator.
153ac9a064cSDimitry Andric ///
154ac9a064cSDimitry Andric /// \code
155ac9a064cSDimitry Andric /// if (SML) {
156ac9a064cSDimitry Andric /// // Executes only if logging to a file or if -debug is available and
157ac9a064cSDimitry Andric /// used.
158ac9a064cSDimitry Andric /// }
159ac9a064cSDimitry Andric /// \endcode
class SplitModuleLogger {
public:
  /// Opens the log file (if a log directory is configured). The CL option
  /// takes priority over the AMD_SPLIT_MODULE_LOG_DIR environment variable.
  SplitModuleLogger(const Module &M) {
    std::string LogDir = LogDirOpt;
    if (LogDir.empty())
      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");

    // No log dir specified means we don't need to log to a file.
    // We may still log to dbgs(), though.
    if (LogDir.empty())
      return;

    // If a log directory is specified, create a new file with a unique name in
    // that directory.
    int Fd;
    SmallString<0> PathTemplate;
    SmallString<0> RealPath;
    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
    if (auto Err =
            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
                             "': " + Err.message(),
                         /*CrashDiag=*/false);
    }

    // The stream takes ownership of Fd and closes it on destruction.
    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
  }

  /// \returns true if a log file was successfully opened.
  bool hasLogFile() const { return FileOS != nullptr; }

  /// \returns the log file stream; only valid when \ref hasLogFile is true.
  raw_ostream &logfile() {
    assert(FileOS && "no logfile!");
    return *FileOS;
  }

  /// \returns true if this SML will log anything either to a file or dbgs().
  /// Can be used to avoid expensive computations that are ignored when logging
  /// is disabled.
  operator bool() const {
    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
  }

private:
  std::unique_ptr<raw_fd_ostream> FileOS;
};
205ac9a064cSDimitry Andric
/// Streams \p Val to dbgs() (when -debug is active) and to the log file (when
/// one is open). Returns \p SML so calls can be chained like a raw_ostream.
template <typename Ty>
static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
  static_assert(
      !std::is_same_v<Ty, Value>,
      "do not print values to logs directly, use handleName instead!");
  LLVM_DEBUG(dbgs() << Val);
  if (SML.hasLogFile())
    SML.logfile() << Val;
  return SML;
}
216ac9a064cSDimitry Andric
217ac9a064cSDimitry Andric /// Calculate the cost of each function in \p M
218ac9a064cSDimitry Andric /// \param SML Log Helper
219ac9a064cSDimitry Andric /// \param GetTTI Abstract getter for TargetTransformInfo.
220ac9a064cSDimitry Andric /// \param M Module to analyze.
221ac9a064cSDimitry Andric /// \param CostMap[out] Resulting Function -> Cost map.
222ac9a064cSDimitry Andric /// \return The module's total cost.
223ac9a064cSDimitry Andric static CostType
calculateFunctionCosts(SplitModuleLogger & SML,GetTTIFn GetTTI,Module & M,DenseMap<const Function *,CostType> & CostMap)224ac9a064cSDimitry Andric calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
225ac9a064cSDimitry Andric DenseMap<const Function *, CostType> &CostMap) {
226ac9a064cSDimitry Andric CostType ModuleCost = 0;
227ac9a064cSDimitry Andric CostType KernelCost = 0;
228ac9a064cSDimitry Andric
229ac9a064cSDimitry Andric for (auto &Fn : M) {
230ac9a064cSDimitry Andric if (Fn.isDeclaration())
231ac9a064cSDimitry Andric continue;
232ac9a064cSDimitry Andric
233ac9a064cSDimitry Andric CostType FnCost = 0;
234ac9a064cSDimitry Andric const auto &TTI = GetTTI(Fn);
235ac9a064cSDimitry Andric for (const auto &BB : Fn) {
236ac9a064cSDimitry Andric for (const auto &I : BB) {
237ac9a064cSDimitry Andric auto Cost =
238ac9a064cSDimitry Andric TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
239ac9a064cSDimitry Andric assert(Cost != InstructionCost::getMax());
240ac9a064cSDimitry Andric // Assume expensive if we can't tell the cost of an instruction.
241ac9a064cSDimitry Andric CostType CostVal =
242ac9a064cSDimitry Andric Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
243ac9a064cSDimitry Andric assert((FnCost + CostVal) >= FnCost && "Overflow!");
244ac9a064cSDimitry Andric FnCost += CostVal;
245ac9a064cSDimitry Andric }
246ac9a064cSDimitry Andric }
247ac9a064cSDimitry Andric
248ac9a064cSDimitry Andric assert(FnCost != 0);
249ac9a064cSDimitry Andric
250ac9a064cSDimitry Andric CostMap[&Fn] = FnCost;
251ac9a064cSDimitry Andric assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
252ac9a064cSDimitry Andric ModuleCost += FnCost;
253ac9a064cSDimitry Andric
254ac9a064cSDimitry Andric if (isEntryPoint(&Fn))
255ac9a064cSDimitry Andric KernelCost += FnCost;
256ac9a064cSDimitry Andric }
257ac9a064cSDimitry Andric
258ac9a064cSDimitry Andric CostType FnCost = (ModuleCost - KernelCost);
259ac9a064cSDimitry Andric CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
260ac9a064cSDimitry Andric SML << "=> Total Module Cost: " << ModuleCost << '\n'
261ac9a064cSDimitry Andric << " => KernelCost: " << KernelCost << " ("
262ac9a064cSDimitry Andric << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
263ac9a064cSDimitry Andric << " => FnsCost: " << FnCost << " ("
264ac9a064cSDimitry Andric << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
265ac9a064cSDimitry Andric
266ac9a064cSDimitry Andric return ModuleCost;
267ac9a064cSDimitry Andric }
268ac9a064cSDimitry Andric
canBeIndirectlyCalled(const Function & F)269ac9a064cSDimitry Andric static bool canBeIndirectlyCalled(const Function &F) {
270ac9a064cSDimitry Andric if (F.isDeclaration() || isEntryPoint(&F))
271ac9a064cSDimitry Andric return false;
272ac9a064cSDimitry Andric return !F.hasLocalLinkage() ||
273ac9a064cSDimitry Andric F.hasAddressTaken(/*PutOffender=*/nullptr,
274ac9a064cSDimitry Andric /*IgnoreCallbackUses=*/false,
275ac9a064cSDimitry Andric /*IgnoreAssumeLikeCalls=*/true,
276ac9a064cSDimitry Andric /*IgnoreLLVMUsed=*/true,
277ac9a064cSDimitry Andric /*IgnoreARCAttachedCall=*/false,
278ac9a064cSDimitry Andric /*IgnoreCastedDirectCall=*/true);
279ac9a064cSDimitry Andric }
280ac9a064cSDimitry Andric
281ac9a064cSDimitry Andric /// When a function or any of its callees performs an indirect call, this
282ac9a064cSDimitry Andric /// takes over \ref addAllDependencies and adds all potentially callable
283ac9a064cSDimitry Andric /// functions to \p Fns so they can be counted as dependencies of the function.
284ac9a064cSDimitry Andric ///
285ac9a064cSDimitry Andric /// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
286ac9a064cSDimitry Andric /// presence of an indirect call, the function's resource usage is the same as
287ac9a064cSDimitry Andric /// the most expensive function in the module.
288ac9a064cSDimitry Andric /// \param M The module.
289ac9a064cSDimitry Andric /// \param Fns[out] Resulting list of functions.
addAllIndirectCallDependencies(const Module & M,DenseSet<const Function * > & Fns)290ac9a064cSDimitry Andric static void addAllIndirectCallDependencies(const Module &M,
291ac9a064cSDimitry Andric DenseSet<const Function *> &Fns) {
292ac9a064cSDimitry Andric for (const auto &Fn : M) {
293ac9a064cSDimitry Andric if (canBeIndirectlyCalled(Fn))
294ac9a064cSDimitry Andric Fns.insert(&Fn);
295ac9a064cSDimitry Andric }
296ac9a064cSDimitry Andric }
297ac9a064cSDimitry Andric
/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
/// callee until all reachable functions have been gathered.
///
/// \param SML Log Helper
/// \param CG Call graph for \p Fn's module.
/// \param Fn Current function to look at.
/// \param Fns[out] Resulting list of functions.
/// \param OnlyDirect Whether to only consider direct callees.
/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
/// point, either in \p Fn or in one of the function it calls. When that
/// happens, we fall back to adding all callable functions inside \p Fn's module
/// to \p Fns.
static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
                               const Function &Fn,
                               DenseSet<const Function *> &Fns, bool OnlyDirect,
                               bool &HadIndirectCall) {
  assert(!Fn.isDeclaration());

  const Module &M = *Fn.getParent();
  // Iterative DFS over the call graph; Fns doubles as the visited set.
  SmallVector<const Function *> WorkList({&Fn});
  while (!WorkList.empty()) {
    const auto &CurFn = *WorkList.pop_back_val();
    assert(!CurFn.isDeclaration());

    // Scan for an indirect call. If such a call is found, we have to
    // conservatively assume this can call all non-entrypoint functions in the
    // module.

    for (auto &CGEntry : *CG[&CurFn]) {
      auto *CGNode = CGEntry.second;
      auto *Callee = CGNode->getFunction();
      if (!Callee) {
        if (OnlyDirect)
          continue;

        // Functions have an edge towards CallsExternalNode if they're external
        // declarations, or if they do an indirect call. As we only process
        // definitions here, we know this means the function has an indirect
        // call. We then have to conservatively assume this can call all
        // non-entrypoint functions in the module.
        if (CGNode != CG.getCallsExternalNode())
          continue; // this is another function-less node we don't care about.

        SML << "Indirect call detected in " << getName(CurFn)
            << " - treating all non-entrypoint functions as "
               "potential dependencies\n";

        // TODO: Print an ORE as well ?
        addAllIndirectCallDependencies(M, Fns);
        HadIndirectCall = true;
        continue;
      }

      // Declarations are compiled elsewhere; they're not dependencies.
      if (Callee->isDeclaration())
        continue;

      // Only recurse into callees we haven't seen yet.
      auto [It, Inserted] = Fns.insert(Callee);
      if (Inserted)
        WorkList.push_back(Callee);
    }
  }
}
360ac9a064cSDimitry Andric
/// Contains information about a function and its dependencies.
/// This is a splitting root. The splitting algorithm works by
/// assigning these to partitions.
struct FunctionWithDependencies {
  FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
                           const DenseMap<const Function *, CostType> &FnCosts,
                           const Function *Fn)
      : Fn(Fn) {
    // When Fn is not a kernel, we don't need to collect indirect callees.
    // Resource usage analysis is only performed on kernels, and we collect
    // indirect callees for resource usage analysis.
    addAllDependencies(SML, CG, *Fn, Dependencies,
                       /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
    // TotalCost covers the root plus everything it transitively pulls in.
    TotalCost = FnCosts.at(Fn);
    for (const auto *Dep : Dependencies) {
      TotalCost += FnCosts.at(Dep);

      // We cannot duplicate functions with external linkage, or functions that
      // may be overridden at runtime.
      HasNonDuplicatableDependecy |=
          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
    }
  }

  /// The splitting root this object describes.
  const Function *Fn = nullptr;
  /// All defined functions transitively reachable from \ref Fn.
  DenseSet<const Function *> Dependencies;
  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
  bool HasIndirectCall = false;
  /// Whether any of \p Fn's dependencies cannot be duplicated.
  bool HasNonDuplicatableDependecy = false;

  /// Cost of \ref Fn plus the cost of all of its \ref Dependencies.
  CostType TotalCost = 0;

  /// \returns true if this function and its dependencies can be considered
  /// large according to \p Threshold.
  bool isLarge(CostType Threshold) const {
    return TotalCost > Threshold && !Dependencies.empty();
  }
};
400ac9a064cSDimitry Andric
401ac9a064cSDimitry Andric /// Calculates how much overlap there is between \p A and \p B.
402ac9a064cSDimitry Andric /// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
403ac9a064cSDimitry Andric /// and B have no shared elements. Kernels do not count in overlap calculation.
calculateOverlap(const DenseSet<const Function * > & A,const DenseSet<const Function * > & B)404ac9a064cSDimitry Andric static float calculateOverlap(const DenseSet<const Function *> &A,
405ac9a064cSDimitry Andric const DenseSet<const Function *> &B) {
406ac9a064cSDimitry Andric DenseSet<const Function *> Total;
407ac9a064cSDimitry Andric for (const auto *F : A) {
408ac9a064cSDimitry Andric if (!isEntryPoint(F))
409ac9a064cSDimitry Andric Total.insert(F);
410ac9a064cSDimitry Andric }
411ac9a064cSDimitry Andric
412ac9a064cSDimitry Andric if (Total.empty())
413ac9a064cSDimitry Andric return 0.0f;
414ac9a064cSDimitry Andric
415ac9a064cSDimitry Andric unsigned NumCommon = 0;
416ac9a064cSDimitry Andric for (const auto *F : B) {
417ac9a064cSDimitry Andric if (isEntryPoint(F))
418ac9a064cSDimitry Andric continue;
419ac9a064cSDimitry Andric
420ac9a064cSDimitry Andric auto [It, Inserted] = Total.insert(F);
421ac9a064cSDimitry Andric if (!Inserted)
422ac9a064cSDimitry Andric ++NumCommon;
423ac9a064cSDimitry Andric }
424ac9a064cSDimitry Andric
425ac9a064cSDimitry Andric return static_cast<float>(NumCommon) / Total.size();
426ac9a064cSDimitry Andric }
427ac9a064cSDimitry Andric
428ac9a064cSDimitry Andric /// Performs all of the partitioning work on \p M.
429ac9a064cSDimitry Andric /// \param SML Log Helper
430ac9a064cSDimitry Andric /// \param M Module to partition.
431ac9a064cSDimitry Andric /// \param NumParts Number of partitions to create.
432ac9a064cSDimitry Andric /// \param ModuleCost Total cost of all functions in \p M.
433ac9a064cSDimitry Andric /// \param FnCosts Map of Function -> Cost
434ac9a064cSDimitry Andric /// \param WorkList Functions and their dependencies to process in order.
435ac9a064cSDimitry Andric /// \returns The created partitions (a vector of size \p NumParts )
436ac9a064cSDimitry Andric static std::vector<DenseSet<const Function *>>
doPartitioning(SplitModuleLogger & SML,Module & M,unsigned NumParts,CostType ModuleCost,const DenseMap<const Function *,CostType> & FnCosts,const SmallVector<FunctionWithDependencies> & WorkList)437ac9a064cSDimitry Andric doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
438ac9a064cSDimitry Andric CostType ModuleCost,
439ac9a064cSDimitry Andric const DenseMap<const Function *, CostType> &FnCosts,
440ac9a064cSDimitry Andric const SmallVector<FunctionWithDependencies> &WorkList) {
441ac9a064cSDimitry Andric
442ac9a064cSDimitry Andric SML << "\n--Partitioning Starts--\n";
443ac9a064cSDimitry Andric
444ac9a064cSDimitry Andric // Calculate a "large function threshold". When more than one function's total
445ac9a064cSDimitry Andric // import cost exceeds this value, we will try to assign it to an existing
446ac9a064cSDimitry Andric // partition to reduce the amount of duplication needed.
447ac9a064cSDimitry Andric //
448ac9a064cSDimitry Andric   // e.g. let two functions X and Y have an import cost of ~10% of the module, we
449ac9a064cSDimitry Andric   // assign X to a partition as usual, but when we get to Y, we check if it's
450ac9a064cSDimitry Andric   // worth also putting it in X's partition.
451ac9a064cSDimitry Andric const CostType LargeFnThreshold =
452ac9a064cSDimitry Andric LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
453ac9a064cSDimitry Andric : std::numeric_limits<CostType>::max();
454ac9a064cSDimitry Andric
455ac9a064cSDimitry Andric std::vector<DenseSet<const Function *>> Partitions;
456ac9a064cSDimitry Andric Partitions.resize(NumParts);
457ac9a064cSDimitry Andric
458ac9a064cSDimitry Andric // Assign functions to partitions, and try to keep the partitions more or
459ac9a064cSDimitry Andric // less balanced. We do that through a priority queue sorted in reverse, so we
460ac9a064cSDimitry Andric // can always look at the partition with the least content.
461ac9a064cSDimitry Andric //
462ac9a064cSDimitry Andric // There are some cases where we will be deliberately unbalanced though.
463ac9a064cSDimitry Andric // - Large functions: we try to merge with existing partitions to reduce code
464ac9a064cSDimitry Andric // duplication.
465ac9a064cSDimitry Andric // - Functions with indirect or external calls always go in the first
466ac9a064cSDimitry Andric // partition (P0).
467ac9a064cSDimitry Andric auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
468ac9a064cSDimitry Andric const std::pair<PartitionID, CostType> &b) {
469ac9a064cSDimitry Andric // When two partitions have the same cost, assign to the one with the
470ac9a064cSDimitry Andric // biggest ID first. This allows us to put things in P0 last, because P0 may
471ac9a064cSDimitry Andric // have other stuff added later.
472ac9a064cSDimitry Andric if (a.second == b.second)
473ac9a064cSDimitry Andric return a.first < b.first;
474ac9a064cSDimitry Andric return a.second > b.second;
475ac9a064cSDimitry Andric };
476ac9a064cSDimitry Andric
477ac9a064cSDimitry Andric // We can't use priority_queue here because we need to be able to access any
478ac9a064cSDimitry Andric // element. This makes this a bit inefficient as we need to sort it again
479ac9a064cSDimitry Andric // everytime we change it, but it's a very small array anyway (likely under 64
480ac9a064cSDimitry Andric // partitions) so it's a cheap operation.
481ac9a064cSDimitry Andric std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
482ac9a064cSDimitry Andric for (unsigned I = 0; I < NumParts; ++I)
483ac9a064cSDimitry Andric BalancingQueue.emplace_back(I, 0);
484ac9a064cSDimitry Andric
485ac9a064cSDimitry Andric // Helper function to handle assigning a function to a partition. This takes
486ac9a064cSDimitry Andric // care of updating the balancing queue.
487ac9a064cSDimitry Andric const auto AssignToPartition = [&](PartitionID PID,
488ac9a064cSDimitry Andric const FunctionWithDependencies &FWD) {
489ac9a064cSDimitry Andric auto &FnsInPart = Partitions[PID];
490ac9a064cSDimitry Andric FnsInPart.insert(FWD.Fn);
491ac9a064cSDimitry Andric FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
492ac9a064cSDimitry Andric
493ac9a064cSDimitry Andric SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n -> ";
494ac9a064cSDimitry Andric if (!FWD.Dependencies.empty()) {
495ac9a064cSDimitry Andric SML << FWD.Dependencies.size() << " dependencies added\n";
496ac9a064cSDimitry Andric };
497ac9a064cSDimitry Andric
498ac9a064cSDimitry Andric // Update the balancing queue. we scan backwards because in the common case
499ac9a064cSDimitry Andric // the partition is at the end.
500ac9a064cSDimitry Andric for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
501ac9a064cSDimitry Andric if (QueuePID == PID) {
502ac9a064cSDimitry Andric CostType NewCost = 0;
503ac9a064cSDimitry Andric for (auto *Fn : Partitions[PID])
504ac9a064cSDimitry Andric NewCost += FnCosts.at(Fn);
505ac9a064cSDimitry Andric
506ac9a064cSDimitry Andric SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
507ac9a064cSDimitry Andric if (Cost) {
508ac9a064cSDimitry Andric SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
509ac9a064cSDimitry Andric << "% increase)";
510ac9a064cSDimitry Andric }
511ac9a064cSDimitry Andric SML << '\n';
512ac9a064cSDimitry Andric
513ac9a064cSDimitry Andric Cost = NewCost;
514ac9a064cSDimitry Andric }
515ac9a064cSDimitry Andric }
516ac9a064cSDimitry Andric
517ac9a064cSDimitry Andric sort(BalancingQueue, ComparePartitions);
518ac9a064cSDimitry Andric };
519ac9a064cSDimitry Andric
520ac9a064cSDimitry Andric for (auto &CurFn : WorkList) {
521ac9a064cSDimitry Andric // When a function has indirect calls, it must stay in the first partition
522ac9a064cSDimitry Andric // alongside every reachable non-entry function. This is a nightmare case
523ac9a064cSDimitry Andric // for splitting as it severely limits what we can do.
524ac9a064cSDimitry Andric if (CurFn.HasIndirectCall) {
525ac9a064cSDimitry Andric SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
526ac9a064cSDimitry Andric << " defaulting to P0\n";
527ac9a064cSDimitry Andric AssignToPartition(0, CurFn);
528ac9a064cSDimitry Andric continue;
529ac9a064cSDimitry Andric }
530ac9a064cSDimitry Andric
531ac9a064cSDimitry Andric // When a function has non duplicatable dependencies, we have to keep it in
532ac9a064cSDimitry Andric // the first partition as well. This is a conservative approach, a
533ac9a064cSDimitry Andric // finer-grained approach could keep track of which dependencies are
534ac9a064cSDimitry Andric // non-duplicatable exactly and just make sure they're grouped together.
535ac9a064cSDimitry Andric if (CurFn.HasNonDuplicatableDependecy) {
536ac9a064cSDimitry Andric SML << "Function with externally visible dependency "
537ac9a064cSDimitry Andric << getName(*CurFn.Fn) << " defaulting to P0\n";
538ac9a064cSDimitry Andric AssignToPartition(0, CurFn);
539ac9a064cSDimitry Andric continue;
540ac9a064cSDimitry Andric }
541ac9a064cSDimitry Andric
542ac9a064cSDimitry Andric // Be smart with large functions to avoid duplicating their dependencies.
543ac9a064cSDimitry Andric if (CurFn.isLarge(LargeFnThreshold)) {
544ac9a064cSDimitry Andric assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
545ac9a064cSDimitry Andric SML << "Large Function: " << getName(*CurFn.Fn)
546ac9a064cSDimitry Andric << " - looking for partition with at least "
547ac9a064cSDimitry Andric << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
548ac9a064cSDimitry Andric
549ac9a064cSDimitry Andric bool Assigned = false;
550ac9a064cSDimitry Andric for (const auto &[PID, Fns] : enumerate(Partitions)) {
551ac9a064cSDimitry Andric float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
552ac9a064cSDimitry Andric SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P"
553ac9a064cSDimitry Andric << PID << '\n';
554ac9a064cSDimitry Andric if (Overlap > LargeFnOverlapForMerge) {
555ac9a064cSDimitry Andric SML << " selecting P" << PID << '\n';
556ac9a064cSDimitry Andric AssignToPartition(PID, CurFn);
557ac9a064cSDimitry Andric Assigned = true;
558ac9a064cSDimitry Andric }
559ac9a064cSDimitry Andric }
560ac9a064cSDimitry Andric
561ac9a064cSDimitry Andric if (Assigned)
562ac9a064cSDimitry Andric continue;
563ac9a064cSDimitry Andric }
564ac9a064cSDimitry Andric
565ac9a064cSDimitry Andric // Normal "load-balancing", assign to partition with least pressure.
566ac9a064cSDimitry Andric auto [PID, CurCost] = BalancingQueue.back();
567ac9a064cSDimitry Andric AssignToPartition(PID, CurFn);
568ac9a064cSDimitry Andric }
569ac9a064cSDimitry Andric
570ac9a064cSDimitry Andric if (SML) {
571ac9a064cSDimitry Andric for (const auto &[Idx, Part] : enumerate(Partitions)) {
572ac9a064cSDimitry Andric CostType Cost = 0;
573ac9a064cSDimitry Andric for (auto *Fn : Part)
574ac9a064cSDimitry Andric Cost += FnCosts.at(Fn);
575ac9a064cSDimitry Andric SML << "P" << Idx << " has a total cost of " << Cost << " ("
576ac9a064cSDimitry Andric << format("%0.2f", (float(Cost) / ModuleCost) * 100)
577ac9a064cSDimitry Andric << "% of source module)\n";
578ac9a064cSDimitry Andric }
579ac9a064cSDimitry Andric
580ac9a064cSDimitry Andric SML << "--Partitioning Done--\n\n";
581ac9a064cSDimitry Andric }
582ac9a064cSDimitry Andric
583ac9a064cSDimitry Andric // Check no functions were missed.
584ac9a064cSDimitry Andric #ifndef NDEBUG
585ac9a064cSDimitry Andric DenseSet<const Function *> AllFunctions;
586ac9a064cSDimitry Andric for (const auto &Part : Partitions)
587ac9a064cSDimitry Andric AllFunctions.insert(Part.begin(), Part.end());
588ac9a064cSDimitry Andric
589ac9a064cSDimitry Andric for (auto &Fn : M) {
590ac9a064cSDimitry Andric if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
591ac9a064cSDimitry Andric assert(AllFunctions.contains(&Fn) && "Missed a function?!");
592ac9a064cSDimitry Andric }
593ac9a064cSDimitry Andric }
594ac9a064cSDimitry Andric #endif
595ac9a064cSDimitry Andric
596ac9a064cSDimitry Andric return Partitions;
597ac9a064cSDimitry Andric }
598ac9a064cSDimitry Andric
externalize(GlobalValue & GV)599ac9a064cSDimitry Andric static void externalize(GlobalValue &GV) {
600ac9a064cSDimitry Andric if (GV.hasLocalLinkage()) {
601ac9a064cSDimitry Andric GV.setLinkage(GlobalValue::ExternalLinkage);
602ac9a064cSDimitry Andric GV.setVisibility(GlobalValue::HiddenVisibility);
603ac9a064cSDimitry Andric }
604ac9a064cSDimitry Andric
605ac9a064cSDimitry Andric // Unnamed entities must be named consistently between modules. setName will
606ac9a064cSDimitry Andric // give a distinct name to each such entity.
607ac9a064cSDimitry Andric if (!GV.hasName())
608ac9a064cSDimitry Andric GV.setName("__llvmsplit_unnamed");
609ac9a064cSDimitry Andric }
610ac9a064cSDimitry Andric
hasDirectCaller(const Function & Fn)611ac9a064cSDimitry Andric static bool hasDirectCaller(const Function &Fn) {
612ac9a064cSDimitry Andric for (auto &U : Fn.uses()) {
613ac9a064cSDimitry Andric if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
614ac9a064cSDimitry Andric return true;
615ac9a064cSDimitry Andric }
616ac9a064cSDimitry Andric return false;
617ac9a064cSDimitry Andric }
618ac9a064cSDimitry Andric
/// Top-level driver for the AMDGPU module splitting logic.
///
/// Pipeline: externalize symbols that must resolve across partitions, compute
/// per-function costs, collect the split roots (kernels plus uncalled
/// non-entry functions) into a worklist, partition via doPartitioning, then
/// clone one module per partition and hand each clone to \p ModuleCallback.
///
/// \param GetTTI Callback returning the TargetTransformInfo for a function;
///   used by calculateFunctionCosts for cost estimation.
/// \param M The module to split. NOTE: local symbols in \p M may be
///   externalized (linkage/visibility/name changed) as a side effect.
/// \param N Number of partition modules to create.
/// \param ModuleCallback Invoked once per created partition, in partition
///   order, receiving ownership of the cloned module.
static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned N,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {

  SplitModuleLogger SML(M);

  CallGraph CG(M);

  // Externalize functions whose addresses are taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits an "undefined hidden symbol"
  // linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  for (auto &Fn : M) {
    if (Fn.hasAddressTaken()) {
      if (Fn.hasLocalLinkage()) {
        SML << "[externalize] " << Fn.getName()
            << " because its address is taken\n";
      }
      externalize(Fn);
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  // (externalize() is a no-op on non-local GVs, so calling it unconditionally
  // inside the loop is safe.)
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        SML << "[externalize] GV " << GV.getName() << '\n';
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  DenseMap<const Function *, CostType> FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);

  // First, gather every kernel into the worklist.
  SmallVector<FunctionWithDependencies> WorkList;
  for (auto &Fn : M) {
    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
  }

  // Then, find missing functions that need to be considered as additional
  // roots. These can't be called in theory, but in practice we still have to
  // handle them to avoid linker errors.
  {
    // Everything already reachable from a kernel root, directly or as a
    // dependency, does not need its own root entry.
    DenseSet<const Function *> SeenFunctions;
    for (const auto &FWD : WorkList) {
      SeenFunctions.insert(FWD.Fn);
      SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
    }

    for (auto &Fn : M) {
      // If this function is not part of any kernel's dependencies and isn't
      // directly called, consider it as a root.
      if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
          !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
        WorkList.emplace_back(SML, CG, FnCosts, &Fn);
      }
    }
  }

  // Sort the worklist so the most expensive roots are seen first.
  sort(WorkList, [&](auto &A, auto &B) {
    // Sort by total cost, and if the total cost is identical, sort
    // alphabetically. The name tie-break keeps the ordering deterministic.
    if (A.TotalCost == B.TotalCost)
      return A.Fn->getName() < B.Fn->getName();
    return A.TotalCost > B.TotalCost;
  });

  if (SML) {
    SML << "Worklist\n";
    for (const auto &FWD : WorkList) {
      SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
          << " indirect:" << FWD.HasIndirectCall
          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
          << ")\n";
      // Sort function names before printing to ensure determinism.
      SmallVector<std::string> SortedDepNames;
      SortedDepNames.reserve(FWD.Dependencies.size());
      for (const auto *Dep : FWD.Dependencies)
        SortedDepNames.push_back(getName(*Dep));
      sort(SortedDepNames);

      for (const auto &Name : SortedDepNames)
        SML << " [dependency] " << Name << '\n';
    }
  }

  // This performs all of the partitioning work.
  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
  assert(Partitions.size() == N);

  // If we didn't externalize GVs, then local GVs need to be conservatively
  // imported into every module (including their initializers), and then cleaned
  // up afterwards.
  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
    // We conservatively import private/internal GVs into every module and clean
    // them up afterwards.
    const auto *Var = dyn_cast<GlobalVariable>(GV);
    return Var && Var->hasLocalLinkage();
  };

  SML << "Creating " << N << " modules...\n";
  unsigned TotalFnImpls = 0;
  for (unsigned I = 0; I < N; ++I) {
    const auto &FnsInPart = Partitions[I];

    // Clone the whole module, keeping only the bodies this partition owns;
    // everything else becomes a declaration in the clone.
    ValueToValueMapTy VMap;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV))
            return FnsInPart.contains(Fn);

          if (NeedsConservativeImport(GV))
            return true;

          // Everything else goes in the first partition.
          return I == 0;
        }));

    // Clean-up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->globals())) {
      if (NeedsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    // Count definitions (and kernels among them) for the summary log.
    unsigned NumAllFns = 0, NumKernels = 0;
    for (auto &Cur : *MPart) {
      if (!Cur.isDeclaration()) {
        ++NumAllFns;
        if (isEntryPoint(&Cur))
          ++NumKernels;
      }
    }
    TotalFnImpls += NumAllFns;
    SML << " - Module " << I << " with " << NumAllFns << " functions ("
        << NumKernels << " kernels)\n";
    ModuleCallback(std::move(MPart));
  }

  // >100% here means function bodies were duplicated across partitions.
  SML << TotalFnImpls << " function definitions across all modules ("
      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
      << "% of original module)\n";
}
775ac9a064cSDimitry Andric } // namespace
776ac9a064cSDimitry Andric
run(Module & M,ModuleAnalysisManager & MAM)777ac9a064cSDimitry Andric PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
778ac9a064cSDimitry Andric ModuleAnalysisManager &MAM) {
779ac9a064cSDimitry Andric FunctionAnalysisManager &FAM =
780ac9a064cSDimitry Andric MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
781ac9a064cSDimitry Andric const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
782ac9a064cSDimitry Andric return FAM.getResult<TargetIRAnalysis>(F);
783ac9a064cSDimitry Andric };
784ac9a064cSDimitry Andric splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
785ac9a064cSDimitry Andric // We don't change the original module.
786ac9a064cSDimitry Andric return PreservedAnalyses::all();
787ac9a064cSDimitry Andric }
788