1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18
19 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20 #include "llvm/ADT/DenseMap.h"
21 #include "llvm/ADT/DenseSet.h"
22 #include "llvm/ADT/PriorityQueue.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/ScopeExit.h"
25 #include "llvm/ADT/SetOperations.h"
26 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallBitVector.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallString.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/ADT/iterator.h"
33 #include "llvm/ADT/iterator_range.h"
34 #include "llvm/Analysis/AliasAnalysis.h"
35 #include "llvm/Analysis/AssumptionCache.h"
36 #include "llvm/Analysis/CodeMetrics.h"
37 #include "llvm/Analysis/ConstantFolding.h"
38 #include "llvm/Analysis/DemandedBits.h"
39 #include "llvm/Analysis/GlobalsModRef.h"
40 #include "llvm/Analysis/IVDescriptors.h"
41 #include "llvm/Analysis/LoopAccessAnalysis.h"
42 #include "llvm/Analysis/LoopInfo.h"
43 #include "llvm/Analysis/MemoryLocation.h"
44 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
45 #include "llvm/Analysis/ScalarEvolution.h"
46 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
47 #include "llvm/Analysis/TargetLibraryInfo.h"
48 #include "llvm/Analysis/TargetTransformInfo.h"
49 #include "llvm/Analysis/ValueTracking.h"
50 #include "llvm/Analysis/VectorUtils.h"
51 #include "llvm/IR/Attributes.h"
52 #include "llvm/IR/BasicBlock.h"
53 #include "llvm/IR/Constant.h"
54 #include "llvm/IR/Constants.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/DerivedTypes.h"
57 #include "llvm/IR/Dominators.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/IRBuilder.h"
60 #include "llvm/IR/InstrTypes.h"
61 #include "llvm/IR/Instruction.h"
62 #include "llvm/IR/Instructions.h"
63 #include "llvm/IR/IntrinsicInst.h"
64 #include "llvm/IR/Intrinsics.h"
65 #include "llvm/IR/Module.h"
66 #include "llvm/IR/Operator.h"
67 #include "llvm/IR/PatternMatch.h"
68 #include "llvm/IR/Type.h"
69 #include "llvm/IR/Use.h"
70 #include "llvm/IR/User.h"
71 #include "llvm/IR/Value.h"
72 #include "llvm/IR/ValueHandle.h"
73 #ifdef EXPENSIVE_CHECKS
74 #include "llvm/IR/Verifier.h"
75 #endif
76 #include "llvm/Pass.h"
77 #include "llvm/Support/Casting.h"
78 #include "llvm/Support/CommandLine.h"
79 #include "llvm/Support/Compiler.h"
80 #include "llvm/Support/DOTGraphTraits.h"
81 #include "llvm/Support/Debug.h"
82 #include "llvm/Support/ErrorHandling.h"
83 #include "llvm/Support/GraphWriter.h"
84 #include "llvm/Support/InstructionCost.h"
85 #include "llvm/Support/KnownBits.h"
86 #include "llvm/Support/MathExtras.h"
87 #include "llvm/Support/raw_ostream.h"
88 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
89 #include "llvm/Transforms/Utils/Local.h"
90 #include "llvm/Transforms/Utils/LoopUtils.h"
91 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
92 #include <algorithm>
93 #include <cassert>
94 #include <cstdint>
95 #include <iterator>
96 #include <memory>
97 #include <optional>
98 #include <set>
99 #include <string>
100 #include <tuple>
101 #include <utility>
102
103 using namespace llvm;
104 using namespace llvm::PatternMatch;
105 using namespace slpvectorizer;
106
107 #define SV_NAME "slp-vectorizer"
108 #define DEBUG_TYPE "SLP"
109
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

/// Master switch: when false the pass runs but performs no vectorization.
static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

/// "ReVec" mode: also consider fixed-width vector values as vectorizable
/// elements so they can be combined into wider vectors.
static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

/// Cost-model threshold: a tree is vectorized only if the computed gain
/// exceeds this value (0 = vectorize on any non-negative gain).
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
// even if we match a reduction but do not vectorize in the end.
static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

/// Upper bound (in bits) on the vector register size the cost model targets.
static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

/// Lower bound (in bits) on the vector register size the cost model targets.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead but unlike
// similar limit for operands ordering this is less frequently used, hence
// impact of higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explode.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
225
226 /// Predicate for the element types that the SLP vectorizer supports.
227 ///
228 /// The most important thing to filter here are types which are invalid in LLVM
229 /// vectors. We also filter target specific types which have absolutely no
230 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
231 /// avoids spending time checking the cost model and realizing that they will
232 /// be inevitably scalarized.
isValidElementType(Type * Ty)233 static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239 }
240
241 /// \returns the number of elements for Ty.
getNumElements(Type * Ty)242 static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
246 return VecTy->getNumElements();
247 return 1;
248 }
249
250 /// \returns the vector type of ScalarTy based on vectorization factor.
getWidenedType(Type * ScalarTy,unsigned VF)251 static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ScalarTy->getScalarType(),
253 VF * getNumElements(ScalarTy));
254 }
255
256 /// \returns True if the value is a constant (but not globals/constant
257 /// expressions).
isConstant(Value * V)258 static bool isConstant(Value *V) {
259 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
260 }
261
262 /// Checks if \p V is one of vector-like instructions, i.e. undef,
263 /// insertelement/extractelement with constant indices for fixed vector type or
264 /// extractvalue instruction.
isVectorLikeInstWithConstOps(Value * V)265 static bool isVectorLikeInstWithConstOps(Value *V) {
266 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
267 !isa<ExtractValueInst, UndefValue>(V))
268 return false;
269 auto *I = dyn_cast<Instruction>(V);
270 if (!I || isa<ExtractValueInst>(I))
271 return true;
272 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
273 return false;
274 if (isa<ExtractElementInst>(I))
275 return isConstant(I->getOperand(1));
276 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
277 return isConstant(I->getOperand(2));
278 }
279
280 /// Returns power-of-2 number of elements in a single register (part), given the
281 /// total number of elements \p Size and number of registers (parts) \p
282 /// NumParts.
getPartNumElems(unsigned Size,unsigned NumParts)283 static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
284 return PowerOf2Ceil(divideCeil(Size, NumParts));
285 }
286
287 /// Returns correct remaining number of elements, considering total amount \p
288 /// Size, (power-of-2 number) of elements in a single register \p PartNumElems
289 /// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // The last part may hold fewer than PartNumElems elements.
  unsigned Remaining = Size - Part * PartNumElems;
  return std::min(PartNumElems, Remaining);
}
294
295 #if !defined(NDEBUG)
296 /// Print a short descriptor of the instruction bundle suitable for debug output.
shortBundleName(ArrayRef<Value * > VL)297 static std::string shortBundleName(ArrayRef<Value *> VL) {
298 std::string Result;
299 raw_string_ostream OS(Result);
300 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
301 OS.flush();
302 return Result;
303 }
304 #endif
305
306 /// \returns true if all of the instructions in \p VL are in the same block or
307 /// false otherwise.
allSameBlock(ArrayRef<Value * > VL)308 static bool allSameBlock(ArrayRef<Value *> VL) {
309 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
310 if (!I0)
311 return false;
312 if (all_of(VL, isVectorLikeInstWithConstOps))
313 return true;
314
315 BasicBlock *BB = I0->getParent();
316 for (int I = 1, E = VL.size(); I < E; I++) {
317 auto *II = dyn_cast<Instruction>(VL[I]);
318 if (!II)
319 return false;
320
321 if (BB != II->getParent())
322 return false;
323 }
324 return true;
325 }
326
327 /// \returns True if all of the values in \p VL are constants (but not
328 /// globals/constant expressions).
allConstant(ArrayRef<Value * > VL)329 static bool allConstant(ArrayRef<Value *> VL) {
330 // Constant expressions and globals can't be vectorized like normal integer/FP
331 // constants.
332 return all_of(VL, isConstant);
333 }
334
335 /// \returns True if all of the values in \p VL are identical or some of them
336 /// are UndefValue.
isSplat(ArrayRef<Value * > VL)337 static bool isSplat(ArrayRef<Value *> VL) {
338 Value *FirstNonUndef = nullptr;
339 for (Value *V : VL) {
340 if (isa<UndefValue>(V))
341 continue;
342 if (!FirstNonUndef) {
343 FirstNonUndef = V;
344 continue;
345 }
346 if (V != FirstNonUndef)
347 return false;
348 }
349 return FirstNonUndef != nullptr;
350 }
351
352 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    // Besides intrinsically commutative binops, a Sub/FSub is treated as
    // commutative when every user only observes a sign-insensitive result
    // (x - y and y - x differ only in sign). The UsesLimit cap bounds the
    // cost of scanning the use list.
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  ICmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              // Commutative if the only user is fabs(fsub ...).
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  // Fall back to the generic instruction-level query.
  return I->isCommutative();
}
385
/// Computes a flattened lane index for an insertelement/extractelement
/// \p Inst with \p Offset as the base: Offset * NumElts + constant lane.
/// \returns std::nullopt when \p Inst is not of type \p T, the relevant type
/// is not a fixed vector type, or the lane index is non-constant/out of range.
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    // NOTE(review): getType() is the vector type for insertelement, but the
    // scalar result type for extractelement, so the extractelement
    // instantiation bails out here - confirm this asymmetry is intended.
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    // An out-of-range constant index is treated as unknown.
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    // Flatten: scale the base offset by the vector width, then add the lane.
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}
408
409 /// \returns inserting or extracting index of InsertElement, ExtractElement or
410 /// InsertValue instruction, using Offset as base offset for index.
411 /// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  // Handle insertelement/extractelement via the templated helper first.
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  // The only remaining supported instruction is insertvalue.
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  // Flatten the (possibly nested) aggregate indices into one linear index,
  // row-major: at each nesting level, scale by the element count of that
  // level and add the level's index.
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      // Unsupported aggregate shape.
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
440
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask. See buildUseMask below for how each mode is applied.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
457
458 /// Prepares a use bitset for the given mask either for the first argument or
459 /// for the second.
buildUseMask(int VF,ArrayRef<int> Mask,UseMask MaskArg)460 static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
461 UseMask MaskArg) {
462 SmallBitVector UseMask(VF, true);
463 for (auto [Idx, Value] : enumerate(Mask)) {
464 if (Value == PoisonMaskElem) {
465 if (MaskArg == UseMask::UndefsAsMask)
466 UseMask.reset(Idx);
467 continue;
468 }
469 if (MaskArg == UseMask::FirstArg && Value < VF)
470 UseMask.reset(Value);
471 else if (MaskArg == UseMask::SecondArg && Value >= VF)
472 UseMask.reset(Value - VF);
473 }
474 return UseMask;
475 }
476
477 /// Checks if the given value is actually an undefined constant vector.
478 /// Also, if the \p UseMask is not empty, tries to check if the non-masked
479 /// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  // Bits start set and are cleared once an element is known not to be
  // undef (or not poison, when IsPoisonOnly).
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      // Walk an insertelement chain: an element inserted with a non-undef
      // value at a known index is definitely not undef.
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          // A non-constant index may overwrite any element - give up.
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        // Combine with the analysis of the chain's base vector.
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  // Constant vector: inspect each aggregate element directly.
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
526
527 /// Checks if the vector of instructions can be represented as a shuffle, like:
528 /// %x0 = extractelement <4 x i8> %x, i32 0
529 /// %x3 = extractelement <4 x i8> %x, i32 3
530 /// %y1 = extractelement <4 x i8> %y, i32 1
531 /// %y2 = extractelement <4 x i8> %y, i32 2
532 /// %x0x0 = mul i8 %x0, %x0
533 /// %x3x3 = mul i8 %x3, %x3
534 /// %y1y1 = mul i8 %y1, %y1
535 /// %y2y2 = mul i8 %y2, %y2
536 /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
537 /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
538 /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
539 /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
540 /// ret <4 x i8> %ins4
541 /// can be transformed into:
542 /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
543 /// i32 6>
544 /// %2 = mul <4 x i8> %1, %1
545 /// ret <4 x i8> %2
546 /// Mask will return the Shuffle Mask equivalent to the extracted elements.
547 /// TODO: Can we split off and reuse the shuffle mask detection from
548 /// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  // Require at least one extractelement to analyze.
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  // Size = widest source vector among all the extracts.
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // True when at least one source vector is known not to be poison.
  bool HasNonUndefVec = any_of(VL, [](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      // Lanes coming from the second source are offset by Size in the mask.
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
635
636 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
getExtractIndex(Instruction * E)637 static std::optional<unsigned> getExtractIndex(Instruction *E) {
638 unsigned Opcode = E->getOpcode();
639 assert((Opcode == Instruction::ExtractElement ||
640 Opcode == Instruction::ExtractValue) &&
641 "Expected extractelement or extractvalue instruction.");
642 if (Opcode == Instruction::ExtractElement) {
643 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
644 if (!CI)
645 return std::nullopt;
646 return CI->getZExtValue();
647 }
648 auto *EI = cast<ExtractValueInst>(E);
649 if (EI->getNumIndices() != 1)
650 return std::nullopt;
651 return *EI->idx_begin();
652 }
653
namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// \returns the opcode of the main instruction, or 0 when MainOp is unset
  /// (invalid state).
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  /// \returns the opcode of the alternate instruction, or 0 when AltOp is
  /// unset.
  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  /// \returns true if \p I's opcode matches either the main or the alternate
  /// opcode.
  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace
688
689 /// Chooses the correct key for scheduling data. If \p Op has the same (or
690 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691 /// OpValue.
isOneOf(const InstructionsState & S,Value * Op)692 static Value *isOneOf(const InstructionsState &S, Value *Op) {
693 auto *I = dyn_cast<Instruction>(Op);
694 if (I && S.isOpcodeOrAlt(I))
695 return Op;
696 return S.OpValue;
697 }
698
699 /// \returns true if \p Opcode is allowed as part of the main/alternate
700 /// instruction for SLP vectorization.
701 ///
702 /// Example of unsupported opcode is SDIV that can potentially cause UB if the
703 /// "shuffled out" lane would result in division by zero.
isValidForAlternation(unsigned Opcode)704 static bool isValidForAlternation(unsigned Opcode) {
705 if (Instruction::isIntDivRem(Opcode))
706 return false;
707
708 return true;
709 }
710
711 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712 const TargetLibraryInfo &TLI,
713 unsigned BaseIndex = 0);
714
715 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716 /// compatible instructions or constants, or just some other regular values.
areCompatibleCmpOps(Value * BaseOp0,Value * BaseOp1,Value * Op0,Value * Op1,const TargetLibraryInfo & TLI)717 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718 Value *Op1, const TargetLibraryInfo &TLI) {
719 return (isConstant(BaseOp0) && isConstant(Op0)) ||
720 (isConstant(BaseOp1) && isConstant(Op1)) ||
721 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
722 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
723 BaseOp0 == Op0 || BaseOp1 == Op1 ||
724 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
725 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
726 }
727
728 /// \returns true if a compare instruction \p CI has similar "look" and
729 /// same predicate as \p BaseCI, "as is" or with its operands and predicate
730 /// swapped, false otherwise.
isCmpSameOrSwapped(const CmpInst * BaseCI,const CmpInst * CI,const TargetLibraryInfo & TLI)731 static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732 const TargetLibraryInfo &TLI) {
733 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734 "Assessing comparisons of different types?");
735 CmpInst::Predicate BasePred = BaseCI->getPredicate();
736 CmpInst::Predicate Pred = CI->getPredicate();
737 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
738
739 Value *BaseOp0 = BaseCI->getOperand(0);
740 Value *BaseOp1 = BaseCI->getOperand(1);
741 Value *Op0 = CI->getOperand(0);
742 Value *Op1 = CI->getOperand(1);
743
744 return (BasePred == Pred &&
745 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746 (BasePred == SwappedPred &&
747 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
748 }
749
750 /// \returns analysis of the Instructions in \p VL described in
751 /// InstructionsState, the Opcode that we suppose the whole list
752 /// could be vectorized even if its structure is diverse.
/// Determines whether all instructions in \p VL share a single opcode, or a
/// main/alternate opcode pair that can be vectorized as an alternate-shuffle
/// node. Returns an InstructionsState with null opcodes on failure.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  // Classify the base instruction; only binops, casts and compares may form
  // a main/alternate opcode pair below.
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Detect the case where, after canonicalizing swapped compare predicates,
  // only two distinct predicates remain - then swapped predicates can be
  // treated as the same opcode (operands swapped) instead of alternates.
  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if consider swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    // Calls are only vectorizable if they map to a vector intrinsic or have
    // vector-function ABI mappings.
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      // Adopt this opcode as the alternate if none was chosen yet.
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      // Casts may alternate only when their source types agree.
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          // An alternate compare is already chosen; this one must match it.
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      // Same opcode, but some instruction kinds need extra compatibility
      // checks before they can share one vector node.
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        // Volatile/atomic loads must not be merged.
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        // Calls must target the same function, with matching operand bundles
        // and matching intrinsic/vector-ABI mapping.
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex())))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
912
913 /// \returns true if all of the values in \p VL have the same type or false
914 /// otherwise.
allSameType(ArrayRef<Value * > VL)915 static bool allSameType(ArrayRef<Value *> VL) {
916 Type *Ty = VL.front()->getType();
917 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
918 }
919
920 /// \returns True if in-tree use also needs extract. This refers to
921 /// possible scalar operand in vectorized instruction.
doesInTreeUserNeedToExtract(Value * Scalar,Instruction * UserInst,TargetLibraryInfo * TLI)922 static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
923 TargetLibraryInfo *TLI) {
924 unsigned Opcode = UserInst->getOpcode();
925 switch (Opcode) {
926 case Instruction::Load: {
927 LoadInst *LI = cast<LoadInst>(UserInst);
928 return (LI->getPointerOperand() == Scalar);
929 }
930 case Instruction::Store: {
931 StoreInst *SI = cast<StoreInst>(UserInst);
932 return (SI->getPointerOperand() == Scalar);
933 }
934 case Instruction::Call: {
935 CallInst *CI = cast<CallInst>(UserInst);
936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
937 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
938 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
939 Arg.value().get() == Scalar;
940 });
941 }
942 default:
943 return false;
944 }
945 }
946
947 /// \returns the AA location that is being access by the instruction.
getLocation(Instruction * I)948 static MemoryLocation getLocation(Instruction *I) {
949 if (StoreInst *SI = dyn_cast<StoreInst>(I))
950 return MemoryLocation::get(SI);
951 if (LoadInst *LI = dyn_cast<LoadInst>(I))
952 return MemoryLocation::get(LI);
953 return MemoryLocation();
954 }
955
956 /// \returns True if the instruction is not a volatile or atomic load/store.
isSimple(Instruction * I)957 static bool isSimple(Instruction *I) {
958 if (LoadInst *LI = dyn_cast<LoadInst>(I))
959 return LI->isSimple();
960 if (StoreInst *SI = dyn_cast<StoreInst>(I))
961 return SI->isSimple();
962 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
963 return !MI->isVolatile();
964 return true;
965 }
966
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  // An empty submask leaves the accumulated mask untouched.
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  // First mask in the chain - just adopt the submask as-is.
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  // Compose the masks: NewMask[I] = Mask[SubMask[I]], keeping poison for
  // undefined entries. Without ExtendingManyInputs, indices that point past
  // the smaller of the two masks are treated as poison as well.
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
996
997 /// Order may have elements assigned special value (size) which is out of
998 /// bounds. Such indices only appear on places which correspond to undef values
999 /// (see canReuseExtract for details) and used in order to avoid undef values
1000 /// have effect on operands ordering.
1001 /// The first loop below simply finds all unused indices and then the next loop
1002 /// nest assigns these indices for undef values positions.
1003 /// As an example below Order has two undef positions and they have assigned
1004 /// values 3 and 7 respectively:
1005 /// before: 6 9 5 4 9 2 1 0
1006 /// after: 6 3 5 4 7 2 1 0
fixupOrderingIndices(MutableArrayRef<unsigned> Order)1007 static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1008 const unsigned Sz = Order.size();
1009 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1010 SmallBitVector MaskedIndices(Sz);
1011 for (unsigned I = 0; I < Sz; ++I) {
1012 if (Order[I] < Sz)
1013 UnusedIndices.reset(Order[I]);
1014 else
1015 MaskedIndices.set(I);
1016 }
1017 if (MaskedIndices.none())
1018 return;
1019 assert(UnusedIndices.count() == MaskedIndices.count() &&
1020 "Non-synced masked/available indices.");
1021 int Idx = UnusedIndices.find_first();
1022 int MIdx = MaskedIndices.find_first();
1023 while (MIdx >= 0) {
1024 assert(Idx >= 0 && "Indices must be synced.");
1025 Order[MIdx] = Idx;
1026 Idx = UnusedIndices.find_next(Idx);
1027 MIdx = MaskedIndices.find_next(MIdx);
1028 }
1029 }
1030
1031 /// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1032 /// Opcode1.
getAltInstrMask(ArrayRef<Value * > VL,unsigned Opcode0,unsigned Opcode1)1033 SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1034 unsigned Opcode1) {
1035 SmallBitVector OpcodeMask(VL.size(), false);
1036 for (unsigned Lane : seq<unsigned>(VL.size()))
1037 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1038 OpcodeMask.set(Lane);
1039 return OpcodeMask;
1040 }
1041
1042 namespace llvm {
1043
inversePermutation(ArrayRef<unsigned> Indices,SmallVectorImpl<int> & Mask)1044 static void inversePermutation(ArrayRef<unsigned> Indices,
1045 SmallVectorImpl<int> &Mask) {
1046 Mask.clear();
1047 const unsigned E = Indices.size();
1048 Mask.resize(E, PoisonMaskElem);
1049 for (unsigned I = 0; I < E; ++I)
1050 Mask[Indices[I]] = I;
1051 }
1052
1053 /// Reorders the list of scalars in accordance with the given \p Mask.
reorderScalars(SmallVectorImpl<Value * > & Scalars,ArrayRef<int> Mask)1054 static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1055 ArrayRef<int> Mask) {
1056 assert(!Mask.empty() && "Expected non-empty mask.");
1057 SmallVector<Value *> Prev(Scalars.size(),
1058 PoisonValue::get(Scalars.front()->getType()));
1059 Prev.swap(Scalars);
1060 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1061 if (Mask[I] != PoisonMaskElem)
1062 Scalars[Mask[I]] = Prev[I];
1063 }
1064
1065 /// Checks if the provided value does not require scheduling. It does not
1066 /// require scheduling if this is not an instruction or it is an instruction
1067 /// that does not read/write memory and all operands are either not instructions
1068 /// or phi nodes or instructions from different blocks.
areAllOperandsNonInsts(Value * V)1069 static bool areAllOperandsNonInsts(Value *V) {
1070 auto *I = dyn_cast<Instruction>(V);
1071 if (!I)
1072 return true;
1073 return !mayHaveNonDefUseDependency(*I) &&
1074 all_of(I->operands(), [I](Value *V) {
1075 auto *IO = dyn_cast<Instruction>(V);
1076 if (!IO)
1077 return true;
1078 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1079 });
1080 }
1081
1082 /// Checks if the provided value does not require scheduling. It does not
1083 /// require scheduling if this is not an instruction or it is an instruction
1084 /// that does not read/write memory and all users are phi nodes or instructions
1085 /// from the different blocks.
isUsedOutsideBlock(Value * V)1086 static bool isUsedOutsideBlock(Value *V) {
1087 auto *I = dyn_cast<Instruction>(V);
1088 if (!I)
1089 return true;
1090 // Limits the number of uses to save compile time.
1091 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1092 all_of(I->users(), [I](User *U) {
1093 auto *IU = dyn_cast<Instruction>(U);
1094 if (!IU)
1095 return true;
1096 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1097 });
1098 }
1099
1100 /// Checks if the specified value does not require scheduling. It does not
1101 /// require scheduling if all operands and all users do not need to be scheduled
1102 /// in the current basic block.
doesNotNeedToBeScheduled(Value * V)1103 static bool doesNotNeedToBeScheduled(Value *V) {
1104 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1105 }
1106
1107 /// Checks if the specified array of instructions does not require scheduling.
1108 /// It is so if all either instructions have operands that do not require
1109 /// scheduling or their users do not require scheduling since they are phis or
1110 /// in other basic blocks.
doesNotNeedToSchedule(ArrayRef<Value * > VL)1111 static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1112 return !VL.empty() &&
1113 (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
1114 }
1115
1116 namespace slpvectorizer {
1117
1118 /// Bottom Up SLP Vectorizer.
1119 class BoUpSLP {
1120 struct TreeEntry;
1121 struct ScheduleData;
1122 class ShuffleCostEstimator;
1123 class ShuffleInstructionBuilder;
1124
1125 public:
1126 /// Tracks the state we can represent the loads in the given sequence.
1127 enum class LoadsState {
1128 Gather,
1129 Vectorize,
1130 ScatterVectorize,
1131 StridedVectorize
1132 };
1133
1134 using ValueList = SmallVector<Value *, 8>;
1135 using InstrList = SmallVector<Instruction *, 16>;
1136 using ValueSet = SmallPtrSet<Value *, 16>;
1137 using StoreList = SmallVector<StoreInst *, 8>;
1138 using ExtraValueToDebugLocsMap =
1139 MapVector<Value *, SmallVector<Instruction *, 2>>;
1140 using OrdersType = SmallVector<unsigned, 4>;
1141
  /// Constructs the SLP vectorizer context for function \p Func from the
  /// supplied analyses, and derives the min/max vector register sizes used to
  /// bound the vectorization factor.
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // Ephemeral values (only feeding assumes) must not be vectorized.
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    //       data type rather than just register size. For example, x86 AVX has
    //       256-bit registers, but it does not support integer operations
    //       at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
1168
1169 /// Vectorize the tree that starts with the elements in \p VL.
1170 /// Returns the vectorized root.
1171 Value *vectorizeTree();
1172
1173 /// Vectorize the tree but with the list of externally used values \p
1174 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1175 /// generated extractvalue instructions.
1176 /// \param ReplacedExternals containd list of replaced external values
1177 /// {scalar, replace} after emitting extractelement for external uses.
1178 Value *
1179 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1180 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1181 Instruction *ReductionRoot = nullptr);
1182
1183 /// \returns the cost incurred by unwanted spills and fills, caused by
1184 /// holding live values over call sites.
1185 InstructionCost getSpillCost() const;
1186
1187 /// \returns the vectorization cost of the subtree that starts at \p VL.
1188 /// A negative number means that this is profitable.
1189 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1190
1191 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1192 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1193 void buildTree(ArrayRef<Value *> Roots,
1194 const SmallDenseSet<Value *> &UserIgnoreLst);
1195
1196 /// Construct a vectorizable tree that starts at \p Roots.
1197 void buildTree(ArrayRef<Value *> Roots);
1198
1199 /// Returns whether the root node has in-tree uses.
doesRootHaveInTreeUses() const1200 bool doesRootHaveInTreeUses() const {
1201 return !VectorizableTree.empty() &&
1202 !VectorizableTree.front()->UserTreeIndices.empty();
1203 }
1204
1205 /// Return the scalars of the root node.
getRootNodeScalars() const1206 ArrayRef<Value *> getRootNodeScalars() const {
1207 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1208 return VectorizableTree.front()->Scalars;
1209 }
1210
1211 /// Checks if the root graph node can be emitted with narrower bitwidth at
1212 /// codegen and returns it signedness, if so.
isSignedMinBitwidthRootNode() const1213 bool isSignedMinBitwidthRootNode() const {
1214 return MinBWs.at(VectorizableTree.front().get()).second;
1215 }
1216
1217 /// Builds external uses of the vectorized scalars, i.e. the list of
1218 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1219 /// ExternallyUsedValues contains additional list of external uses to handle
1220 /// vectorization of reductions.
1221 void
1222 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1223
1224 /// Transforms graph nodes to target specific representations, if profitable.
1225 void transformNodes();
1226
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    // Graph nodes and scalar-to-node mappings.
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    // Recorded external uses of vectorized scalars.
    ExternalUses.clear();
    ExternalUsesAsGEPs.clear();
    // Reset per-block scheduling regions; the map itself is kept so the
    // regions can be reused for the next tree.
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    // Minimum-bitwidth analysis state.
    MinBWs.clear();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
1250
getTreeSize() const1251 unsigned getTreeSize() const { return VectorizableTree.size(); }
1252
1253 /// Perform LICM and CSE on the newly generated gather sequences.
1254 void optimizeGatherSequence();
1255
1256 /// Checks if the specified gather tree entry \p TE can be represented as a
1257 /// shuffled vector entry + (possibly) permutation with other gathers. It
1258 /// implements the checks only for possibly ordered scalars (Loads,
1259 /// ExtractElement, ExtractValue), which can be part of the graph.
1260 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1261
1262 /// Sort loads into increasing pointers offsets to allow greater clustering.
1263 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1264
1265 /// Gets reordering data for the given tree entry. If the entry is vectorized
1266 /// - just return ReorderIndices, otherwise check if the scalars can be
1267 /// reordered and return the most optimal order.
1268 /// \return std::nullopt if ordering is not important, empty order, if
1269 /// identity order is important, or the actual order.
1270 /// \param TopToBottom If true, include the order of vectorized stores and
1271 /// insertelement nodes, otherwise skip them.
1272 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1273 bool TopToBottom);
1274
1275 /// Reorders the current graph to the most profitable order starting from the
1276 /// root node to the leaf nodes. The best order is chosen only from the nodes
1277 /// of the same size (vectorization factor). Smaller nodes are considered
1278 /// parts of subgraph with smaller VF and they are reordered independently. We
1279 /// can make it because we still need to extend smaller nodes to the wider VF
1280 /// and we can merge reordering shuffles with the widening shuffles.
1281 void reorderTopToBottom();
1282
1283 /// Reorders the current graph to the most profitable order starting from
1284 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1285 /// number of reshuffles if the leaf nodes use the same order. In this case we
1286 /// can merge the orders and just shuffle user node instead of shuffling its
1287 /// operands. Plus, even the leaf nodes have different orders, it allows to
1288 /// sink reordering in the graph closer to the root node and merge it later
1289 /// during analysis.
1290 void reorderBottomToTop(bool IgnoreReorder = false);
1291
1292 /// \return The vector element size in bits to use when vectorizing the
1293 /// expression tree ending at \p V. If V is a store, the size is the width of
1294 /// the stored value. Otherwise, the size is the width of the largest loaded
1295 /// value reaching V. This method is used by the vectorizer to calculate
1296 /// vectorization factors.
1297 unsigned getVectorElementSize(Value *V);
1298
1299 /// Compute the minimum type sizes required to represent the entries in a
1300 /// vectorizable tree.
1301 void computeMinimumValueSizes();
1302
1303 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
getMaxVecRegSize() const1304 unsigned getMaxVecRegSize() const {
1305 return MaxVecRegSize;
1306 }
1307
1308 // \returns minimum vector register size as set by cl::opt.
getMinVecRegSize() const1309 unsigned getMinVecRegSize() const {
1310 return MinVecRegSize;
1311 }
1312
getMinVF(unsigned Sz) const1313 unsigned getMinVF(unsigned Sz) const {
1314 return std::max(2U, getMinVecRegSize() / Sz);
1315 }
1316
getMaximumVF(unsigned ElemWidth,unsigned Opcode) const1317 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1318 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1319 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1320 return MaxVF ? MaxVF : UINT_MAX;
1321 }
1322
1323 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1324 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1325 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1326 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1327 ///
1328 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1329 unsigned canMapToVector(Type *T) const;
1330
1331 /// \returns True if the VectorizableTree is both tiny and not fully
1332 /// vectorizable. We do not vectorize such trees.
1333 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1334
1335 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1336 /// can be load combined in the backend. Load combining may not be allowed in
1337 /// the IR optimizer, so we do not want to alter the pattern. For example,
1338 /// partially transforming a scalar bswap() pattern into vector code is
1339 /// effectively impossible for the backend to undo.
1340 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1341 /// may not be necessary.
1342 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1343
1344 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1345 /// can be load combined in the backend. Load combining may not be allowed in
1346 /// the IR optimizer, so we do not want to alter the pattern. For example,
1347 /// partially transforming a scalar bswap() pattern into vector code is
1348 /// effectively impossible for the backend to undo.
1349 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1350 /// may not be necessary.
1351 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1352
1353 /// Checks if the given array of loads can be represented as a vectorized,
1354 /// scatter or just simple gather.
1355 /// \param VL list of loads.
1356 /// \param VL0 main load value.
1357 /// \param Order returned order of load instructions.
1358 /// \param PointerOps returned list of pointer operands.
1359 /// \param TryRecursiveCheck used to check if long masked gather can be
1360 /// represented as a serie of loads/insert subvector, if profitable.
1361 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1362 SmallVectorImpl<unsigned> &Order,
1363 SmallVectorImpl<Value *> &PointerOps,
1364 bool TryRecursiveCheck = true) const;
1365
getORE()1366 OptimizationRemarkEmitter *getORE() { return ORE; }
1367
  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    /// Stream an EdgeInfo for debugging.
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
    /// Two edges are equal iff they have the same user entry and operand
    /// index.
    bool operator == (const EdgeInfo &Other) const {
      return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
    }
  };
1397
1398 /// A helper class used for scoring candidates for two consecutive lanes.
1399 class LookAheadHeuristics {
1400 const TargetLibraryInfo &TLI;
1401 const DataLayout &DL;
1402 ScalarEvolution &SE;
1403 const BoUpSLP &R;
1404 int NumLanes; // Total number of lanes (aka vectorization factor).
1405 int MaxLevel; // The maximum recursion depth for accumulating score.
1406
1407 public:
    /// Captures the analyses plus the vectorization factor (\p NumLanes) and
    /// maximum look-ahead recursion depth (\p MaxLevel) used when scoring
    /// candidate lane pairings.
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
1413
1414 // The hard-coded scores listed here are not very important, though it shall
1415 // be higher for better matches to improve the resulting cost. When
1416 // computing the scores of matching one sub-tree with another, we are
1417 // basically counting the number of values that are matching. So even if all
1418 // scores are set to 1, we would still get a decent matching result.
1419 // However, sometimes we have to break ties. For example we may have to
1420 // choose between matching loads vs matching opcodes. This is what these
1421 // scores are helping us with: they provide the order of preference. Also,
1422 // this is important if the scalar is externally used or used in another
1423 // tree entry node in the different lane.
1424
1425 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1426 static const int ScoreConsecutiveLoads = 4;
1427 /// The same load multiple times. This should have a better score than
1428 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1429 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1430 /// a vector load and 1.0 for a broadcast.
1431 static const int ScoreSplatLoads = 3;
1432 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1433 static const int ScoreReversedLoads = 3;
1434 /// A load candidate for masked gather.
1435 static const int ScoreMaskedGatherCandidate = 1;
1436 /// ExtractElementInst from same vector and consecutive indexes.
1437 static const int ScoreConsecutiveExtracts = 4;
1438 /// ExtractElementInst from same vector and reversed indices.
1439 static const int ScoreReversedExtracts = 3;
1440 /// Constants.
1441 static const int ScoreConstants = 2;
1442 /// Instructions with the same opcode.
1443 static const int ScoreSameOpcode = 2;
1444 /// Instructions with alt opcodes (e.g, add + sub).
1445 static const int ScoreAltOpcodes = 1;
1446 /// Identical instructions (a.k.a. splat or broadcast).
1447 static const int ScoreSplat = 1;
1448 /// Matching with an undef is preferable to failing.
1449 static const int ScoreUndef = 1;
1450 /// Score for failing to find a decent match.
1451 static const int ScoreFail = 0;
1452 /// Score if all users are vectorized.
1453 static const int ScoreAllUserVectorized = 1;
1454
1455 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1456 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1457 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1458 /// MainAltOps.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                    ArrayRef<Value *> MainAltOps) const {
  // Values of types that cannot be vectorized never pair up.
  if (!isValidElementType(V1->getType()) ||
      !isValidElementType(V2->getType()))
    return LookAheadHeuristics::ScoreFail;

  // Identical values form a splat (broadcast).
  if (V1 == V2) {
    if (isa<LoadInst>(V1)) {
      // Returns true if the users of V1 and V2 won't need to be extracted.
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        // Bail out if we have too many uses to save compilation time.
        if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
          return false;

        // A user is "internal" if it is one of the two direct users under
        // analysis or already belongs to the vectorization tree.
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          });
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      // A broadcast of a load can be cheaper on some targets.
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                      ElementCount::getFixed(NumLanes)) &&
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
        return LookAheadHeuristics::ScoreSplatLoads;
    }
    return LookAheadHeuristics::ScoreSplat;
  }

  // Fallback used below: if both values already sit in the same tree
  // entry, score them like splat loads; otherwise fail.
  auto CheckSameEntryOrFail = [&]() {
    if (const TreeEntry *TE1 = R.getTreeEntry(V1);
        TE1 && TE1 == R.getTreeEntry(V2))
      return LookAheadHeuristics::ScoreSplatLoads;
    return LookAheadHeuristics::ScoreFail;
  };

  auto *LI1 = dyn_cast<LoadInst>(V1);
  auto *LI2 = dyn_cast<LoadInst>(V2);
  if (LI1 && LI2) {
    // Only simple loads from the same block can pair up.
    if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
        !LI2->isSimple())
      return CheckSameEntryOrFail();

    std::optional<int> Dist = getPointersDiff(
        LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
        LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
    if (!Dist || *Dist == 0) {
      // Unknown or zero distance: a masked gather may still be profitable
      // if both loads access the same underlying object.
      if (getUnderlyingObject(LI1->getPointerOperand()) ==
              getUnderlyingObject(LI2->getPointerOperand()) &&
          R.TTI->isLegalMaskedGather(
              getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
        return LookAheadHeuristics::ScoreMaskedGatherCandidate;
      return CheckSameEntryOrFail();
    }
    // The distance is too large - still may be profitable to use masked
    // loads/gathers.
    if (std::abs(*Dist) > NumLanes / 2)
      return LookAheadHeuristics::ScoreMaskedGatherCandidate;
    // This still will detect consecutive loads, but we might have "holes"
    // in some cases. It is ok for non-power-2 vectorization and may produce
    // better results. It should not affect current vectorization.
    return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                       : LookAheadHeuristics::ScoreReversedLoads;
  }

  auto *C1 = dyn_cast<Constant>(V1);
  auto *C2 = dyn_cast<Constant>(V2);
  if (C1 && C2)
    return LookAheadHeuristics::ScoreConstants;

  // Extracts from consecutive indexes of the same vector better score as
  // the extracts could be optimized away.
  Value *EV1;
  ConstantInt *Ex1Idx;
  if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
    // Undefs are always profitable for extractelements.
    // Compiler can easily combine poison and extractelement <non-poison> or
    // undef and extractelement <poison>. But combining undef +
    // extractelement <non-poison-but-may-produce-poison> requires some
    // extra operations.
    if (isa<UndefValue>(V2))
      return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
                 ? LookAheadHeuristics::ScoreConsecutiveExtracts
                 : LookAheadHeuristics::ScoreSameOpcode;
    Value *EV2 = nullptr;
    ConstantInt *Ex2Idx = nullptr;
    if (match(V2,
              m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),
                                                     m_Undef())))) {
      // Undefs are always profitable for extractelements.
      if (!Ex2Idx)
        return LookAheadHeuristics::ScoreConsecutiveExtracts;
      if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
        return LookAheadHeuristics::ScoreConsecutiveExtracts;
      if (EV2 == EV1) {
        int Idx1 = Ex1Idx->getZExtValue();
        int Idx2 = Ex2Idx->getZExtValue();
        int Dist = Idx2 - Idx1;
        // The distance is too large - still may be profitable to use
        // shuffles.
        if (std::abs(Dist) == 0)
          return LookAheadHeuristics::ScoreSplat;
        if (std::abs(Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreSameOpcode;
        return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                          : LookAheadHeuristics::ScoreReversedExtracts;
      }
      return LookAheadHeuristics::ScoreAltOpcodes;
    }
    return CheckSameEntryOrFail();
  }

  auto *I1 = dyn_cast<Instruction>(V1);
  auto *I2 = dyn_cast<Instruction>(V2);
  if (I1 && I2) {
    if (I1->getParent() != I2->getParent())
      return CheckSameEntryOrFail();
    SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
    Ops.push_back(I1);
    Ops.push_back(I2);
    InstructionsState S = getSameOpcode(Ops, TLI);
    // Note: Only consider instructions with <= 2 operands to avoid
    // complexity explosion.
    if (S.getOpcode() &&
        (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
         !S.isAltShuffle()) &&
        all_of(Ops, [&S](Value *V) {
          return cast<Instruction>(V)->getNumOperands() ==
                 S.MainOp->getNumOperands();
        }))
      return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                              : LookAheadHeuristics::ScoreSameOpcode;
  }

  if (isa<UndefValue>(V2))
    return LookAheadHeuristics::ScoreUndef;

  return CheckSameEntryOrFail();
}
1600
/// Go through the operands of \p LHS and \p RHS recursively until
/// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
/// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
/// of \p U1 and \p U2), except at the beginning of the recursion where
/// these are set to nullptr.
///
/// For example:
/// \verbatim
///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
///     \ /         \ /         \ /        \ /
///      +           +           +          +
///     G1          G2          G3         G4
/// \endverbatim
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
/// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
/// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words the score of G1 and G4 is the same as G1 and G2. This
/// heuristic is based on ideas described in:
///   Look-ahead SLP: Auto-vectorization in the presence of commutative
///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
///   Luís F. W. Góes
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                       Instruction *U2, int CurrLevel,
                       ArrayRef<Value *> MainAltOps) const {

  // Get the shallow score of V1 and V2.
  int ShallowScoreAtThisLevel =
      getShallowScore(LHS, RHS, U1, U2, MainAltOps);

  // If reached MaxLevel,
  //  or if V1 and V2 are not instructions,
  //  or if they are SPLAT,
  //  or if they are not consecutive,
  //  or if profitable to vectorize loads or extractelements, early return
  //  the current cost.
  auto *I1 = dyn_cast<Instruction>(LHS);
  auto *I2 = dyn_cast<Instruction>(RHS);
  if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
      ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
      (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
        (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
        (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
       ShallowScoreAtThisLevel))
    return ShallowScoreAtThisLevel;
  assert(I1 && I2 && "Should have early exited.");

  // Contains the I2 operand indexes that got matched with I1 operands.
  SmallSet<unsigned, 4> Op2Used;

  // Recursion towards the operands of I1 and I2. We are trying all possible
  // operand pairs, and keeping track of the best score.
  for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
       OpIdx1 != NumOperands1; ++OpIdx1) {
    // Try to pair op1I with the best operand of I2.
    int MaxTmpScore = 0;
    unsigned MaxOpIdx2 = 0;
    bool FoundBest = false;
    // If I2 is commutative try all combinations.
    unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
    unsigned ToIdx = isCommutative(I2)
                         ? I2->getNumOperands()
                         : std::min(I2->getNumOperands(), OpIdx1 + 1);
    assert(FromIdx <= ToIdx && "Bad index");
    for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
      // Skip operands already paired with OpIdx1.
      if (Op2Used.count(OpIdx2))
        continue;
      // Recursively calculate the cost at each level
      int TmpScore =
          getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                             I1, I2, CurrLevel + 1, std::nullopt);
      // Look for the best score.
      if (TmpScore > LookAheadHeuristics::ScoreFail &&
          TmpScore > MaxTmpScore) {
        MaxTmpScore = TmpScore;
        MaxOpIdx2 = OpIdx2;
        FoundBest = true;
      }
    }
    if (FoundBest) {
      // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
      Op2Used.insert(MaxOpIdx2);
      ShallowScoreAtThisLevel += MaxTmpScore;
    }
  }
  return ShallowScoreAtThisLevel;
}
1693 };
1694 /// A helper data structure to hold the operands of a vector of instructions.
1695 /// This supports a fixed vector length for all operand vectors.
1696 class VLOperands {
1697 /// For each operand we need (i) the value, and (ii) the opcode that it
1698 /// would be attached to if the expression was in a left-linearized form.
1699 /// This is required to avoid illegal operand reordering.
1700 /// For example:
1701 /// \verbatim
1702 /// 0 Op1
1703 /// |/
1704 /// Op1 Op2 Linearized + Op2
1705 /// \ / ----------> |/
1706 /// - -
1707 ///
1708 /// Op1 - Op2 (0 + Op1) - Op2
1709 /// \endverbatim
1710 ///
1711 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1712 ///
1713 /// Another way to think of this is to track all the operations across the
1714 /// path from the operand all the way to the root of the tree and to
1715 /// calculate the operation that corresponds to this path. For example, the
1716 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1717 /// corresponding operation is a '-' (which matches the one in the
1718 /// linearized tree, as shown above).
1719 ///
1720 /// For lack of a better term, we refer to this operation as Accumulated
1721 /// Path Operation (APO).
struct OperandData {
  OperandData() = default;
  OperandData(Value *V, bool APO, bool IsUsed)
      : V(V), APO(APO), IsUsed(IsUsed) {}
  /// The operand value.
  Value *V = nullptr;
  /// TreeEntries only allow a single opcode, or an alternate sequence of
  /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
  /// APO. It is set to 'true' if 'V' is attached to an inverse operation
  /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
  /// (e.g., Add/Mul)
  bool APO = false;
  /// Helper data for the reordering function: marks an operand that has
  /// already been selected for some lane in the current pass.
  bool IsUsed = false;
};
1737
/// During operand reordering, we are trying to select the operand at lane
/// that matches best with the operand at the neighboring lane. Our
/// selection is based on the type of value we are looking for. For example,
/// if the neighboring lane has a load, we need to look for a load that is
/// accessing a consecutive address. These strategies are summarized in the
/// 'ReorderingMode' enumerator.
enum class ReorderingMode {
  Load,     ///< Matching loads to consecutive memory addresses
  Opcode,   ///< Matching instructions based on opcode (same or alternate)
  Constant, ///< Matching constants
  Splat,    ///< Matching the same instruction multiple times (broadcast)
  Failed,   ///< We failed to create a vectorizable group
};
1751
using OperandDataVec = SmallVector<OperandData, 2>;

/// A vector of operand vectors, indexed as OpsVec[OpIdx][Lane].
SmallVector<OperandDataVec, 4> OpsVec;

const TargetLibraryInfo &TLI;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
/// Innermost loop containing the root instructions, if any; used to
/// detect loop-invariant operands.
const Loop *L = nullptr;
1762
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
  return OpsVec[OpIdx][Lane];
}

/// \returns the operand data at \p OpIdx and \p Lane. Const version.
const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
  return OpsVec[OpIdx][Lane];
}

/// Clears the used flag for all entries.
void clearUsed() {
  for (unsigned OpIdx = 0, NumOperands = getNumOperands();
       OpIdx != NumOperands; ++OpIdx)
    for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
         ++Lane)
      OpsVec[OpIdx][Lane].IsUsed = false;
}

/// Swap the operand at \p OpIdx1 with that one at \p OpIdx2, within the
/// same \p Lane.
void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
  std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
}
1786
1787 /// \param Lane lane of the operands under analysis.
1788 /// \param OpIdx operand index in \p Lane lane we're looking the best
1789 /// candidate for.
1790 /// \param Idx operand index of the current candidate value.
1791 /// \returns The additional score due to possible broadcasting of the
1792 /// elements in the lane. It is more profitable to have power-of-2 unique
1793 /// elements in the lane, it will be vectorized with higher probability
1794 /// after removing duplicates. Currently the SLP vectorizer supports only
1795 /// vectorization of the power-of-2 number of unique scalars.
getSplatScore(unsigned Lane,unsigned OpIdx,unsigned Idx) const1796 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1797 Value *IdxLaneV = getData(Idx, Lane).V;
1798 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1799 return 0;
1800 SmallPtrSet<Value *, 4> Uniques;
1801 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1802 if (Ln == Lane)
1803 continue;
1804 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1805 if (!isa<Instruction>(OpIdxLnV))
1806 return 0;
1807 Uniques.insert(OpIdxLnV);
1808 }
1809 int UniquesCount = Uniques.size();
1810 int UniquesCntWithIdxLaneV =
1811 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1812 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1813 int UniquesCntWithOpIdxLaneV =
1814 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1815 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1816 return 0;
1817 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1818 UniquesCntWithOpIdxLaneV) -
1819 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1820 }
1821
1822 /// \param Lane lane of the operands under analysis.
1823 /// \param OpIdx operand index in \p Lane lane we're looking the best
1824 /// candidate for.
1825 /// \param Idx operand index of the current candidate value.
1826 /// \returns The additional score for the scalar which users are all
1827 /// vectorized.
getExternalUseScore(unsigned Lane,unsigned OpIdx,unsigned Idx) const1828 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1829 Value *IdxLaneV = getData(Idx, Lane).V;
1830 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1831 // Do not care about number of uses for vector-like instructions
1832 // (extractelement/extractvalue with constant indices), they are extracts
1833 // themselves and already externally used. Vectorization of such
1834 // instructions does not add extra extractelement instruction, just may
1835 // remove it.
1836 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1837 isVectorLikeInstWithConstOps(OpIdxLaneV))
1838 return LookAheadHeuristics::ScoreAllUserVectorized;
1839 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1840 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1841 return 0;
1842 return R.areAllUsersVectorized(IdxLaneI)
1843 ? LookAheadHeuristics::ScoreAllUserVectorized
1844 : 0;
1845 }
1846
/// Score scaling factor for fully compatible instructions but with
/// different number of external uses. Allows better selection of the
/// instructions with less external uses. The look-ahead score is
/// multiplied by this factor so the external-use bonus acts only as a
/// tie-breaker.
static const int ScoreScaleFactor = 10;
1851
1852 /// \Returns the look-ahead score, which tells us how much the sub-trees
1853 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1854 /// score. This helps break ties in an informed way when we cannot decide on
1855 /// the order of the operands by just considering the immediate
1856 /// predecessors.
getLookAheadScore(Value * LHS,Value * RHS,ArrayRef<Value * > MainAltOps,int Lane,unsigned OpIdx,unsigned Idx,bool & IsUsed)1857 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1858 int Lane, unsigned OpIdx, unsigned Idx,
1859 bool &IsUsed) {
1860 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1861 LookAheadMaxDepth);
1862 // Keep track of the instruction stack as we recurse into the operands
1863 // during the look-ahead score exploration.
1864 int Score =
1865 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1866 /*CurrLevel=*/1, MainAltOps);
1867 if (Score) {
1868 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1869 if (Score <= -SplatScore) {
1870 // Set the minimum score for splat-like sequence to avoid setting
1871 // failed state.
1872 Score = 1;
1873 } else {
1874 Score += SplatScore;
1875 // Scale score to see the difference between different operands
1876 // and similar operands but all vectorized/not all vectorized
1877 // uses. It does not affect actual selection of the best
1878 // compatible operand in general, just allows to select the
1879 // operand with all vectorized uses.
1880 Score *= ScoreScaleFactor;
1881 Score += getExternalUseScore(Lane, OpIdx, Idx);
1882 IsUsed = true;
1883 }
1884 }
1885 return Score;
1886 }
1887
/// Best defined scores per lanes between the passes. Used to choose the
/// best operand (with the highest score) between the passes.
/// The key - {Operand Index, Lane}.
/// The value - the best score between the passes for the lane and the
/// operand.
SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
    BestScoresPerLanes;
1895
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return std::nullopt.
std::optional<unsigned>
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
               ArrayRef<ReorderingMode> ReorderingModes,
               ArrayRef<Value *> MainAltOps) {
  unsigned NumOperands = getNumOperands();

  // The operand of the previous lane at OpIdx.
  Value *OpLastLane = getData(OpIdx, LastLane).V;

  // Our strategy mode for OpIdx.
  ReorderingMode RMode = ReorderingModes[OpIdx];
  if (RMode == ReorderingMode::Failed)
    return std::nullopt;

  // The linearized opcode of the operand at OpIdx, Lane.
  bool OpIdxAPO = getData(OpIdx, Lane).APO;

  // The best operand index and its score.
  // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
  // are using the score to differentiate between the two.
  struct BestOpData {
    std::optional<unsigned> Idx;
    unsigned Score = 0;
  } BestOp;
  // Seed the score with the best result of any previous pass for this
  // {operand, lane} so a later pass only improves on it.
  BestOp.Score =
      BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
          .first->second;

  // Track if the operand must be marked as used. If the operand is set to
  // Score 1 explicitly (because of non power-of-2 unique scalars), we may
  // want to reestimate the operands again on the following iterations.
  bool IsUsed = RMode == ReorderingMode::Splat ||
                RMode == ReorderingMode::Constant ||
                RMode == ReorderingMode::Load;
  // Iterate through all unused operands and look for the best.
  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
    // Get the operand at Idx and Lane.
    OperandData &OpData = getData(Idx, Lane);
    Value *Op = OpData.V;
    bool OpAPO = OpData.APO;

    // Skip already selected operands.
    if (OpData.IsUsed)
      continue;

    // Skip if we are trying to move the operand to a position with a
    // different opcode in the linearized tree form. This would break the
    // semantics.
    if (OpAPO != OpIdxAPO)
      continue;

    // Look for an operand that matches the current mode.
    switch (RMode) {
    case ReorderingMode::Load:
    case ReorderingMode::Opcode: {
      // Present the pair to the look-ahead scorer in source order
      // (left-to-right) relative to the lane we are extending from.
      bool LeftToRight = Lane > LastLane;
      Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
      Value *OpRight = (LeftToRight) ? Op : OpLastLane;
      int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                    OpIdx, Idx, IsUsed);
      // On a tie with a positive score, prefer keeping the operand in its
      // original position (Idx == OpIdx).
      if (Score > static_cast<int>(BestOp.Score) ||
          (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
           Idx == OpIdx)) {
        BestOp.Idx = Idx;
        BestOp.Score = Score;
        BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      }
      break;
    }
    case ReorderingMode::Constant:
      // Loop-invariant (hoistable) values are acceptable stand-ins for
      // constants, but only while no real constant has scored yet.
      if (isa<Constant>(Op) ||
          (!BestOp.Score && L && L->isLoopInvariant(Op))) {
        BestOp.Idx = Idx;
        if (isa<Constant>(Op)) {
          BestOp.Score = LookAheadHeuristics::ScoreConstants;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
              LookAheadHeuristics::ScoreConstants;
        }
        // Undefs and non-constants may still be replaced by a better
        // candidate later, so don't lock them in.
        if (isa<UndefValue>(Op) || !isa<Constant>(Op))
          IsUsed = false;
      }
      break;
    case ReorderingMode::Splat:
      // A constant is a fallback broadcast candidate until the actual
      // splat value is found.
      if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
        IsUsed = Op == OpLastLane;
        if (Op == OpLastLane) {
          BestOp.Score = LookAheadHeuristics::ScoreSplat;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
              LookAheadHeuristics::ScoreSplat;
        }
        BestOp.Idx = Idx;
      }
      break;
    case ReorderingMode::Failed:
      llvm_unreachable("Not expected Failed reordering mode.");
    }
  }

  if (BestOp.Idx) {
    getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return BestOp.Idx;
  }
  // If we could not find a good match return std::nullopt.
  return std::nullopt;
}
2004
/// Helper for reorderOperandVecs.
/// \returns the lane that we should start reordering from. This is the one
/// which has the least number of operands that can freely move about or
/// less profitable because it already has the most optimal set of operands.
unsigned getBestLaneToStartReordering() const {
  unsigned Min = UINT_MAX;
  unsigned SameOpNumber = 0;
  // std::pair<unsigned, unsigned> is used to implement a simple voting
  // algorithm and choose the lane with the least number of operands that
  // can freely move about or less profitable because it already has the
  // most optimal set of operands. The first unsigned is a counter for
  // voting, the second unsigned is the counter of lanes with instructions
  // with same/alternate opcodes and same parent basic block.
  MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
  // Try to be closer to the original results, if we have multiple lanes
  // with same cost. If 2 lanes have the same cost, use the one with the
  // lowest index.
  for (int I = getNumLanes(); I > 0; --I) {
    unsigned Lane = I - 1;
    OperandsOrderData NumFreeOpsHash =
        getMaxNumOperandsThatCanBeReordered(Lane);
    // Compare the number of operands that can move and choose the one with
    // the least number.
    if (NumFreeOpsHash.NumOfAPOs < Min) {
      Min = NumFreeOpsHash.NumOfAPOs;
      SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
      HashMap.clear();
      HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    } else if (NumFreeOpsHash.NumOfAPOs == Min &&
               NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
      // Select the most optimal lane in terms of number of operands that
      // should be moved around.
      SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
      HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    } else if (NumFreeOpsHash.NumOfAPOs == Min &&
               NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
      // Fully tied lane: vote for its operand-ordering hash. Iterating
      // lanes in reverse means the surviving Lane is the lowest index.
      auto *It = HashMap.find(NumFreeOpsHash.Hash);
      if (It == HashMap.end())
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      else
        ++It->second.first;
    }
  }
  // Select the lane with the minimum counter.
  unsigned BestLane = 0;
  unsigned CntMin = UINT_MAX;
  for (const auto &Data : reverse(HashMap)) {
    if (Data.second.first < CntMin) {
      CntMin = Data.second.first;
      BestLane = Data.second.second;
    }
  }
  return BestLane;
}
2059
/// Data structure that helps to reorder operands.
struct OperandsOrderData {
  /// The best number of operands with the same APOs, which can be
  /// reordered.
  unsigned NumOfAPOs = UINT_MAX;
  /// Number of operands with the same/alternate instruction opcode and
  /// parent.
  unsigned NumOpsWithSameOpcodeParent = 0;
  /// Hash for the actual operands ordering.
  /// Used to count operands, actually their position id and opcode
  /// value. It is used in the voting mechanism to find the lane with the
  /// least number of operands that can freely move about or less profitable
  /// because it already has the most optimal set of operands. Can be
  /// replaced with SmallVector<unsigned> instead but hash code is faster
  /// and requires less memory.
  unsigned Hash = 0;
};
/// \returns the maximum number of operands that are allowed to be reordered
/// for \p Lane and the number of compatible instructions(with the same
/// parent/opcode). This is used as a heuristic for selecting the first lane
/// to start operand reordering.
OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
  unsigned CntTrue = 0;
  unsigned NumOperands = getNumOperands();
  // Operands with the same APO can be reordered. We therefore need to count
  // how many of them we have for each APO, like this: Cnt[APO] = x.
  // Since we only have two APOs, namely true and false, we can avoid using
  // a map. Instead we can simply count the number of operands that
  // correspond to one of them (in this case the 'true' APO), and calculate
  // the other by subtracting it from the total number of operands.
  // Operands with the same instruction opcode and parent are more
  // profitable since we don't need to move them in many cases, with a high
  // probability such lane already can be vectorized effectively.
  bool AllUndefs = true;
  unsigned NumOpsWithSameOpcodeParent = 0;
  Instruction *OpcodeI = nullptr;
  BasicBlock *Parent = nullptr;
  unsigned Hash = 0;
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    const OperandData &OpData = getData(OpIdx, Lane);
    if (OpData.APO)
      ++CntTrue;
    // Use Boyer-Moore majority voting for finding the majority opcode and
    // the number of times it occurs.
    if (auto *I = dyn_cast<Instruction>(OpData.V)) {
      if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
          I->getParent() != Parent) {
        if (NumOpsWithSameOpcodeParent == 0) {
          // New candidate for the majority element.
          NumOpsWithSameOpcodeParent = 1;
          OpcodeI = I;
          Parent = I->getParent();
        } else {
          --NumOpsWithSameOpcodeParent;
        }
      } else {
        ++NumOpsWithSameOpcodeParent;
      }
    }
    // Fold the operand's position and value kind into the ordering hash.
    Hash = hash_combine(
        Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
    AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
  }
  // An all-undef lane carries no reordering information.
  if (AllUndefs)
    return {};
  OperandsOrderData Data;
  Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
  Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
  Data.Hash = Hash;
  return Data;
}
2130
2131 /// Go through the instructions in VL and append their operands.
appendOperandsOfVL(ArrayRef<Value * > VL)2132 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133 assert(!VL.empty() && "Bad VL");
2134 assert((empty() || VL.size() == getNumLanes()) &&
2135 "Expected same number of lanes");
2136 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2138 constexpr unsigned IntrinsicNumOperands = 2;
2139 if (isa<IntrinsicInst>(VL[0]))
2140 NumOperands = IntrinsicNumOperands;
2141 OpsVec.resize(NumOperands);
2142 unsigned NumLanes = VL.size();
2143 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144 OpsVec[OpIdx].resize(NumLanes);
2145 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147 // Our tree has just 3 nodes: the root and two operands.
2148 // It is therefore trivial to get the APO. We only need to check the
2149 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150 // RHS operand. The LHS operand of both add and sub is never attached
2151 // to an inversese operation in the linearized form, therefore its APO
2152 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2153
2154 // Since operand reordering is performed on groups of commutative
2155 // operations or alternating sequences (e.g., +, -), we can safely
2156 // tell the inverse operations by checking commutativity.
2157 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2158 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2160 APO, false};
2161 }
2162 }
2163 }
2164
/// \returns the number of operands.
unsigned getNumOperands() const { return OpsVec.size(); }

/// \returns the number of lanes. Note: valid only when OpsVec is
/// non-empty (indexes the first operand vector).
unsigned getNumLanes() const { return OpsVec[0].size(); }

/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
  return getData(OpIdx, Lane).V;
}

/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }

/// Clears the data.
void clear() { OpsVec.clear(); }
2181
/// \Returns true if there are enough operands identical to \p Op to fill
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
  bool OpAPO = getData(OpIdx, Lane).APO;
  bool IsInvariant = L && L->isLoopInvariant(Op);
  // Number of lanes (other than \p Lane) where \p Op itself was found.
  unsigned Cnt = 0;
  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
    if (Ln == Lane)
      continue;
    // This is set to true if we found a candidate for broadcast at Lane.
    bool FoundCandidate = false;
    for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
      OperandData &Data = getData(OpI, Ln);
      if (Data.APO != OpAPO || Data.IsUsed)
        continue;
      Value *OpILane = getValue(OpI, Lane);
      bool IsConstantOp = isa<Constant>(OpILane);
      // Consider the broadcast candidate if:
      // 1. Same value is found in one of the operands.
      if (Data.V == Op ||
          // 2. The operand in the given lane is not constant but there is a
          // constant operand in another lane (which can be moved to the
          // given lane). In this case we can represent it as a simple
          // permutation of constant and broadcast.
          (!IsConstantOp &&
           ((Lns > 2 && isa<Constant>(Data.V)) ||
            // 2.1. If we have only 2 lanes, need to check that value in the
            // next lane does not build same opcode sequence.
            (Lns == 2 &&
             !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
                  .getOpcode() &&
             isa<Constant>(Data.V)))) ||
          // 3. The operand in the current lane is loop invariant (can be
          // hoisted out) and another operand is also a loop invariant
          // (though not a constant). In this case the whole vector can be
          // hoisted out.
          // FIXME: need to teach the cost model about this case for better
          // estimation.
          (IsInvariant && !isa<Constant>(Data.V) &&
           !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
           L->isLoopInvariant(Data.V))) {
        FoundCandidate = true;
        // Only exact matches of \p Op are consumed (marked used).
        Data.IsUsed = Data.V == Op;
        if (Data.V == Op)
          ++Cnt;
        break;
      }
    }
    if (!FoundCandidate)
      return false;
  }
  // With more than 2 lanes, require the exact value in at least 2 lanes.
  return getNumLanes() == 2 || Cnt > 1;
}
2236
2237 /// Checks if there is at least single compatible operand in lanes other
2238 /// than \p Lane, compatible with the operand \p Op.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      // Only operands with the same APO flag (see OperandData) as Op's slot
      // may be swapped into Op's position.
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // A lane counts as having a compatible operand if any of its slots
        // either is already used / has a different APO (treated as
        // non-blocking here), or is loop invariant, or shares an opcode with
        // Op and lives in the same basic block.
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return true;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
                      Op->getParent() == cast<Instruction>(OpILn)->getParent());
            }))
          return true;
      }
      // No other lane offers a compatible operand.
      return false;
    }
2257
2258 public:
2259 /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          // Cache the innermost loop containing the roots' block (null if the
          // roots are not inside any loop).
          L(R.LI->getLoopFor(
              (cast<Instruction>(RootVL.front())->getParent()))) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }
2267
    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
getVL(unsigned OpIdx) const2270 ValueList getVL(unsigned OpIdx) const {
2271 ValueList OpVL(OpsVec[OpIdx].size());
2272 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2273 "Expected same num of lanes across all operands");
2274 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2275 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2276 return OpVL;
2277 }
2278
2279 // Performs operand reordering for 2 or more operands.
2280 // The original operands are in OrigOps[OpIdx][Lane].
2281 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us select
      // the instructions for each lane, so that they match best with the ones
      // we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the lane
      // that has operands that can move the least. For example, given the
      // following lanes:
      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
      // we will start at Lane 1, since the operands of the subtraction cannot
      // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          // NOTE: This should be unreachable.
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }

      // Check that we don't have same operands. No need to reorder if operands
      // are just perfect diamond or shuffled diamond match. Do not do it only
      // for possible broadcasts or non-power of 2 number of scalars (just for
      // now).
      // Returns true iff every operand row draws all its values from the set
      // of values used by the first row (perfect or shuffled diamond).
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
      };

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering for
      // the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if no need to reorder operands since they're perfect or
        // shuffled diamond match.
        // Need to do it to avoid extra external use cost counting for
        // shuffled matches, which may cause regressions.
        if (SkipReordering())
          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder the
        // rest of the lanes. We are visiting the nodes in a circular fashion,
        // using FirstLane as the center point and increasing the radius
        // distance.
        // MainAltOps[OpIdx] records the main (and, once discovered, the
        // alternate) opcode representatives seen so far for operand OpIdx.
        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
        for (unsigned I = 0; I < NumOperands; ++I)
          MainAltOps[I].push_back(getData(I, FirstLane).V);

        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            // The neighboring, already-processed lane we try to match against.
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that matches SortedOps[OpIdx][Lane-1].
              std::optional<unsigned> BestIdx = getBestOperand(
                  OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
              // By not selecting a value, we allow the operands that follow to
              // select a better matching value. We will get a non-null value in
              // the next run of getBestOperand().
              if (BestIdx) {
                // Swap the current operand with the one returned by
                // getBestOperand().
                swap(OpIdx, *BestIdx, Lane);
              } else {
                // Enable the second pass.
                StrategyFailed = true;
              }
              // Try to get the alternate opcode and follow it during analysis.
              if (MainAltOps[OpIdx].size() != 2) {
                OperandData &AltOp = getData(OpIdx, Lane);
                InstructionsState OpS =
                    getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
                if (OpS.getOpcode() && OpS.isAltShuffle())
                  MainAltOps[OpIdx].push_back(AltOp.V);
              }
            }
          }
        }
        // Skip second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }
2414
2415 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
getModeStr(ReorderingMode RMode)2416 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2417 switch (RMode) {
2418 case ReorderingMode::Load:
2419 return "Load";
2420 case ReorderingMode::Opcode:
2421 return "Opcode";
2422 case ReorderingMode::Constant:
2423 return "Constant";
2424 case ReorderingMode::Splat:
2425 return "Splat";
2426 case ReorderingMode::Failed:
2427 return "Failed";
2428 }
2429 llvm_unreachable("Unimplemented Reordering Type");
2430 }
2431
printMode(ReorderingMode RMode,raw_ostream & OS)2432 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2433 raw_ostream &OS) {
2434 return OS << getModeStr(RMode);
2435 }
2436
2437 /// Debug print.
dumpMode(ReorderingMode RMode)2438 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2439 printMode(RMode, dbgs());
2440 }
2441
operator <<(raw_ostream & OS,ReorderingMode RMode)2442 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2443 return printMode(RMode, OS);
2444 }
2445
print(raw_ostream & OS) const2446 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2447 const unsigned Indent = 2;
2448 unsigned Cnt = 0;
2449 for (const OperandDataVec &OpDataVec : OpsVec) {
2450 OS << "Operand " << Cnt++ << "\n";
2451 for (const OperandData &OpData : OpDataVec) {
2452 OS.indent(Indent) << "{";
2453 if (Value *V = OpData.V)
2454 OS << *V;
2455 else
2456 OS << "null";
2457 OS << ", APO:" << OpData.APO << "}\n";
2458 }
2459 OS << "\n";
2460 }
2461 return OS;
2462 }
2463
    /// Debug print of all operand rows to the debug stream.
    LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2466 #endif
2467 };
2468
2469 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2470 /// for a pair which have highest score deemed to have best chance to form
2471 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2472 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2473 /// of the cost, considered to be good enough score.
2474 std::optional<int>
findBestRootPair(ArrayRef<std::pair<Value *,Value * >> Candidates,int Limit=LookAheadHeuristics::ScoreFail) const2475 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2476 int Limit = LookAheadHeuristics::ScoreFail) const {
2477 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2478 RootLookAheadMaxDepth);
2479 int BestScore = Limit;
2480 std::optional<int> Index;
2481 for (int I : seq<int>(0, Candidates.size())) {
2482 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2483 Candidates[I].second,
2484 /*U1=*/nullptr, /*U2=*/nullptr,
2485 /*Level=*/1, std::nullopt);
2486 if (Score > BestScore) {
2487 BestScore = Score;
2488 Index = I;
2489 }
2490 }
2491 return Index;
2492 }
2493
2494 /// Checks if the instruction is marked for deletion.
isDeleted(Instruction * I) const2495 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2496
2497 /// Removes an instruction from its block and eventually deletes it.
2498 /// It's like Instruction::eraseFromParent() except that the actual deletion
2499 /// is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    // Deletion is deferred: the instruction remains in the IR until BoUpSLP
    // is destructed, so pointers to it stay valid during vectorization.
    DeletedInstructions.insert(I);
  }
2503
2504 /// Remove instructions from the parent function and clear the operands of \p
2505 /// DeadVals instructions, marking for deletion trivially dead operands.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    // Worklist of operands that may become trivially dead once the values in
    // DeadVals drop their references.
    SmallVector<WeakTrackingVH> DeadInsts;
    // Mark every dead value as deleted up front so the checks below already
    // treat them as removed.
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      // Collect the tree entries I belongs to (including multi-node entries),
      // so we never enqueue an operand that is an entry's vectorized value.
      SmallVector<const TreeEntry *> Entries;
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      // An operand is a deletion candidate if it is not already deleted, I is
      // its only user (hasOneUser and I uses it), it would be trivially dead,
      // and it is not the vectorized value of one of I's entries.
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    // Detach the dead values from their blocks (actual deletion stays
    // deferred via DeletedInstructions) and make SCEV forget them.
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() || all_of(I->uses(),
                                       [&](Use &U) {
                                         return isDeleted(
                                             cast<Instruction>(U.getUser()));
                                       })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      // The WeakTrackingVH may have been cleared, or the instruction already
      // detached; skip those.
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }
2588
2589 /// Checks if the instruction was already analyzed for being possible
2590 /// reduction root.
isAnalyzedReductionRoot(Instruction * I) const2591 bool isAnalyzedReductionRoot(Instruction *I) const {
2592 return AnalyzedReductionsRoots.count(I);
2593 }
2594 /// Register given instruction as already analyzed for being possible
2595 /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    // Memoized; queried by isAnalyzedReductionRoot() to avoid re-analysis.
    AnalyzedReductionsRoots.insert(I);
  }
2599 /// Checks if the provided list of reduced values was checked already for
2600 /// vectorization.
areAnalyzedReductionVals(ArrayRef<Value * > VL) const2601 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2602 return AnalyzedReductionVals.contains(hash_value(VL));
2603 }
2604 /// Adds the list of reduced values to list of already checked values for the
2605 /// vectorization.
analyzedReductionVals(ArrayRef<Value * > VL)2606 void analyzedReductionVals(ArrayRef<Value *> VL) {
2607 AnalyzedReductionVals.insert(hash_value(VL));
2608 }
  /// Clear all memoized reduction-analysis state: analyzed roots, analyzed
  /// value-list hashes, and min-bitwidth results.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
2615 /// Checks if the given value is gathered in one of the nodes.
isAnyGathered(const SmallDenseSet<Value * > & Vals) const2616 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2618 }
  /// Checks if the given value must be gathered (is in the MustGather set).
  bool isGathered(const Value *V) const {
    return MustGather.contains(V);
  }
  /// Checks if the specified value was not scheduled (is in the
  /// NonScheduledFirst set).
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
2627
2628 /// Check if the value is vectorized in the tree.
isVectorized(Value * V) const2629 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630
2631 ~BoUpSLP();
2632
2633 private:
2634 /// Determine if a node \p E in can be demoted to a smaller type with a
2635 /// truncation. We collect the entries that will be demoted in ToDemote.
2636 /// \param E Node for analysis
2637 /// \param ToDemote indices of the nodes to be demoted.
2638 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639 unsigned &BitWidth,
2640 SmallVectorImpl<unsigned> &ToDemote,
2641 DenseSet<const TreeEntry *> &Visited,
2642 unsigned &MaxDepthLevel,
2643 bool &IsProfitableToDemote,
2644 bool IsTruncRoot) const;
2645
2646 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2647 /// reordering (i.e. the operands can be reordered because they have only one
  /// user and reorderable).
2649 /// \param ReorderableGathers List of all gather nodes that require reordering
2650 /// (e.g., gather of extractlements or partially vectorizable loads).
2651 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652 /// reordering, subset of \p NonVectorized.
2653 bool
2654 canReorderOperands(TreeEntry *UserTE,
2655 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656 ArrayRef<TreeEntry *> ReorderableGathers,
2657 SmallVectorImpl<TreeEntry *> &GatherOps);
2658
2659 /// Checks if the given \p TE is a gather node with clustered reused scalars
2660 /// and reorders it per given \p Mask.
2661 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662
2663 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664 /// if any. If it is not vectorized (gather node), returns nullptr.
getVectorizedOperand(TreeEntry * UserTE,unsigned OpIdx)2665 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667 TreeEntry *TE = nullptr;
2668 const auto *It = find_if(VL, [&](Value *V) {
2669 TE = getTreeEntry(V);
2670 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2671 return true;
2672 auto It = MultiNodeScalars.find(V);
2673 if (It != MultiNodeScalars.end()) {
2674 for (TreeEntry *E : It->second) {
2675 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2676 TE = E;
2677 return true;
2678 }
2679 }
2680 }
2681 return false;
2682 });
2683 if (It != VL.end()) {
2684 assert(TE->isSame(VL) && "Expected same scalars.");
2685 return TE;
2686 }
2687 return nullptr;
2688 }
2689
2690 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691 /// if any. If it is not vectorized (gather node), returns nullptr.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    // Delegate to the non-const overload; it only searches the graph and does
    // not mutate it, so the const_casts are safe.
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }
2697
2698 /// Checks if all users of \p I are the part of the vectorization tree.
2699 bool areAllUsersVectorized(
2700 Instruction *I,
2701 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702
2703 /// Return information about the vector formed for the specified index
2704 /// of a vector of (the same) instruction.
2705 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706
  /// \returns the graph entry for the \p Idx operand of the \p E entry.
2708 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709
2710 /// \returns Cast context for the given graph node.
2711 TargetTransformInfo::CastContextHint
2712 getCastContextHint(const TreeEntry &TE) const;
2713
2714 /// \returns the cost of the vectorizable entry.
2715 InstructionCost getEntryCost(const TreeEntry *E,
2716 ArrayRef<Value *> VectorizedVals,
2717 SmallPtrSetImpl<Value *> &CheckedExtracts);
2718
2719 /// This is the recursive part of buildTree.
2720 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721 const EdgeInfo &EI);
2722
2723 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726 /// returns false, setting \p CurrentOrder to either an empty vector or a
2727 /// non-identity permutation that allows to reuse extract instructions.
2728 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2729 /// extract order.
2730 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731 SmallVectorImpl<unsigned> &CurrentOrder,
2732 bool ResizeAllowed = false) const;
2733
2734 /// Vectorize a single entry in the tree.
2735 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2736 /// avoid issues with def-use order.
2737 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738
2739 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2740 /// \p E.
2741 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2742 /// avoid issues with def-use order.
2743 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744
2745 /// Create a new vector from a list of scalar values. Produces a sequence
2746 /// which exploits values reused across lanes, and arranges the inserts
2747 /// for ease of later optimization.
2748 template <typename BVTy, typename ResTy, typename... Args>
2749 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750
2751 /// Create a new vector from a list of scalar values. Produces a sequence
2752 /// which exploits values reused across lanes, and arranges the inserts
2753 /// for ease of later optimization.
2754 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755
2756 /// Returns the instruction in the bundle, which can be used as a base point
2757 /// for scheduling. Usually it is the last instruction in the bundle, except
2758 /// for the case when all operands are external (in this case, it is the first
2759 /// instruction in the list).
2760 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2761
2762 /// Tries to find extractelement instructions with constant indices from fixed
2763 /// vector type and gather such instructions into a bunch, which highly likely
2764 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2765 /// was successful, the matched scalars are replaced by poison values in \p VL
2766 /// for future analysis.
2767 std::optional<TargetTransformInfo::ShuffleKind>
2768 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2769 SmallVectorImpl<int> &Mask) const;
2770
2771 /// Tries to find extractelement instructions with constant indices from fixed
2772 /// vector type and gather such instructions into a bunch, which highly likely
2773 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2774 /// was successful, the matched scalars are replaced by poison values in \p VL
2775 /// for future analysis.
2776 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2777 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2778 SmallVectorImpl<int> &Mask,
2779 unsigned NumParts) const;
2780
2781 /// Checks if the gathered \p VL can be represented as a single register
2782 /// shuffle(s) of previous tree entries.
2783 /// \param TE Tree entry checked for permutation.
2784 /// \param VL List of scalars (a subset of the TE scalar), checked for
2785 /// permutations. Must form single-register vector.
2786 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2787 /// commands to build the mask using the original vector value, without
2788 /// relying on the potential reordering.
2789 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2790 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2791 std::optional<TargetTransformInfo::ShuffleKind>
2792 isGatherShuffledSingleRegisterEntry(
2793 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2794 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2795 bool ForOrder);
2796
2797 /// Checks if the gathered \p VL can be represented as multi-register
2798 /// shuffle(s) of previous tree entries.
2799 /// \param TE Tree entry checked for permutation.
2800 /// \param VL List of scalars (a subset of the TE scalar), checked for
2801 /// permutations.
2802 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2803 /// commands to build the mask using the original vector value, without
2804 /// relying on the potential reordering.
2805 /// \returns per-register series of ShuffleKind, if gathered values can be
2806 /// represented as shuffles of previous tree entries. \p Mask is filled with
2807 /// the shuffle mask (also on per-register base).
2808 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2809 isGatherShuffledEntry(
2810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2812 unsigned NumParts, bool ForOrder = false);
2813
2814 /// \returns the scalarization cost for this list of values. Assuming that
2815 /// this subtree gets vectorized, we may need to extract the values from the
2816 /// roots. This method calculates the cost of extracting the values.
2817 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2818 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2819 Type *ScalarTy) const;
2820
2821 /// Set the Builder insert point to one after the last instruction in
2822 /// the bundle
2823 void setInsertPointAfterBundle(const TreeEntry *E);
2824
2825 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2826 /// specified, the starting vector value is poison.
2827 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2828
2829 /// \returns whether the VectorizableTree is fully vectorizable and will
2830 /// be beneficial even the tree height is tiny.
2831 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2832
2833 /// Reorder commutative or alt operands to get better probability of
2834 /// generating vectorized code.
2835 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2836 SmallVectorImpl<Value *> &Left,
2837 SmallVectorImpl<Value *> &Right,
2838 const BoUpSLP &R);
2839
2840 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2841 /// users of \p TE and collects the stores. It returns the map from the store
2842 /// pointers to the collected stores.
2843 DenseMap<Value *, SmallVector<StoreInst *>>
2844 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2845
2846 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2847 /// stores in \p StoresVec can form a vector instruction. If so it returns
2848 /// true and populates \p ReorderIndices with the shuffle indices of the
2849 /// stores when compared to the sorted vector.
2850 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2851 OrdersType &ReorderIndices) const;
2852
2853 /// Iterates through the users of \p TE, looking for scalar stores that can be
2854 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2855 /// their order and builds an order index vector for each store bundle. It
2856 /// returns all these order vectors found.
2857 /// We run this after the tree has formed, otherwise we may come across user
2858 /// instructions that are not yet in the tree.
2859 SmallVector<OrdersType, 1>
2860 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2861
2862 struct TreeEntry {
2863 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    /// Creates an entry that keeps a back-reference to the owning
    /// VectorizableTree container (used by graph traversal; see \a Container).
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
2865
2866 /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      SmallVector<int> Mask;
      // Turn the reordering into a shuffle mask, then fold the reused-scalars
      // mask on top so a single mask describes both transformations.
      inversePermutation(ReorderIndices, Mask);
      ::addMask(Mask, ReuseShuffleIndices);
      return Mask;
    }
2873
2874 /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      // Compares VL against Scalars remapped through Mask: a lane matches if
      // the mask redirects it to an equal scalar, or if the VL element is
      // undef and the mask element is poison. As a special case, if the sizes
      // differ but VL matches Scalars element-wise, that also counts.
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered, still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          // Account for reused scalars on top of the reordering.
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }
2903
    /// \returns true if this is a gather node whose first user edge matches
    /// \p UserEI (same user entry and same operand index).
    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }
2908
2909 /// \returns true if current entry has same operands as \p TE.
hasEqualOperandsllvm::slpvectorizer::BoUpSLP::TreeEntry2910 bool hasEqualOperands(const TreeEntry &TE) const {
2911 if (TE.getNumOperands() != getNumOperands())
2912 return false;
2913 SmallBitVector Used(getNumOperands());
2914 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2915 unsigned PrevCount = Used.count();
2916 for (unsigned K = 0; K < E; ++K) {
2917 if (Used.test(K))
2918 continue;
2919 if (getOperand(K) == TE.getOperand(I)) {
2920 Used.set(K);
2921 break;
2922 }
2923 }
2924 // Check if we actually found the matching operand.
2925 if (PrevCount == Used.count())
2926 return false;
2927 }
2928 return true;
2929 }
2930
2931 /// \return Final vectorization factor for the node. Defined by the total
2932 /// number of vectorized scalars, including those, used several times in the
2933 /// entry and counted in the \a ReuseShuffleIndices, if any.
getVectorFactorllvm::slpvectorizer::BoUpSLP::TreeEntry2934 unsigned getVectorFactor() const {
2935 if (!ReuseShuffleIndices.empty())
2936 return ReuseShuffleIndices.size();
2937 return Scalars.size();
2938 };
2939
2940 /// Checks if the current node is a gather node.
isGatherllvm::slpvectorizer::BoUpSLP::TreeEntry2941 bool isGather() const {return State == NeedToGather; }
2942
    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// New vector phi instructions emitted for the vectorized phi nodes.
    PHINode *PHI = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState {
      Vectorize,
      ScatterVectorize,
      StridedVectorize,
      NeedToGather
    };
    /// The vectorization state of this entry (see EntryState).
    EntryState State;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry. We can actually
    /// have multiple users so the data structure is not truly a tree.
    SmallVector<EdgeInfo, 1> UserTreeIndices;

    /// The index of this treeEntry in VectorizableTree.
    int Idx = -1;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    /// Note: This helps avoid the replication of the code that performs the
    /// reordering of operands during buildTree_rec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

    /// The main/alternate instruction. MainOp and AltOp differ only for
    /// alternate-shuffle nodes (see isAltShuffle()).
    Instruction *MainOp = nullptr;
    Instruction *AltOp = nullptr;

  public:
2995 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
setOperandllvm::slpvectorizer::BoUpSLP::TreeEntry2996 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2997 if (Operands.size() < OpIdx + 1)
2998 Operands.resize(OpIdx + 1);
2999 assert(Operands[OpIdx].empty() && "Already resized?");
3000 assert(OpVL.size() <= Scalars.size() &&
3001 "Number of operands is greater than the number of scalars.");
3002 Operands[OpIdx].resize(OpVL.size());
3003 copy(OpVL, Operands[OpIdx].begin());
3004 }
3005
3006 /// Set the operands of this bundle in their original order.
setOperandsInOrderllvm::slpvectorizer::BoUpSLP::TreeEntry3007 void setOperandsInOrder() {
3008 assert(Operands.empty() && "Already initialized?");
3009 auto *I0 = cast<Instruction>(Scalars[0]);
3010 Operands.resize(I0->getNumOperands());
3011 unsigned NumLanes = Scalars.size();
3012 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3013 OpIdx != NumOperands; ++OpIdx) {
3014 Operands[OpIdx].resize(NumLanes);
3015 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3016 auto *I = cast<Instruction>(Scalars[Lane]);
3017 assert(I->getNumOperands() == NumOperands &&
3018 "Expected same number of operands");
3019 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3020 }
3021 }
3022 }
3023
3024 /// Reorders operands of the node to the given mask \p Mask.
reorderOperandsllvm::slpvectorizer::BoUpSLP::TreeEntry3025 void reorderOperands(ArrayRef<int> Mask) {
3026 for (ValueList &Operand : Operands)
3027 reorderScalars(Operand, Mask);
3028 }
3029
    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry (const overload).
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \return the single \p OpIdx operand, i.e. the operand value of the
    /// first lane.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }
3051
    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    /// \returns true if \p I has the same opcode as either the main or the
    /// alternate instruction of this entry.
    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// \returns \p Op itself if it is an instruction with the main or the
    /// alternate opcode of this entry; otherwise returns the main instruction.
    /// Used to choose the key for scheduling data.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }
3070
    /// Set the main/alternate instructions of the node from \p S.
    void setOperations(const InstructionsState &S) {
      MainOp = S.MainOp;
      AltOp = S.AltOp;
    }

    /// \returns the main instruction of the node.
    Instruction *getMainOp() const {
      return MainOp;
    }

    /// \returns the alternate instruction of the node.
    Instruction *getAltOp() const {
      return AltOp;
    }

    /// The main/alternate opcodes for the list of instructions.
    /// \returns 0 if the main instruction is not set.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    /// \returns the alternate opcode, or 0 if the alternate instruction is not
    /// set.
    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }
3092
3093 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
3094 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
findLaneForValuellvm::slpvectorizer::BoUpSLP::TreeEntry3095 int findLaneForValue(Value *V) const {
3096 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3097 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3098 if (!ReorderIndices.empty())
3099 FoundLane = ReorderIndices[FoundLane];
3100 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3101 if (!ReuseShuffleIndices.empty()) {
3102 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3103 find(ReuseShuffleIndices, FoundLane));
3104 }
3105 return FoundLane;
3106 }
3107
    /// Build a shuffle mask for graph entry which represents a merge of main
    /// and alternate operations. \p IsAltOp decides, per scalar, whether the
    /// scalar belongs to the alternate operation; \p OpScalars/\p AltScalars,
    /// when non-null, receive the scalars of each group.
    void
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3115
3116 /// Return true if this is a non-power-of-2 node.
isNonPowOf2Vecllvm::slpvectorizer::BoUpSLP::TreeEntry3117 bool isNonPowOf2Vec() const {
3118 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3119 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3120 "Reshuffling not supported with non-power-of-2 vectors yet.");
3121 return IsNonPowerOf2;
3122 }
3123
#ifndef NDEBUG
    /// Debug printer. Dumps the entry index, operands, scalars, state,
    /// main/alternate instructions, vectorized value and masks.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        dbgs() << "Vectorize\n";
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
    }
#endif
3183 };
3184
3185 #ifndef NDEBUG
dumpTreeCosts(const TreeEntry * E,InstructionCost ReuseShuffleCost,InstructionCost VecCost,InstructionCost ScalarCost,StringRef Banner) const3186 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3187 InstructionCost VecCost, InstructionCost ScalarCost,
3188 StringRef Banner) const {
3189 dbgs() << "SLP: " << Banner << ":\n";
3190 E->dump();
3191 dbgs() << "SLP: Costs:\n";
3192 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3193 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3194 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3195 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3196 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3197 }
3198 #endif
3199
3200 /// Create a new VectorizableTree entry.
newTreeEntry(ArrayRef<Value * > VL,std::optional<ScheduleData * > Bundle,const InstructionsState & S,const EdgeInfo & UserTreeIdx,ArrayRef<int> ReuseShuffleIndices=std::nullopt,ArrayRef<unsigned> ReorderIndices=std::nullopt)3201 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3202 std::optional<ScheduleData *> Bundle,
3203 const InstructionsState &S,
3204 const EdgeInfo &UserTreeIdx,
3205 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3206 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3207 TreeEntry::EntryState EntryState =
3208 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3209 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3210 ReuseShuffleIndices, ReorderIndices);
3211 }
3212
  /// Create a new VectorizableTree entry with the explicitly given
  /// \p EntryState, register its scalars, and wire up the scheduler bundle
  /// and user edges.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask. Out-of-range mask positions
      // become undef lanes.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      // NOTE: deliberately shadows the parameter S: the opcode state must be
      // recomputed for the reordered scalar list.
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (!Last->isGather()) {
      // Vectorized node: map every scalar to this entry (or record it as a
      // multi-node scalar if it already belongs to another entry).
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
        assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
               "Scalar already in tree!");
        if (TE) {
          if (TE != Last)
            MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
          continue;
        }
        ScalarToTreeEntry[V] = Last;
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
              isVectorLikeInstWithConstOps(S.MainOp) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          // Scalars that need no scheduling have no bundle member.
          if (doesNotNeedToBeScheduled(V))
            continue;
          if (!BundleMember)
            continue;
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");
    }
    return Last;
  }
3296
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;
3300
3301 #ifndef NDEBUG
3302 /// Debug printer.
dumpVectorizableTree() const3303 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3304 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3305 VectorizableTree[Id]->dump();
3306 dbgs() << "\n";
3307 }
3308 }
3309 #endif
3310
  /// \returns the tree entry the scalar \p V was vectorized into, or nullptr.
  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  /// \returns the tree entry the scalar \p V was vectorized into, or nullptr
  /// (const overload).
  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }
3316
  /// Check that the operand node of alternate node does not generate
  /// buildvector sequence. If it is, then probably not worth it to build
  /// alternate shuffle, if number of buildvector operands + alternate
  /// instruction > than the number of buildvector instructions.
  /// \param S the instructions state of the analyzed values.
  /// \param VL list of the instructions with alternate opcodes.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState getScalarsVectorizationState(
      InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;

  /// List of scalars, used in several vectorize nodes, and the list of the
  /// nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;

  /// Maps a value to the proposed vectorizable size.
  SmallDenseMap<Value *, unsigned> InstrElementSize;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// A set of first non-schedulable values.
  ValueSet NonScheduledFirst;

  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions. So, we cannot rely directly on the last instruction in the
  /// bundle being the last instruction in the program order during
  /// vectorization process since the basic blocks are affected, need to
  /// pre-gather them before.
  DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;

  /// List of gather nodes, depending on other gather/vector nodes, which should
  /// be emitted after the vector instruction emission process to correctly
  /// handle order of the vector instructions and shuffles.
  SetVector<const TreeEntry *> PostponedGathers;

  /// Maps a gathered scalar to the set of gather nodes that use it.
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;
3364
  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}

    // Which scalar in our function.
    Value *Scalar;

    // Which user that uses the scalar.
    llvm::User *User;

    // Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;
3380
3381 /// Checks if two instructions may access the same memory.
3382 ///
3383 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384 /// is invariant in the calling loop.
isAliased(const MemoryLocation & Loc1,Instruction * Inst1,Instruction * Inst2)3385 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386 Instruction *Inst2) {
3387 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388 return true;
3389 // First check if the result is already in the cache.
3390 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391 auto It = AliasCache.find(Key);
3392 if (It != AliasCache.end())
3393 return It->second;
3394 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395 // Store the result in the cache.
3396 AliasCache.try_emplace(Key, Aliased);
3397 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398 return Aliased;
3399 }
3400
  /// Key for the alias cache: an (ordered) pair of instructions.
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, bool> AliasCache;

  // Cache for pointerMayBeCaptured calls inside AA. This is preserved
  // globally through SLP because we don't perform any action which
  // invalidates capture results.
  BatchAAResults BatchAA;

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed. The deferral is required to
  /// ensure that there are no incorrect collisions in the AliasCache, which
  /// can happen if a new instruction is allocated at the same address as a
  /// previously deleted instruction.
  DenseSet<Instruction *> DeletedInstructions;

  /// Set of the instruction, being analyzed already for reductions.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Set of hashes for the list of reduction values already being analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values, already been analyzed for minimal bitwidth and found to be
  /// non-profitable.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of values that need to extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, it means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
  SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered, shuffle instructions and
  /// extractelements.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// A list of blocks that we are going to CSE.
  DenseSet<BasicBlock *> CSEBlocks;
3448
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    /// (Re)initialize this node for the scheduling region
    /// \p BlockSchedulingRegionID, keyed by \p OpVal.
    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      OpValue = OpVal;
      TE = nullptr;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        // Only bundle heads are marked scheduled; all members must have
        // fully satisfied dependencies.
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// \returns the total number of unscheduled dependencies over all bundle
    /// members, or InvalidDeps if any member has uncalculated dependencies.
    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    /// Prints this node (and, for bundle heads, the whole bundle) to \p os.
    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// Opcode of the current instruction in the schedule data.
    Value *OpValue = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on. Allowing such nodes to be scheduled below this one could introduce
    /// a runtime fault which didn't exist in the original program.
    /// ex: this is a load or udiv following a readonly call which inf loops
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies. Constitutes of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled = false;
  };
3624
#ifndef NDEBUG
  /// Stream insertion for ScheduleData; forwards to ScheduleData::dump().
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif

  friend struct GraphTraits<BoUpSLP *>;
  friend struct DOTGraphTraits<BoUpSLP *>;
3635
3636 /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, or arguments, or
  /// phis, or instructions from other blocks, or whose users are phis or from
  /// other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if operands do not need
  /// to be scheduled) or at the end of the block (if users are outside of the
  /// block). This saves some compile time and memory used by the
  /// compiler.
  /// ScheduleData is assigned for each instruction in between the boundaries of
  /// the tree entry, even for those, which are not part of the graph. It is
  /// required to correctly follow the dependencies between the instructions and
  /// their correct scheduling. The ScheduleData is not allocated for the
  /// instructions, which do not require scheduling, like phis, nodes with
  /// extractelements/insertelements only or nodes with instructions, with
  /// uses/operands outside of the block.
  struct BlockScheduling {
    /// Creates the scheduler for basic block \p BB; the chunk size/position
    /// drive the ScheduleData allocation below.
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655
clearllvm::slpvectorizer::BoUpSLP::BlockScheduling3656 void clear() {
3657 ReadyInsts.clear();
3658 ScheduleStart = nullptr;
3659 ScheduleEnd = nullptr;
3660 FirstLoadStoreInRegion = nullptr;
3661 LastLoadStoreInRegion = nullptr;
3662 RegionHasStackSave = false;
3663
3664 // Reduce the maximum schedule region size by the size of the
3665 // previous scheduling run.
3666 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669 ScheduleRegionSize = 0;
3670
3671 // Make a new scheduling region, i.e. all existing ScheduleData is not
3672 // in the new region yet.
3673 ++SchedulingRegionID;
3674 }
3675
getScheduleDatallvm::slpvectorizer::BoUpSLP::BlockScheduling3676 ScheduleData *getScheduleData(Instruction *I) {
3677 if (BB != I->getParent())
3678 // Avoid lookup if can't possibly be in map.
3679 return nullptr;
3680 ScheduleData *SD = ScheduleDataMap.lookup(I);
3681 if (SD && isInSchedulingRegion(SD))
3682 return SD;
3683 return nullptr;
3684 }
3685
getScheduleDatallvm::slpvectorizer::BoUpSLP::BlockScheduling3686 ScheduleData *getScheduleData(Value *V) {
3687 if (auto *I = dyn_cast<Instruction>(V))
3688 return getScheduleData(I);
3689 return nullptr;
3690 }
3691
getScheduleDatallvm::slpvectorizer::BoUpSLP::BlockScheduling3692 ScheduleData *getScheduleData(Value *V, Value *Key) {
3693 if (V == Key)
3694 return getScheduleData(V);
3695 auto I = ExtraScheduleDataMap.find(V);
3696 if (I != ExtraScheduleDataMap.end()) {
3697 ScheduleData *SD = I->second.lookup(Key);
3698 if (SD && isInSchedulingRegion(SD))
3699 return SD;
3700 }
3701 return nullptr;
3702 }
3703
    /// \returns true if \p SD belongs to the current scheduling region.
    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
3707
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    ///
    /// Walks every member of the bundle headed by \p SD, decrements the
    /// unscheduled-dependency counters of everything that depends on it
    /// (def-use, memory, and control dependencies), and inserts any bundle
    /// whose counter reaches zero into \p ReadyList.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        // Only process the entry whose Inst matches its OpValue; secondary
        // (extra-key) entries for the same instruction are skipped so each
        // instruction's dependencies are decremented exactly once.
        if (BundleMember->Inst != BundleMember->OpValue)
          continue;

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              // There are no more unscheduled dependencies after
              // decrementing, so we can put the dependent instruction
              // into the ready list.
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
              LLVM_DEBUG(dbgs()
                         << "SLP: gets ready (def): " << *DepBundle << "\n");
            }
          });
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Since vectorization tree is being built recursively this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. Couple of exceptions known at the moment are extracts
          // where their second (immediate) operand is not added. Since
          // immediates do not affect scheduler behavior this is considered
          // okay.
          auto *In = BundleMember->Inst;
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");
          (void)In; // fake use to avoid build failure when assertions disabled

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
3803
3804 /// Verify basic self consistency properties of the data structure.
verifyllvm::slpvectorizer::BoUpSLP::BlockScheduling3805 void verify() {
3806 if (!ScheduleStart)
3807 return;
3808
3809 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3810 ScheduleStart->comesBefore(ScheduleEnd) &&
3811 "Not a valid scheduling region?");
3812
3813 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3814 auto *SD = getScheduleData(I);
3815 if (!SD)
3816 continue;
3817 assert(isInSchedulingRegion(SD) &&
3818 "primary schedule data not in window?");
3819 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3820 "entire bundle in window!");
3821 (void)SD;
3822 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3823 }
3824
3825 for (auto *SD : ReadyInsts) {
3826 assert(SD->isSchedulingEntity() && SD->isReady() &&
3827 "item in ready list not ready?");
3828 (void)SD;
3829 }
3830 }
3831
doForAllOpcodesllvm::slpvectorizer::BoUpSLP::BlockScheduling3832 void doForAllOpcodes(Value *V,
3833 function_ref<void(ScheduleData *SD)> Action) {
3834 if (ScheduleData *SD = getScheduleData(V))
3835 Action(SD);
3836 auto I = ExtraScheduleDataMap.find(V);
3837 if (I != ExtraScheduleDataMap.end())
3838 for (auto &P : I->second)
3839 if (isInSchedulingRegion(P.second))
3840 Action(P.second);
3841 }
3842
3843 /// Put all instructions into the ReadyList which are ready for scheduling.
3844 template <typename ReadyListType>
initialFillReadyListllvm::slpvectorizer::BoUpSLP::BlockScheduling3845 void initialFillReadyList(ReadyListType &ReadyList) {
3846 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3847 doForAllOpcodes(I, [&](ScheduleData *SD) {
3848 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3849 SD->isReady()) {
3850 ReadyList.insert(SD);
3851 LLVM_DEBUG(dbgs()
3852 << "SLP: initially in ready list: " << *SD << "\n");
3853 }
3854 });
3855 }
3856 }
3857
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleData *buildBundle(ArrayRef<Value *> VL);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instruction in the scheduling region to un-scheduled.
    void resetSchedule();

    /// The basic block this scheduling data is attached to (BlockScheduling
    /// structures are kept per basic block).
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleData to Instruction with the leading key.
    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
        ExtraScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region?  Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
3949 };
3950
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  /// Owned by the caller; may be null when there is nothing to ignore.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3960
3961 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3962 /// sorted SmallVectors of unsigned.
3963 struct OrdersTypeDenseMapInfo {
getEmptyKeyllvm::slpvectorizer::BoUpSLP::OrdersTypeDenseMapInfo3964 static OrdersType getEmptyKey() {
3965 OrdersType V;
3966 V.push_back(~1U);
3967 return V;
3968 }
3969
getTombstoneKeyllvm::slpvectorizer::BoUpSLP::OrdersTypeDenseMapInfo3970 static OrdersType getTombstoneKey() {
3971 OrdersType V;
3972 V.push_back(~2U);
3973 return V;
3974 }
3975
getHashValuellvm::slpvectorizer::BoUpSLP::OrdersTypeDenseMapInfo3976 static unsigned getHashValue(const OrdersType &V) {
3977 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3978 }
3979
isEqualllvm::slpvectorizer::BoUpSLP::OrdersTypeDenseMapInfo3980 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3981 return LHS == RHS;
3982 }
3983 };
3984
  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;
  DemandedBits *DB;
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of tree entries to the smallest bit width with which their scalars
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is True if the
  /// value must be signed-extended, rather than zero-extended, back to its
  /// original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes, used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which supposed to be the roots of the new
  /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
4022 };
4023
4024 } // end namespace slpvectorizer
4025
/// GraphTraits specialization so the SLP vectorization tree can be walked
/// (and dumped) with LLVM's generic graph utilities such as GraphWriter.
template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // Reference to the whole tree, needed to resolve edges to entries.
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    // Dereferencing an edge yields the user tree entry it points to.
    NodeRef operator*() { return I->UserTE; }
  };

  /// The root of the graph is the first entry of the vectorizable tree.
  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
4086
4087 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4088 using TreeEntry = BoUpSLP::TreeEntry;
4089
DOTGraphTraitsllvm::DOTGraphTraits4090 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4091
getNodeLabelllvm::DOTGraphTraits4092 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4093 std::string Str;
4094 raw_string_ostream OS(Str);
4095 OS << Entry->Idx << ".\n";
4096 if (isSplat(Entry->Scalars))
4097 OS << "<splat> ";
4098 for (auto *V : Entry->Scalars) {
4099 OS << *V;
4100 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4101 return EU.Scalar == V;
4102 }))
4103 OS << " <extract>";
4104 OS << "\n";
4105 }
4106 return Str;
4107 }
4108
getNodeAttributesllvm::DOTGraphTraits4109 static std::string getNodeAttributes(const TreeEntry *Entry,
4110 const BoUpSLP *) {
4111 if (Entry->isGather())
4112 return "color=red";
4113 if (Entry->State == TreeEntry::ScatterVectorize ||
4114 Entry->State == TreeEntry::StridedVectorize)
4115 return "color=blue";
4116 return "";
4117 }
4118 };
4119
4120 } // end namespace llvm
4121
// Destructor: erase all instructions queued for deletion during vectorization
// and clean up scalar code left dead by it. Instructions without a parent are
// first re-inserted so eraseFromParent() works uniformly; references are
// dropped before erasing so mutual uses among dead instructions don't trip
// the use_empty() assertion.
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
      continue;
    }
    // Collect single-user operands that become trivially dead once this
    // instruction goes away; they are deleted recursively below.
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
4159
4160 /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4161 /// contains original mask for the scalars reused in the node. Procedure
4162 /// transform this mask in accordance with the given \p Mask.
reorderReuses(SmallVectorImpl<int> & Reuses,ArrayRef<int> Mask)4163 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4164 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4165 "Expected non-empty mask.");
4166 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4167 Prev.swap(Reuses);
4168 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4169 if (Mask[I] != PoisonMaskElem)
4170 Reuses[Mask[I]] = Prev[I];
4171 }
4172
/// Reorders the given \p Order according to the given \p Mask. \p Order - is
/// the original order of the scalars. Procedure transforms the provided order
/// in accordance with the given \p Mask. If the resulting \p Order is just an
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    // Bottom-order path: compose the mask on top of the existing order
    // directly (empty order means identity).
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    // Sz is used as the "unset" marker for order elements.
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    // Clear the order entirely if the composition is the identity.
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  // Top-order path: invert the order into mask form, apply the mask, and
  // invert back.
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
4220
4221 std::optional<BoUpSLP::OrdersType>
findReusedOrderedScalars(const BoUpSLP::TreeEntry & TE)4222 BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4223 assert(TE.isGather() && "Expected gather node only.");
4224 // Try to find subvector extract/insert patterns and reorder only such
4225 // patterns.
4226 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4227 Type *ScalarTy = GatheredScalars.front()->getType();
4228 int NumScalars = GatheredScalars.size();
4229 if (!isValidElementType(ScalarTy))
4230 return std::nullopt;
4231 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4232 int NumParts = TTI->getNumberOfParts(VecTy);
4233 if (NumParts == 0 || NumParts >= NumScalars)
4234 NumParts = 1;
4235 SmallVector<int> ExtractMask;
4236 SmallVector<int> Mask;
4237 SmallVector<SmallVector<const TreeEntry *>> Entries;
4238 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4239 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4240 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4241 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4242 /*ForOrder=*/true);
4243 // No shuffled operands - ignore.
4244 if (GatherShuffles.empty() && ExtractShuffles.empty())
4245 return std::nullopt;
4246 OrdersType CurrentOrder(NumScalars, NumScalars);
4247 if (GatherShuffles.size() == 1 &&
4248 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4249 Entries.front().front()->isSame(TE.Scalars)) {
4250 // Perfect match in the graph, will reuse the previously vectorized
4251 // node. Cost is 0.
4252 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4253 return CurrentOrder;
4254 }
4255 auto IsSplatMask = [](ArrayRef<int> Mask) {
4256 int SingleElt = PoisonMaskElem;
4257 return all_of(Mask, [&](int I) {
4258 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4259 SingleElt = I;
4260 return I == PoisonMaskElem || I == SingleElt;
4261 });
4262 };
4263 // Exclusive broadcast mask - ignore.
4264 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4265 (Entries.size() != 1 ||
4266 Entries.front().front()->ReorderIndices.empty())) ||
4267 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4268 return std::nullopt;
4269 SmallBitVector ShuffledSubMasks(NumParts);
4270 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4271 ArrayRef<int> Mask, int PartSz, int NumParts,
4272 function_ref<unsigned(unsigned)> GetVF) {
4273 for (int I : seq<int>(0, NumParts)) {
4274 if (ShuffledSubMasks.test(I))
4275 continue;
4276 const int VF = GetVF(I);
4277 if (VF == 0)
4278 continue;
4279 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4280 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4281 // Shuffle of at least 2 vectors - ignore.
4282 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4283 std::fill(Slice.begin(), Slice.end(), NumScalars);
4284 ShuffledSubMasks.set(I);
4285 continue;
4286 }
4287 // Try to include as much elements from the mask as possible.
4288 int FirstMin = INT_MAX;
4289 int SecondVecFound = false;
4290 for (int K : seq<int>(Limit)) {
4291 int Idx = Mask[I * PartSz + K];
4292 if (Idx == PoisonMaskElem) {
4293 Value *V = GatheredScalars[I * PartSz + K];
4294 if (isConstant(V) && !isa<PoisonValue>(V)) {
4295 SecondVecFound = true;
4296 break;
4297 }
4298 continue;
4299 }
4300 if (Idx < VF) {
4301 if (FirstMin > Idx)
4302 FirstMin = Idx;
4303 } else {
4304 SecondVecFound = true;
4305 break;
4306 }
4307 }
4308 FirstMin = (FirstMin / PartSz) * PartSz;
4309 // Shuffle of at least 2 vectors - ignore.
4310 if (SecondVecFound) {
4311 std::fill(Slice.begin(), Slice.end(), NumScalars);
4312 ShuffledSubMasks.set(I);
4313 continue;
4314 }
4315 for (int K : seq<int>(Limit)) {
4316 int Idx = Mask[I * PartSz + K];
4317 if (Idx == PoisonMaskElem)
4318 continue;
4319 Idx -= FirstMin;
4320 if (Idx >= PartSz) {
4321 SecondVecFound = true;
4322 break;
4323 }
4324 if (CurrentOrder[I * PartSz + Idx] >
4325 static_cast<unsigned>(I * PartSz + K) &&
4326 CurrentOrder[I * PartSz + Idx] !=
4327 static_cast<unsigned>(I * PartSz + Idx))
4328 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4329 }
4330 // Shuffle of at least 2 vectors - ignore.
4331 if (SecondVecFound) {
4332 std::fill(Slice.begin(), Slice.end(), NumScalars);
4333 ShuffledSubMasks.set(I);
4334 continue;
4335 }
4336 }
4337 };
4338 int PartSz = getPartNumElems(NumScalars, NumParts);
4339 if (!ExtractShuffles.empty())
4340 TransformMaskToOrder(
4341 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4342 if (!ExtractShuffles[I])
4343 return 0U;
4344 unsigned VF = 0;
4345 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4346 for (unsigned Idx : seq<unsigned>(Sz)) {
4347 int K = I * PartSz + Idx;
4348 if (ExtractMask[K] == PoisonMaskElem)
4349 continue;
4350 if (!TE.ReuseShuffleIndices.empty())
4351 K = TE.ReuseShuffleIndices[K];
4352 if (!TE.ReorderIndices.empty())
4353 K = std::distance(TE.ReorderIndices.begin(),
4354 find(TE.ReorderIndices, K));
4355 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4356 if (!EI)
4357 continue;
4358 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4359 ->getElementCount()
4360 .getKnownMinValue());
4361 }
4362 return VF;
4363 });
4364 // Check special corner case - single shuffle of the same entry.
4365 if (GatherShuffles.size() == 1 && NumParts != 1) {
4366 if (ShuffledSubMasks.any())
4367 return std::nullopt;
4368 PartSz = NumScalars;
4369 NumParts = 1;
4370 }
4371 if (!Entries.empty())
4372 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4373 if (!GatherShuffles[I])
4374 return 0U;
4375 return std::max(Entries[I].front()->getVectorFactor(),
4376 Entries[I].back()->getVectorFactor());
4377 });
4378 int NumUndefs =
4379 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4380 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4381 return std::nullopt;
4382 return std::move(CurrentOrder);
4383 }
4384
arePointersCompatible(Value * Ptr1,Value * Ptr2,const TargetLibraryInfo & TLI,bool CompareOpcodes=true)4385 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4386 const TargetLibraryInfo &TLI,
4387 bool CompareOpcodes = true) {
4388 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4389 return false;
4390 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4391 if (!GEP1)
4392 return false;
4393 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4394 if (!GEP2)
4395 return false;
4396 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4397 ((isConstant(GEP1->getOperand(1)) &&
4398 isConstant(GEP2->getOperand(1))) ||
4399 !CompareOpcodes ||
4400 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4401 .getOpcode());
4402 }
4403
4404 /// Calculates minimal alignment as a common alignment.
4405 template <typename T>
computeCommonAlignment(ArrayRef<Value * > VL)4406 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4407 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4408 for (Value *V : VL.drop_front())
4409 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4410 return CommonAlignment;
4411 }
4412
4413 /// Check if \p Order represents reverse order.
isReverseOrder(ArrayRef<unsigned> Order)4414 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4415 unsigned Sz = Order.size();
4416 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4417 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4418 });
4419 }
4420
/// Checks if the provided list of pointers \p Pointers represents the strided
/// pointers for type ElemTy. If they are not, std::nullopt is returned.
/// Otherwise, if \p Inst is not specified, just initialized optional value is
/// returned to show that the pointers represent strided pointers. If \p Inst
/// specified, the runtime stride is materialized before the given \p Inst.
/// \returns std::nullopt if the pointers are not pointers with the runtime
/// stride, nullptr or actual stride value, otherwise.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
  // addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  // Tries to factor Multiplier out of Dist, returning the co-factor, or
  // nullptr if Dist is a product that does not contain Multiplier.
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return SE.getUDivExactExpr(Dist, Multiplier);
  };
  // Stride = Dist / (element_size * (num_pointers - 1)), i.e. the byte span
  // divided by the number of strides it covers.
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  // Only a non-constant (runtime) stride is interesting here; constant
  // strides are handled by other paths.
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      // Verify Diff is exactly Stride * SC (no residual term).
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  // Materialize the runtime stride value right before Inst.
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
4540
/// Forward declaration: returns the (scalar, vectorized) cost pair for the
/// address computations of \p Ptrs relative to \p BasePtr; defined later in
/// this file.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);
4545
/// Checks whether the bundle of loads \p VL can be vectorized as a single wide
/// load (Vectorize), a strided load (StridedVectorize), a masked gather
/// (ScatterVectorize), or must remain a gather of scalar loads (Gather).
/// \param VL the candidate loads.
/// \param VL0 the "main" load used to derive the scalar type and base pointer.
/// \param Order [out] sorted order of the pointers; empty if already ordered.
/// \param PointerOps [out] the loads' pointer operands.
/// \param TryRecursiveCheck when true, a candidate masked gather is also
/// cost-compared against splitting into smaller vectorized loads + shuffles
/// (set to false on the recursive calls made by that check itself).
BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
    SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  // from such a struct, we read/write packed bits disagreeing with the
  // unvectorized version.
  Type *ScalarTy = VL0->getType();

  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    if (!L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
  if (!Order.empty() && !isPowerOf2_32(VL.size())) {
    assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                   "supported with VectorizeNonPowerOf2");
    return LoadsState::Gather;
  }

  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // Even if the pointers cannot be sorted at compile time, a common stride
  // computable at runtime still allows a strided load on capable targets.
  if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
      TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
      calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
    return LoadsState::StridedVectorize;
  if (IsSorted || all_of(PointerOps, [&](Value *P) {
        return arePointersCompatible(P, PointerOps.front(), *TLI);
      })) {
    if (IsSorted) {
      Value *Ptr0;
      Value *PtrN;
      if (Order.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[Order.front()];
        PtrN = PointerOps[Order.back()];
      }
      std::optional<int> Diff =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // NOTE(review): Diff is dereferenced below without a null check. This
      // presumably relies on IsSorted implying sortPtrAccesses could compute
      // all pairwise pointer differences - confirm that invariant holds.
      // Check that the sorted loads are consecutive.
      if (static_cast<unsigned>(*Diff) == Sz - 1)
        return LoadsState::Vectorize;
      // Simple check if not a strided access - clear order.
      bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
      // Try to generate strided load node if:
      // 1. Target with strided load support is detected.
      // 2. The number of loads is greater than MinProfitableStridedLoads,
      // or the potential stride <= MaxProfitableLoadStride and the
      // potential stride is power-of-2 (to avoid perf regressions for the very
      // small number of loads) and max distance > number of loads, or potential
      // stride is -1.
      // 3. The loads are ordered, or number of unordered loads <=
      // MaxProfitableUnorderedLoads, or loads are in reversed order.
      // (this check is to avoid extra costs for very expensive shuffles).
      if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
                                  (static_cast<unsigned>(std::abs(*Diff)) <=
                                       MaxProfitableLoadStride * Sz &&
                                   isPowerOf2_32(std::abs(*Diff)))) &&
                                 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
                                *Diff == -(static_cast<int>(Sz) - 1))) {
        int Stride = *Diff / static_cast<int>(Sz - 1);
        // Only a whole-number stride over Sz - 1 steps can be strided.
        if (*Diff == Stride * static_cast<int>(Sz - 1)) {
          Align Alignment =
              cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
                  ->getAlign();
          if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
            // Iterate through all pointers and check if all distances are
            // unique multiple of Dist.
            SmallSet<int, 4> Dists;
            for (Value *Ptr : PointerOps) {
              int Dist = 0;
              if (Ptr == PtrN)
                Dist = *Diff;
              else if (Ptr != Ptr0)
                Dist =
                    *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
              // If the strides are not the same or repeated, we can't
              // vectorize.
              if (((Dist / Stride) * Stride) != Dist ||
                  !Dists.insert(Dist).second)
                break;
            }
            // All Sz distances inserted => every pointer sits on the stride.
            if (Dists.size() == Sz)
              return LoadsState::StridedVectorize;
          }
        }
      }
    }
    // Cost-compares one masked gather of the whole bundle against splitting
    // it into VF-sized chunks, vectorizing each chunk recursively and
    // combining them with insert-subvector shuffles. Returns true when the
    // masked gather is NOT cheaper (i.e. caller should fall back to Gather).
    auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
      unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
      unsigned MinVF = getMinVF(Sz);
      unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
      MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
      for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
        unsigned VectorizedCnt = 0;
        SmallVector<LoadsState> States;
        for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
             Cnt += VF, ++VectorizedCnt) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          SmallVector<unsigned> Order;
          SmallVector<Value *> PointerOps;
          LoadsState LS =
              canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                                /*TryRecursiveCheck=*/false);
          // Check that the sorted loads are consecutive.
          if (LS == LoadsState::Gather)
            break;
          // If need the reorder - consider as high-cost masked gather for now.
          if ((LS == LoadsState::Vectorize ||
               LS == LoadsState::StridedVectorize) &&
              !Order.empty() && !isReverseOrder(Order))
            LS = LoadsState::ScatterVectorize;
          States.push_back(LS);
        }
        // Can be vectorized later as a serie of loads/insertelements.
        if (VectorizedCnt == VL.size() / VF) {
          // Compare masked gather cost and loads + insersubvector costs.
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
          auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
              TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
              CostKind, ScalarTy, VecTy);
          InstructionCost MaskedGatherCost =
              TTI.getGatherScatterOpCost(
                  Instruction::Load, VecTy,
                  cast<LoadInst>(VL0)->getPointerOperand(),
                  /*VariableMask=*/false, CommonAlignment, CostKind) +
              VectorGEPCost - ScalarGEPCost;
          InstructionCost VecLdCost = 0;
          auto *SubVecTy = getWidenedType(ScalarTy, VF);
          // Accumulate the cost of loading each VF-sized chunk according to
          // the vectorization kind the recursive check chose for it.
          for (auto [I, LS] : enumerate(States)) {
            auto *LI0 = cast<LoadInst>(VL[I * VF]);
            switch (LS) {
            case LoadsState::Vectorize: {
              auto [ScalarGEPCost, VectorGEPCost] =
                  getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                              LI0->getPointerOperand(), Instruction::Load,
                              CostKind, ScalarTy, SubVecTy);
              VecLdCost += TTI.getMemoryOpCost(
                               Instruction::Load, SubVecTy, LI0->getAlign(),
                               LI0->getPointerAddressSpace(), CostKind,
                               TTI::OperandValueInfo()) +
                           VectorGEPCost - ScalarGEPCost;
              break;
            }
            case LoadsState::StridedVectorize: {
              auto [ScalarGEPCost, VectorGEPCost] =
                  getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                              LI0->getPointerOperand(), Instruction::Load,
                              CostKind, ScalarTy, SubVecTy);
              VecLdCost +=
                  TTI.getStridedMemoryOpCost(
                      Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                      /*VariableMask=*/false, CommonAlignment, CostKind) +
                  VectorGEPCost - ScalarGEPCost;
              break;
            }
            case LoadsState::ScatterVectorize: {
              auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
                  TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                  LI0->getPointerOperand(), Instruction::GetElementPtr,
                  CostKind, ScalarTy, SubVecTy);
              VecLdCost +=
                  TTI.getGatherScatterOpCost(
                      Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                      /*VariableMask=*/false, CommonAlignment, CostKind) +
                  VectorGEPCost - ScalarGEPCost;
              break;
            }
            case LoadsState::Gather:
              llvm_unreachable(
                  "Expected only consecutive, strided or masked gather loads.");
            }
            // Add the cost of inserting this chunk into the full-width vector.
            SmallVector<int> ShuffleMask(VL.size());
            for (int Idx : seq<int>(0, VL.size()))
              ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
            VecLdCost +=
                TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
                                   CostKind, I * VF, SubVecTy);
          }
          // If masked gather cost is higher - better to vectorize, so
          // consider it as a gather node. It will be better estimated
          // later.
          if (MaskedGatherCost >= VecLdCost)
            return true;
        }
      }
      return false;
    };
    // TODO: need to improve analysis of the pointers, if not all of them are
    // GEPs or have > 2 operands, we end up with a gather node, which just
    // increases the cost.
    Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
    bool ProfitableGatherPointers =
        L && Sz > 2 &&
        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
          return L->isLoopInvariant(V);
        })) <= Sz / 2;
    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
          auto *GEP = dyn_cast<GetElementPtrInst>(P);
          return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
                 (GEP && GEP->getNumOperands() == 2 &&
                  isa<Constant, Instruction>(GEP->getOperand(1)));
        })) {
      Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
      if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
          !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
        // Check if potential masked gather can be represented as series
        // of loads + insertsubvectors.
        if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
          // If masked gather cost is higher - better to vectorize, so
          // consider it as a gather node. It will be better estimated
          // later.
          return LoadsState::Gather;
        }
        return LoadsState::ScatterVectorize;
      }
    }
  }

  return LoadsState::Gather;
}
4787
clusterSortPtrAccesses(ArrayRef<Value * > VL,Type * ElemTy,const DataLayout & DL,ScalarEvolution & SE,SmallVectorImpl<unsigned> & SortedIndices)4788 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4789 const DataLayout &DL, ScalarEvolution &SE,
4790 SmallVectorImpl<unsigned> &SortedIndices) {
4791 assert(llvm::all_of(
4792 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4793 "Expected list of pointer operands.");
4794 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
4795 // Ptr into, sort and return the sorted indices with values next to one
4796 // another.
4797 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4798 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4799
4800 unsigned Cnt = 1;
4801 for (Value *Ptr : VL.drop_front()) {
4802 bool Found = any_of(Bases, [&](auto &Base) {
4803 std::optional<int> Diff =
4804 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4805 /*StrictCheck=*/true);
4806 if (!Diff)
4807 return false;
4808
4809 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4810 return true;
4811 });
4812
4813 if (!Found) {
4814 // If we haven't found enough to usefully cluster, return early.
4815 if (Bases.size() > VL.size() / 2 - 1)
4816 return false;
4817
4818 // Not found already - add a new Base
4819 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4820 }
4821 }
4822
4823 // For each of the bases sort the pointers by Offset and check if any of the
4824 // base become consecutively allocated.
4825 bool AnyConsecutive = false;
4826 for (auto &Base : Bases) {
4827 auto &Vec = Base.second;
4828 if (Vec.size() > 1) {
4829 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4830 const std::tuple<Value *, int, unsigned> &Y) {
4831 return std::get<1>(X) < std::get<1>(Y);
4832 });
4833 int InitialOffset = std::get<1>(Vec[0]);
4834 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4835 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4836 });
4837 }
4838 }
4839
4840 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
4841 SortedIndices.clear();
4842 if (!AnyConsecutive)
4843 return false;
4844
4845 for (auto &Base : Bases) {
4846 for (auto &T : Base.second)
4847 SortedIndices.push_back(std::get<2>(T));
4848 }
4849
4850 assert(SortedIndices.size() == VL.size() &&
4851 "Expected SortedIndices to be the size of VL");
4852 return true;
4853 }
4854
4855 std::optional<BoUpSLP::OrdersType>
findPartiallyOrderedLoads(const BoUpSLP::TreeEntry & TE)4856 BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4857 assert(TE.isGather() && "Expected gather node only.");
4858 Type *ScalarTy = TE.Scalars[0]->getType();
4859
4860 SmallVector<Value *> Ptrs;
4861 Ptrs.reserve(TE.Scalars.size());
4862 for (Value *V : TE.Scalars) {
4863 auto *L = dyn_cast<LoadInst>(V);
4864 if (!L || !L->isSimple())
4865 return std::nullopt;
4866 Ptrs.push_back(L->getPointerOperand());
4867 }
4868
4869 BoUpSLP::OrdersType Order;
4870 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4871 return std::move(Order);
4872 return std::nullopt;
4873 }
4874
/// Check if two insertelement instructions are from the same buildvector.
/// \param VU first insertelement instruction.
/// \param V second insertelement instruction.
/// \param GetBaseOperand callback returning the vector operand to follow when
/// walking down an insertelement chain.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  // Both inserts must have constant indices to reason about the chain.
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  // ReusedIdx tracks which lane indices the walked chain has already written;
  // hitting the same lane twice means a later insert overwrites an earlier
  // one, so the chain is not a plain buildvector.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    // One chain reached the other's head insert: same buildvector iff that
    // head is single-used.
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    // Advance IE1 down its chain (intentionally shadows the outer Idx1).
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      // Stop walking on a multi-use intermediate insert or a reused lane.
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    // Advance IE2 down its chain symmetrically (shadows the outer Idx2).
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
4926
/// Computes the preferred element order (if any) for the tree entry \p TE.
/// \param TopToBottom true when called during the top-to-bottom reordering
/// stage, false during bottom-to-top; some reorders are only considered
/// profitable in one direction.
/// \returns the order to apply, or std::nullopt when no reordering is
/// required or possible.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
  if (TE.isNonPowOf2Vec())
    return std::nullopt;

  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that reuse mask is "clustered", i.e. each scalar values
    // is used once in each submask of size <number_of_scalars>.
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
    //                           element 3 is used twice in the second submask.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      // Gather node: try to derive an order from already-vectorized scalars
      // and fold it into the reuses mask.
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    // A 2->4 widening that fits into a single register needs no reorder.
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
                                             2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      // Non-single-source reuse mask: try to build a "clustered" order where
      // each submask of size Sz picks one distinct cluster value.
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        // Each submask must reference a single, not-yet-used value and not be
        // mostly undef; otherwise reordering cannot help.
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K)
          ResOrder[Val + Sz * K] = I + K;
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        all_of(TE.Scalars, [Sz](Value *V) {
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      // Rewrite the reused mask in terms of the extracts' source lanes so the
      // final shuffle reads the source vector in order.
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  // Strided loads reversed (or kept in place) on the bottom-to-top pass keep
  // their natural order unless every user is a binary op.
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      !TE.isAltShuffle())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    // Order PHIs so that related users (same buildvector / same extract
    // source) end up next to each other; ties keep the original order.
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (!areTwoInsertFromSameBuildVector(
                  IE1, IE2,
                  [](InsertElementInst *II) { return II->getOperand(0); }))
            return I1 < I2;
          return getElementIndex(IE1) < getElementIndex(IE2);
        }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
            return I1 < I2;
          return getElementIndex(EE1) < getElementIndex(EE2);
        }
      return I1 < I2;
    };
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
        if (Idx != Order[Idx])
          return false;
      return true;
    };
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    DenseMap<unsigned, unsigned> PhiToId;
    SmallVector<unsigned> Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    OrdersType ResOrder(TE.Scalars.size());
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
      PhiToId[Id] = Id;
    stable_sort(Phis, PHICompare);
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        // Moving the single defined value to lane 0 pays off only when the
        // insert-at-0 (+ optional permute) beats insert-at-Idx.
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 4)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
  }
  return std::nullopt;
}
5171
5172 /// Checks if the given mask is a "clustered" mask with the same clusters of
5173 /// size \p Sz, which are not identity submasks.
isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,unsigned Sz)5174 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5175 unsigned Sz) {
5176 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5177 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5178 return false;
5179 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5180 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5181 if (Cluster != FirstCluster)
5182 return false;
5183 }
5184 return true;
5185 }
5186
reorderNodeWithReuses(TreeEntry & TE,ArrayRef<int> Mask) const5187 void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5188 // Reorder reuses mask.
5189 reorderReuses(TE.ReuseShuffleIndices, Mask);
5190 const unsigned Sz = TE.Scalars.size();
5191 // For vectorized and non-clustered reused no need to do anything else.
5192 if (!TE.isGather() ||
5193 !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5194 Sz) ||
5195 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5196 return;
5197 SmallVector<int> NewMask;
5198 inversePermutation(TE.ReorderIndices, NewMask);
5199 addMask(NewMask, TE.ReuseShuffleIndices);
5200 // Clear reorder since it is going to be applied to the new mask.
5201 TE.ReorderIndices.clear();
5202 // Try to improve gathered nodes with clustered reuses, if possible.
5203 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5204 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5205 inversePermutation(NewOrder, NewMask);
5206 reorderScalars(TE.Scalars, NewMask);
5207 // Fill the reuses mask with the identity submasks.
5208 for (auto *It = TE.ReuseShuffleIndices.begin(),
5209 *End = TE.ReuseShuffleIndices.end();
5210 It != End; std::advance(It, Sz))
5211 std::iota(It, std::next(It, Sz), 0);
5212 }
5213
combineOrders(MutableArrayRef<unsigned> Order,ArrayRef<unsigned> SecondaryOrder)5214 static void combineOrders(MutableArrayRef<unsigned> Order,
5215 ArrayRef<unsigned> SecondaryOrder) {
5216 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5217 "Expected same size of orders");
5218 unsigned Sz = Order.size();
5219 SmallBitVector UsedIndices(Sz);
5220 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5221 if (Order[Idx] != Sz)
5222 UsedIndices.set(Order[Idx]);
5223 }
5224 if (SecondaryOrder.empty()) {
5225 for (unsigned Idx : seq<unsigned>(0, Sz))
5226 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5227 Order[Idx] = Idx;
5228 } else {
5229 for (unsigned Idx : seq<unsigned>(0, Sz))
5230 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5231 !UsedIndices.test(SecondaryOrder[Idx]))
5232 Order[Idx] = SecondaryOrder[Idx];
5233 }
5234 }
5235
reorderTopToBottom()5236 void BoUpSLP::reorderTopToBottom() {
5237 // Maps VF to the graph nodes.
5238 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5239 // ExtractElement gather nodes which can be vectorized and need to handle
5240 // their ordering.
5241 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5242
5243 // Phi nodes can have preferred ordering based on their result users
5244 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5245
5246 // AltShuffles can also have a preferred ordering that leads to fewer
5247 // instructions, e.g., the addsub instruction in x86.
5248 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5249
5250 // Maps a TreeEntry to the reorder indices of external users.
5251 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5252 ExternalUserReorderMap;
5253 // Find all reorderable nodes with the given VF.
5254 // Currently the are vectorized stores,loads,extracts + some gathering of
5255 // extracts.
5256 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5257 const std::unique_ptr<TreeEntry> &TE) {
5258 // Look for external users that will probably be vectorized.
5259 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5260 findExternalStoreUsersReorderIndices(TE.get());
5261 if (!ExternalUserReorderIndices.empty()) {
5262 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5263 ExternalUserReorderMap.try_emplace(TE.get(),
5264 std::move(ExternalUserReorderIndices));
5265 }
5266
5267 // Patterns like [fadd,fsub] can be combined into a single instruction in
5268 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5269 // to take into account their order when looking for the most used order.
5270 if (TE->isAltShuffle()) {
5271 VectorType *VecTy =
5272 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5273 unsigned Opcode0 = TE->getOpcode();
5274 unsigned Opcode1 = TE->getAltOpcode();
5275 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5276 // If this pattern is supported by the target then we consider the order.
5277 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5278 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5279 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5280 }
5281 // TODO: Check the reverse order too.
5282 }
5283
5284 if (std::optional<OrdersType> CurrentOrder =
5285 getReorderingData(*TE, /*TopToBottom=*/true)) {
5286 // Do not include ordering for nodes used in the alt opcode vectorization,
5287 // better to reorder them during bottom-to-top stage. If follow the order
5288 // here, it causes reordering of the whole graph though actually it is
5289 // profitable just to reorder the subgraph that starts from the alternate
5290 // opcode vectorization node. Such nodes already end-up with the shuffle
5291 // instruction and it is just enough to change this shuffle rather than
5292 // rotate the scalars for the whole graph.
5293 unsigned Cnt = 0;
5294 const TreeEntry *UserTE = TE.get();
5295 while (UserTE && Cnt < RecursionMaxDepth) {
5296 if (UserTE->UserTreeIndices.size() != 1)
5297 break;
5298 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5299 return EI.UserTE->State == TreeEntry::Vectorize &&
5300 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5301 }))
5302 return;
5303 UserTE = UserTE->UserTreeIndices.back().UserTE;
5304 ++Cnt;
5305 }
5306 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5307 if (!(TE->State == TreeEntry::Vectorize ||
5308 TE->State == TreeEntry::StridedVectorize) ||
5309 !TE->ReuseShuffleIndices.empty())
5310 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5311 if (TE->State == TreeEntry::Vectorize &&
5312 TE->getOpcode() == Instruction::PHI)
5313 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5314 }
5315 });
5316
5317 // Reorder the graph nodes according to their vectorization factor.
5318 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5319 VF /= 2) {
5320 auto It = VFToOrderedEntries.find(VF);
5321 if (It == VFToOrderedEntries.end())
5322 continue;
5323 // Try to find the most profitable order. We just are looking for the most
5324 // used order and reorder scalar elements in the nodes according to this
5325 // mostly used order.
5326 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5327 // All operands are reordered and used only in this node - propagate the
5328 // most used order to the user node.
5329 MapVector<OrdersType, unsigned,
5330 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331 OrdersUses;
5332 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5333 for (const TreeEntry *OpTE : OrderedEntries) {
5334 // No need to reorder this nodes, still need to extend and to use shuffle,
5335 // just need to merge reordering shuffle and the reuse shuffle.
5336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5337 continue;
5338 // Count number of orders uses.
5339 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5340 &PhisToOrders]() -> const OrdersType & {
5341 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5342 auto It = GathersToOrders.find(OpTE);
5343 if (It != GathersToOrders.end())
5344 return It->second;
5345 }
5346 if (OpTE->isAltShuffle()) {
5347 auto It = AltShufflesToOrders.find(OpTE);
5348 if (It != AltShufflesToOrders.end())
5349 return It->second;
5350 }
5351 if (OpTE->State == TreeEntry::Vectorize &&
5352 OpTE->getOpcode() == Instruction::PHI) {
5353 auto It = PhisToOrders.find(OpTE);
5354 if (It != PhisToOrders.end())
5355 return It->second;
5356 }
5357 return OpTE->ReorderIndices;
5358 }();
5359 // First consider the order of the external scalar users.
5360 auto It = ExternalUserReorderMap.find(OpTE);
5361 if (It != ExternalUserReorderMap.end()) {
5362 const auto &ExternalUserReorderIndices = It->second;
5363 // If the OpTE vector factor != number of scalars - use natural order,
5364 // it is an attempt to reorder node with reused scalars but with
5365 // external uses.
5366 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5367 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5368 ExternalUserReorderIndices.size();
5369 } else {
5370 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5371 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5372 }
5373 // No other useful reorder data in this entry.
5374 if (Order.empty())
5375 continue;
5376 }
5377 // Stores actually store the mask, not the order, need to invert.
5378 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5379 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5380 SmallVector<int> Mask;
5381 inversePermutation(Order, Mask);
5382 unsigned E = Order.size();
5383 OrdersType CurrentOrder(E, E);
5384 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5385 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5386 });
5387 fixupOrderingIndices(CurrentOrder);
5388 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5389 } else {
5390 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5391 }
5392 }
5393 if (OrdersUses.empty())
5394 continue;
5395 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5396 const unsigned Sz = Order.size();
5397 for (unsigned Idx : seq<unsigned>(0, Sz))
5398 if (Idx != Order[Idx] && Order[Idx] != Sz)
5399 return false;
5400 return true;
5401 };
5402 // Choose the most used order.
5403 unsigned IdentityCnt = 0;
5404 unsigned FilledIdentityCnt = 0;
5405 OrdersType IdentityOrder(VF, VF);
5406 for (auto &Pair : OrdersUses) {
5407 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5408 if (!Pair.first.empty())
5409 FilledIdentityCnt += Pair.second;
5410 IdentityCnt += Pair.second;
5411 combineOrders(IdentityOrder, Pair.first);
5412 }
5413 }
5414 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5415 unsigned Cnt = IdentityCnt;
5416 for (auto &Pair : OrdersUses) {
5417 // Prefer identity order. But, if filled identity found (non-empty order)
5418 // with same number of uses, as the new candidate order, we can choose
5419 // this candidate order.
5420 if (Cnt < Pair.second ||
5421 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5422 Cnt == Pair.second && !BestOrder.empty() &&
5423 IsIdentityOrder(BestOrder))) {
5424 combineOrders(Pair.first, BestOrder);
5425 BestOrder = Pair.first;
5426 Cnt = Pair.second;
5427 } else {
5428 combineOrders(BestOrder, Pair.first);
5429 }
5430 }
5431 // Set order of the user node.
5432 if (IsIdentityOrder(BestOrder))
5433 continue;
5434 fixupOrderingIndices(BestOrder);
5435 SmallVector<int> Mask;
5436 inversePermutation(BestOrder, Mask);
5437 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5438 unsigned E = BestOrder.size();
5439 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5440 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5441 });
5442 // Do an actual reordering, if profitable.
5443 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5444 // Just do the reordering for the nodes with the given VF.
5445 if (TE->Scalars.size() != VF) {
5446 if (TE->ReuseShuffleIndices.size() == VF) {
5447 // Need to reorder the reuses masks of the operands with smaller VF to
5448 // be able to find the match between the graph nodes and scalar
5449 // operands of the given node during vectorization/cost estimation.
5450 assert(all_of(TE->UserTreeIndices,
5451 [VF, &TE](const EdgeInfo &EI) {
5452 return EI.UserTE->Scalars.size() == VF ||
5453 EI.UserTE->Scalars.size() ==
5454 TE->Scalars.size();
5455 }) &&
5456 "All users must be of VF size.");
5457 // Update ordering of the operands with the smaller VF than the given
5458 // one.
5459 reorderNodeWithReuses(*TE, Mask);
5460 }
5461 continue;
5462 }
5463 if ((TE->State == TreeEntry::Vectorize ||
5464 TE->State == TreeEntry::StridedVectorize) &&
5465 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5466 InsertElementInst>(TE->getMainOp()) &&
5467 !TE->isAltShuffle()) {
5468 // Build correct orders for extract{element,value}, loads and
5469 // stores.
5470 reorderOrder(TE->ReorderIndices, Mask);
5471 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5472 TE->reorderOperands(Mask);
5473 } else {
5474 // Reorder the node and its operands.
5475 TE->reorderOperands(Mask);
5476 assert(TE->ReorderIndices.empty() &&
5477 "Expected empty reorder sequence.");
5478 reorderScalars(TE->Scalars, Mask);
5479 }
5480 if (!TE->ReuseShuffleIndices.empty()) {
5481 // Apply reversed order to keep the original ordering of the reused
5482 // elements to avoid extra reorder indices shuffling.
5483 OrdersType CurrentOrder;
5484 reorderOrder(CurrentOrder, MaskOrder);
5485 SmallVector<int> NewReuses;
5486 inversePermutation(CurrentOrder, NewReuses);
5487 addMask(NewReuses, TE->ReuseShuffleIndices);
5488 TE->ReuseShuffleIndices.swap(NewReuses);
5489 }
5490 }
5491 }
5492 }
5493
/// Checks whether the operands of \p UserTE can be reordered and collects the
/// data required to do so.
/// \param UserTE The user node whose operands are analyzed.
/// \param Edges Populated with (operand index, operand entry) pairs for the
///        vectorized operand nodes that may be reordered.
/// \param ReorderableGathers All non-vectorized (gather-like) nodes of the
///        graph, used to find the gather entry feeding a given operand.
/// \param GatherOps Populated with gather/buildvector operand nodes that only
///        need their scalars reordered.
/// \returns false if some operand cannot be reordered safely (e.g. a
/// vectorized operand is shared by several user nodes, or an operand is fed
/// by several distinct gather nodes), true otherwise.
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
  if (UserTE->isNonPowOf2Vec())
    return false;

  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    // This operand edge was already recorded as a vectorized operand - no
    // need to process it again.
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops.
      // If there are reused scalars, process this node as a regular vectorize
      // node, just reorder reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    // Look for gather nodes feeding this operand edge. If more than one
    // distinct gather node matches (and the operand is not all-constant),
    // reordering would have to touch several gathers consistently - give up.
    // Note: the lambda records the last matching gather in Gather as a side
    // effect.
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
5552
/// Reorders the graph bottom-to-top: starting from reorderable leaf nodes
/// (vectorized loads/extracts and gathers with a known order), the most-used
/// order among a user's operands is propagated to the user node itself,
/// sinking shuffles towards the root of the graph.
/// \param IgnoreReorder If true, the order of the root node may be dropped at
/// the end (the root's reordering is not required by the caller).
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      // Gathers and nodes with reused scalars are tracked separately - their
      // order is consumed differently from plain vectorized nodes.
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands, that are reordered, try to make at least
  // one operand order in the natural order and reorder others + reorder the
  // user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      // Keep only entries that are (strided-)vectorized or tracked gathers,
      // have a single user node (all edges point to the same user), carry no
      // reuse mask and were not processed on a previous iteration.
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    // Process user nodes from the bottom of the graph upwards (larger Idx
    // first) so orders are sunk as deep as possible.
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though might be not
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully non-ordered
        // orders.
        if (Order.size() == 1)
          continue;
        // Each order is weighted by the number of operand edges it covers.
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        // Res tracks the counter of the empty (natural) order; it gets a vote
        // for every transitive user that does not allow reordering.
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
          if (TE->isNonPowOf2Vec())
            return false;
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(TE))
              return !getReorderingData(*TE, /*TopToBottom=*/false)
                          .value_or(OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increase of
          // the compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering rather than those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Identity order may be encoded either as an empty order or with Sz
      // used as the "any position" placeholder.
      auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
        const unsigned Sz = Order.size();
        for (unsigned Idx : seq<unsigned>(0, Sz))
          if (Idx != Order[Idx] && Order[Idx] != Sz)
            return false;
        return true;
      };
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order. But, if filled identity found (non-empty
        // order) with same number of uses, as the new candidate order, we can
        // choose this candidate order.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (IsIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        // Once the root node itself got reordered, its order must be kept.
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
5832
/// Records all scalars of the vectorizable tree that are used outside of the
/// tree (or listed in \p ExternallyUsedValues) into the ExternalUses list, so
/// that extractelement instructions can be emitted for them after
/// vectorization. An entry with a null User means "extract needed for an
/// arbitrary/unknown user".
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Maps a scalar to the index of its record in ExternalUses, so at most one
  // record per scalar is created once the user becomes irrelevant.
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      // (A null User in the existing record already requests an extract for
      // any user.)
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto *ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          // The in-tree user still needs the scalar - demote the record to a
          // user-independent extract (null user) and stop scanning users.
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        // Too many users: do not track each one, emit a single extract for
        // any user instead.
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
5909
5910 DenseMap<Value *, SmallVector<StoreInst *>>
collectUserStores(const BoUpSLP::TreeEntry * TE) const5911 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5912 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5913 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5914 Value *V = TE->Scalars[Lane];
5915 // To save compilation time we don't visit if we have too many users.
5916 if (V->hasNUsesOrMore(UsesLimit))
5917 break;
5918
5919 // Collect stores per pointer object.
5920 for (User *U : V->users()) {
5921 auto *SI = dyn_cast<StoreInst>(U);
5922 if (SI == nullptr || !SI->isSimple() ||
5923 !isValidElementType(SI->getValueOperand()->getType()))
5924 continue;
5925 // Skip entry if already
5926 if (getTreeEntry(U))
5927 continue;
5928
5929 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5930 auto &StoresVec = PtrToStoresMap[Ptr];
5931 // For now just keep one store per pointer object per lane.
5932 // TODO: Extend this to support multiple stores per pointer per lane
5933 if (StoresVec.size() > Lane)
5934 continue;
5935 // Skip if in different BBs.
5936 if (!StoresVec.empty() &&
5937 SI->getParent() != StoresVec.back()->getParent())
5938 continue;
5939 // Make sure that the stores are of the same type.
5940 if (!StoresVec.empty() &&
5941 SI->getValueOperand()->getType() !=
5942 StoresVec.back()->getValueOperand()->getType())
5943 continue;
5944 StoresVec.push_back(SI);
5945 }
5946 }
5947 return PtrToStoresMap;
5948 }
5949
canFormVector(ArrayRef<StoreInst * > StoresVec,OrdersType & ReorderIndices) const5950 bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5951 OrdersType &ReorderIndices) const {
5952 // We check whether the stores in StoreVec can form a vector by sorting them
5953 // and checking whether they are consecutive.
5954
5955 // To avoid calling getPointersDiff() while sorting we create a vector of
5956 // pairs {store, offset from first} and sort this instead.
5957 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5958 StoreInst *S0 = StoresVec[0];
5959 StoreOffsetVec[0] = {S0, 0};
5960 Type *S0Ty = S0->getValueOperand()->getType();
5961 Value *S0Ptr = S0->getPointerOperand();
5962 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5963 StoreInst *SI = StoresVec[Idx];
5964 std::optional<int> Diff =
5965 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5966 SI->getPointerOperand(), *DL, *SE,
5967 /*StrictCheck=*/true);
5968 // We failed to compare the pointers so just abandon this StoresVec.
5969 if (!Diff)
5970 return false;
5971 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5972 }
5973
5974 // Sort the vector based on the pointers. We create a copy because we may
5975 // need the original later for calculating the reorder (shuffle) indices.
5976 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5977 const std::pair<StoreInst *, int> &Pair2) {
5978 int Offset1 = Pair1.second;
5979 int Offset2 = Pair2.second;
5980 return Offset1 < Offset2;
5981 });
5982
5983 // Check if the stores are consecutive by checking if their difference is 1.
5984 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5985 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5986 return false;
5987
5988 // Calculate the shuffle indices according to their offset against the sorted
5989 // StoreOffsetVec.
5990 ReorderIndices.reserve(StoresVec.size());
5991 for (StoreInst *SI : StoresVec) {
5992 unsigned Idx = find_if(StoreOffsetVec,
5993 [SI](const std::pair<StoreInst *, int> &Pair) {
5994 return Pair.first == SI;
5995 }) -
5996 StoreOffsetVec.begin();
5997 ReorderIndices.push_back(Idx);
5998 }
5999 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6000 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6001 // same convention here.
6002 auto IsIdentityOrder = [](const OrdersType &Order) {
6003 for (unsigned Idx : seq<unsigned>(0, Order.size()))
6004 if (Idx != Order[Idx])
6005 return false;
6006 return true;
6007 };
6008 if (IsIdentityOrder(ReorderIndices))
6009 ReorderIndices.clear();
6010
6011 return true;
6012 }
6013
6014 #ifndef NDEBUG
dumpOrder(const BoUpSLP::OrdersType & Order)6015 LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6016 for (unsigned Idx : Order)
6017 dbgs() << Idx << ", ";
6018 dbgs() << "\n";
6019 }
6020 #endif
6021
6022 SmallVector<BoUpSLP::OrdersType, 1>
findExternalStoreUsersReorderIndices(TreeEntry * TE) const6023 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6024 unsigned NumLanes = TE->Scalars.size();
6025
6026 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6027 collectUserStores(TE);
6028
6029 // Holds the reorder indices for each candidate store vector that is a user of
6030 // the current TreeEntry.
6031 SmallVector<OrdersType, 1> ExternalReorderIndices;
6032
6033 // Now inspect the stores collected per pointer and look for vectorization
6034 // candidates. For each candidate calculate the reorder index vector and push
6035 // it into `ExternalReorderIndices`
6036 for (const auto &Pair : PtrToStoresMap) {
6037 auto &StoresVec = Pair.second;
6038 // If we have fewer than NumLanes stores, then we can't form a vector.
6039 if (StoresVec.size() != NumLanes)
6040 continue;
6041
6042 // If the stores are not consecutive then abandon this StoresVec.
6043 OrdersType ReorderIndices;
6044 if (!canFormVector(StoresVec, ReorderIndices))
6045 continue;
6046
6047 // We now know that the scalars in StoresVec can form a vector instruction,
6048 // so set the reorder indices.
6049 ExternalReorderIndices.push_back(ReorderIndices);
6050 }
6051 return ExternalReorderIndices;
6052 }
6053
buildTree(ArrayRef<Value * > Roots,const SmallDenseSet<Value * > & UserIgnoreLst)6054 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6055 const SmallDenseSet<Value *> &UserIgnoreLst) {
6056 deleteTree();
6057 UserIgnoreList = &UserIgnoreLst;
6058 if (!allSameType(Roots))
6059 return;
6060 buildTree_rec(Roots, 0, EdgeInfo());
6061 }
6062
buildTree(ArrayRef<Value * > Roots)6063 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6064 deleteTree();
6065 if (!allSameType(Roots))
6066 return;
6067 buildTree_rec(Roots, 0, EdgeInfo());
6068 }
6069
6070 /// \return true if the specified list of values has only one instruction that
6071 /// requires scheduling, false otherwise.
6072 #ifndef NDEBUG
needToScheduleSingleInstruction(ArrayRef<Value * > VL)6073 static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6074 Value *NeedsScheduling = nullptr;
6075 for (Value *V : VL) {
6076 if (doesNotNeedToBeScheduled(V))
6077 continue;
6078 if (!NeedsScheduling) {
6079 NeedsScheduling = V;
6080 continue;
6081 }
6082 return false;
6083 }
6084 return NeedsScheduling;
6085 }
6086 #endif
6087
/// Generates key/subkey pair for the given value to provide effective sorting
/// of the values and better detection of the vectorizable values sequences. The
/// keys/subkeys can be used for better sorting of the values themselves (keys)
/// and in values subgroups (subkeys).
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  // Base key derives from the value kind; +2 keeps it distinct from the
  // small constants (0/1) hashed in the alternate-opcode branch below.
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      // Let the caller-provided generator group simple loads (e.g. by
      // pointer distance).
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      // Non-simple (volatile/atomic/ordered) loads must not be grouped with
      // anything else - hash the instruction itself to keep them unique.
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      // Group extracts by source vector only when both the vector and the
      // index are meaningful (not undef).
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        // When alternation is allowed, all binops (key 1) resp. all casts
        // (key 0) share a key so that alternate-opcode sequences group.
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      // Canonicalize commutative compares so that a predicate and its
      // inverse hash identically.
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        // Trivially vectorizable intrinsics group by intrinsic ID.
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
      } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
        // Calls with a vector-function ABI mapping group by callee.
        SubKey = hash_combine(hash_value(I->getOpcode()),
                              hash_value(Call->getCalledFunction()));
      } else {
        // Other calls are not vectorizable together - keep them unique.
        Key = hash_combine(hash_value(Call), Key);
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
      }
      // Operand bundles must match for calls to land in the same subgroup.
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
                              hash_value(Op.Tag), SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      // GEPs with a single constant index group by their base pointer;
      // anything more complex stays unique.
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    // Instructions from different basic blocks never share a key.
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
6174
/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
/// Forward declaration; the definition appears later in this translation unit.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);
6181
/// Estimates whether building an alternate-opcode (shuffle) node for \p VL,
/// with main/alt opcodes taken from \p S, is likely cheaper than gathering
/// the scalars. Returns true if vectorization looks profitable.
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  // Collect operands column-wise: Operands[I] holds the I-th operand of
  // every scalar in VL.
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL)
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
  }
  if (Operands.size() == 2) {
    // Try find best operands candidates.
    // For each adjacent lane pair, consider swapping values between the two
    // operand columns if the lookahead heuristic finds a better match.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        // Keep lanes as they are.
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate number of instructions, required for the vectorized node and for
  // the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles, required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either perfect
  // diamond match or shuffled match.
  if (Operands.size() == 2) {
    // Do not count same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      // Second operand is a permutation of the first - one extra shuffle
      // recreates it.
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.MainOp->getParent());
  // Vectorize node, if:
  // 1. at least single operand is constant or splat.
  // 2. Operands have many loop invariants (the instructions are not loop
  // invariants).
  // 3. At least single unique operands is supposed to vectorized.
  // NOTE: the lambda below mutates UndefCnt/ExtraShuffleInsts/UniqueOpcodes/
  // NonInstCnt as a side effect; the trailing cost comparison relies on
  // those counters and on none_of's evaluation of the operands.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI).MainOp))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     // Constants, extracts, already-vectorized values and
                     // loop invariants are free/cheap - skip counting them.
                     if (isa<Constant, ExtractElementInst>(V) ||
                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found first duplicate - need to add shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   // Profitable only if some unique value has external uses
                   // that are not covered by the tree or this operand list.
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return getTreeEntry(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         // Do not vectorize node, if estimated number of vector instructions is
         // more than estimated number of buildvector instructions. Number of
         // vector operands is number of vector instructions + number of vector
         // instructions for operands (buildvectors). Number of buildvector
         // instructions is just number_of_operands * number_of_scalars.
         (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
}
6288
/// Decides how the bundle \p VL (with instruction state \p S) can be
/// represented as a tree entry: vectorized directly, vectorized as a
/// scatter/strided memory access, or gathered element-by-element.
/// For memory operations the sorted access order is returned in
/// \p CurrentOrder and the pointer operands in \p PointerOps.
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");

  // Alternate-opcode bundles are modeled as a ShuffleVector node.
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto *VL0 = cast<Instruction>(S.OpValue);
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL)
      for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
    if (!isPowerOf2_32(VL.size()))
      return TreeEntry::NeedToGather;
    // Vectorize when the extracts reuse the source vector directly or a
    // reordering was found by canReuseExtract.
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      return TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      return TreeEntry::StridedVectorize;
    case LoadsState::Gather:
      // Debug-only diagnosis of why the loads are gathered.
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL,
                      [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // All casts in the bundle must share the same (valid) source type.
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    // The swapped predicate is also accepted - operands can be commuted later.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      CmpInst *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // Plain binary/unary/select ops with matched opcodes vectorize directly.
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    // (For scatter-vectorize users, non-constant indexes are tolerated as
    // long as the index type is narrow enough.)
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        // Already in order - compare the first and last pointers directly.
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      // NOTE(review): Dist is dereferenced without checking for
      // std::nullopt; presumably a successful sortPtrAccesses guarantees a
      // computable distance here - confirm.
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    // Remember the scalar (must-be-uniform) arguments of the first call to
    // compare against the rest of the bundle.
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order for
      // them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    // If this is not an alternate sequence of opcode like add-sub
    // then do not vectorize this instruction.
    if (!S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    // Early profitability filter for alternate nodes (can be disabled via
    // the SLPSkipEarlyProfitabilityCheck option).
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
6607
namespace {
/// Allows to correctly handle operands of the phi nodes based on the \p Main
/// PHINode order of incoming basic blocks/values.
class PHIHandler {
  DominatorTree &DT;
  /// The "reference" phi; its incoming-block order defines the operand order.
  PHINode *Main = nullptr;
  /// All phis of the bundle, in bundle order.
  SmallVector<Value *> Phis;
  /// Operands[I][Idx] is the value phi \p Idx receives from Main's I-th
  /// incoming block; initialized to nullptr until buildOperands() runs.
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  /// Populates Operands so that operand vector I lists, for every phi in the
  /// bundle, the value incoming from Main's I-th incoming block.
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      // Fast path for few incoming blocks: look each value up directly.
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        if (!DT.isReachableFromEntry(InBB)) {
          // Values from unreachable blocks can be anything - use poison.
          Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = cast<PHINode>(V);
          // Prefer the positional match; otherwise search by block.
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    // Slow path: index Main's incoming blocks once, then walk each phi's
    // incoming list a single time instead of searching per block.
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      if (!DT.isReachableFromEntry(InBB)) {
        Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
        continue;
      }
      // A block may appear multiple times as an incoming block; remember all
      // of its positions.
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        if (InBB == Main->getIncomingBlock(I)) {
          // Positional match; keep poison entries from unreachable blocks.
          if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
            continue;
          Operands[I][Idx] = P->getIncomingValue(I);
          continue;
        }
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        // Record at the first position of this block; duplicated positions
        // are filled in below.
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    // Replicate operand vectors for blocks that occur more than once among
    // Main's incoming blocks.
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  /// Returns the bundle-wide operand vector for Main's I-th incoming block.
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
6686
buildTree_rec(ArrayRef<Value * > VL,unsigned Depth,const EdgeInfo & UserTreeIdx)6687 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6688 const EdgeInfo &UserTreeIdx) {
6689 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6690
6691 SmallVector<int> ReuseShuffleIndices;
6692 SmallVector<Value *> UniqueValues;
6693 SmallVector<Value *> NonUniqueValueVL;
6694 auto TryToFindDuplicates = [&](const InstructionsState &S,
6695 bool DoNotFail = false) {
6696 // Check that every instruction appears once in this bundle.
6697 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6698 for (Value *V : VL) {
6699 if (isConstant(V)) {
6700 ReuseShuffleIndices.emplace_back(
6701 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6702 UniqueValues.emplace_back(V);
6703 continue;
6704 }
6705 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6706 ReuseShuffleIndices.emplace_back(Res.first->second);
6707 if (Res.second)
6708 UniqueValues.emplace_back(V);
6709 }
6710 size_t NumUniqueScalarValues = UniqueValues.size();
6711 if (NumUniqueScalarValues == VL.size()) {
6712 ReuseShuffleIndices.clear();
6713 } else {
6714 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
6715 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6716 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6717 "for nodes with padding.\n");
6718 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6719 return false;
6720 }
6721 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6722 if (NumUniqueScalarValues <= 1 ||
6723 (UniquePositions.size() == 1 && all_of(UniqueValues,
6724 [](Value *V) {
6725 return isa<UndefValue>(V) ||
6726 !isConstant(V);
6727 })) ||
6728 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6729 if (DoNotFail && UniquePositions.size() > 1 &&
6730 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6731 all_of(UniqueValues, [=](Value *V) {
6732 return isa<ExtractElementInst>(V) ||
6733 areAllUsersVectorized(cast<Instruction>(V),
6734 UserIgnoreList);
6735 })) {
6736 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6737 if (PWSz == VL.size()) {
6738 ReuseShuffleIndices.clear();
6739 } else {
6740 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6741 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6742 UniqueValues.back());
6743 VL = NonUniqueValueVL;
6744 }
6745 return true;
6746 }
6747 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6748 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6749 return false;
6750 }
6751 VL = UniqueValues;
6752 }
6753 return true;
6754 };
6755
6756 InstructionsState S = getSameOpcode(VL, *TLI);
6757
6758 // Don't vectorize ephemeral values.
6759 if (!EphValues.empty()) {
6760 for (Value *V : VL) {
6761 if (EphValues.count(V)) {
6762 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6763 << ") is ephemeral.\n");
6764 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6765 return;
6766 }
6767 }
6768 }
6769
6770 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6771 // a load), in which case peek through to include it in the tree, without
6772 // ballooning over-budget.
6773 if (Depth >= RecursionMaxDepth &&
6774 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6775 VL.size() >= 4 &&
6776 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6777 return match(I,
6778 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
6779 cast<Instruction>(I)->getOpcode() ==
6780 cast<Instruction>(S.MainOp)->getOpcode();
6781 })))) {
6782 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6783 if (TryToFindDuplicates(S))
6784 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6785 ReuseShuffleIndices);
6786 return;
6787 }
6788
6789 // Don't handle scalable vectors
6790 if (S.getOpcode() == Instruction::ExtractElement &&
6791 isa<ScalableVectorType>(
6792 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6793 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6794 if (TryToFindDuplicates(S))
6795 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796 ReuseShuffleIndices);
6797 return;
6798 }
6799
6800 // Don't handle vectors.
6801 if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6802 !isa<InsertElementInst>(S.OpValue)) {
6803 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6804 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6805 return;
6806 }
6807
6808 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6809 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6810 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6811 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6812 return;
6813 }
6814
6815 // If all of the operands are identical or constant we have a simple solution.
6816 // If we deal with insert/extract instructions, they all must have constant
6817 // indices, otherwise we should gather them, not try to vectorize.
6818 // If alternate op node with 2 elements with gathered operands - do not
6819 // vectorize.
6820 auto &&NotProfitableForVectorization = [&S, this,
6821 Depth](ArrayRef<Value *> VL) {
6822 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6823 return false;
6824 if (VectorizableTree.size() < MinTreeSize)
6825 return false;
6826 if (Depth >= RecursionMaxDepth - 1)
6827 return true;
6828 // Check if all operands are extracts, part of vector node or can build a
6829 // regular vectorize node.
6830 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6831 for (Value *V : VL) {
6832 auto *I = cast<Instruction>(V);
6833 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6834 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6835 }));
6836 }
6837 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6838 if ((IsCommutative &&
6839 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6840 (!IsCommutative &&
6841 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6842 return true;
6843 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6844 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6845 auto *I1 = cast<Instruction>(VL.front());
6846 auto *I2 = cast<Instruction>(VL.back());
6847 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6848 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6849 I2->getOperand(Op));
6850 if (static_cast<unsigned>(count_if(
6851 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6852 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6853 })) >= S.MainOp->getNumOperands() / 2)
6854 return false;
6855 if (S.MainOp->getNumOperands() > 2)
6856 return true;
6857 if (IsCommutative) {
6858 // Check permuted operands.
6859 Candidates.clear();
6860 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6861 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6862 I2->getOperand((Op + 1) % E));
6863 if (any_of(
6864 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6865 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
6866 }))
6867 return false;
6868 }
6869 return true;
6870 };
6871 SmallVector<unsigned> SortedIndices;
6872 BasicBlock *BB = nullptr;
6873 bool IsScatterVectorizeUserTE =
6874 UserTreeIdx.UserTE &&
6875 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6876 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
6877 bool AreScatterAllGEPSameBlock =
6878 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6879 VL.size() > 2 &&
6880 all_of(VL,
6881 [&BB](Value *V) {
6882 auto *I = dyn_cast<GetElementPtrInst>(V);
6883 if (!I)
6884 return doesNotNeedToBeScheduled(V);
6885 if (!BB)
6886 BB = I->getParent();
6887 return BB == I->getParent() && I->getNumOperands() == 2;
6888 }) &&
6889 BB &&
6890 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6891 SortedIndices));
6892 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6893 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6894 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6895 S.OpValue) &&
6896 !all_of(VL, isVectorLikeInstWithConstOps)) ||
6897 NotProfitableForVectorization(VL)) {
6898 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6899 if (TryToFindDuplicates(S))
6900 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6901 ReuseShuffleIndices);
6902 return;
6903 }
6904
6905 // We now know that this is a vector of instructions of the same type from
6906 // the same block.
6907
6908 // Check if this is a duplicate of another entry.
6909 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6910 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6911 if (!E->isSame(VL)) {
6912 auto It = MultiNodeScalars.find(S.OpValue);
6913 if (It != MultiNodeScalars.end()) {
6914 auto *TEIt = find_if(It->getSecond(),
6915 [&](TreeEntry *ME) { return ME->isSame(VL); });
6916 if (TEIt != It->getSecond().end())
6917 E = *TEIt;
6918 else
6919 E = nullptr;
6920 } else {
6921 E = nullptr;
6922 }
6923 }
6924 if (!E) {
6925 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6926 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6927 if (TryToFindDuplicates(S))
6928 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6929 ReuseShuffleIndices);
6930 return;
6931 }
6932 } else {
6933 // Record the reuse of the tree node. FIXME, currently this is only used
6934 // to properly draw the graph rather than for the actual vectorization.
6935 E->UserTreeIndices.push_back(UserTreeIdx);
6936 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6937 << ".\n");
6938 return;
6939 }
6940 }
6941
6942 // Check that none of the instructions in the bundle are already in the tree.
6943 for (Value *V : VL) {
6944 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6945 doesNotNeedToBeScheduled(V))
6946 continue;
6947 if (getTreeEntry(V)) {
6948 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6949 << ") is already in tree.\n");
6950 if (TryToFindDuplicates(S))
6951 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6952 ReuseShuffleIndices);
6953 return;
6954 }
6955 }
6956
6957 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6958 if (UserIgnoreList && !UserIgnoreList->empty()) {
6959 for (Value *V : VL) {
6960 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6961 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6962 if (TryToFindDuplicates(S))
6963 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6964 ReuseShuffleIndices);
6965 return;
6966 }
6967 }
6968 }
6969
6970 // Special processing for sorted pointers for ScatterVectorize node with
6971 // constant indeces only.
6972 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
6973 assert(S.OpValue->getType()->isPointerTy() &&
6974 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6975 "Expected pointers only.");
6976 // Reset S to make it GetElementPtr kind of node.
6977 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6978 assert(It != VL.end() && "Expected at least one GEP.");
6979 S = getSameOpcode(*It, *TLI);
6980 }
6981
6982 // Check that all of the users of the scalars that we want to vectorize are
6983 // schedulable.
6984 auto *VL0 = cast<Instruction>(S.OpValue);
6985 BB = VL0->getParent();
6986
6987 if (!DT->isReachableFromEntry(BB)) {
6988 // Don't go into unreachable blocks. They may contain instructions with
6989 // dependency cycles which confuse the final scheduling.
6990 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6991 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6992 return;
6993 }
6994
6995 // Don't go into catchswitch blocks, which can happen with PHIs.
6996 // Such blocks can only have PHIs and the catchswitch. There is no
6997 // place to insert a shuffle if we need to, so just avoid that issue.
6998 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6999 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7000 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7001 return;
7002 }
7003
7004 // Check that every instruction appears once in this bundle.
7005 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7006 return;
7007
7008 // Perform specific checks for each particular instruction kind.
7009 OrdersType CurrentOrder;
7010 SmallVector<Value *> PointerOps;
7011 TreeEntry::EntryState State = getScalarsVectorizationState(
7012 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7013 if (State == TreeEntry::NeedToGather) {
7014 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7015 ReuseShuffleIndices);
7016 return;
7017 }
7018
7019 auto &BSRef = BlocksSchedules[BB];
7020 if (!BSRef)
7021 BSRef = std::make_unique<BlockScheduling>(BB);
7022
7023 BlockScheduling &BS = *BSRef;
7024
7025 std::optional<ScheduleData *> Bundle =
7026 BS.tryScheduleBundle(UniqueValues, this, S);
7027 #ifdef EXPENSIVE_CHECKS
7028 // Make sure we didn't break any internal invariants
7029 BS.verify();
7030 #endif
7031 if (!Bundle) {
7032 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7033 assert((!BS.getScheduleData(VL0) ||
7034 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7035 "tryScheduleBundle should cancelScheduling on failure");
7036 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7037 ReuseShuffleIndices);
7038 NonScheduledFirst.insert(VL.front());
7039 return;
7040 }
7041 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7042
7043 unsigned ShuffleOrOp = S.isAltShuffle() ?
7044 (unsigned) Instruction::ShuffleVector : S.getOpcode();
7045 switch (ShuffleOrOp) {
7046 case Instruction::PHI: {
7047 auto *PH = cast<PHINode>(VL0);
7048
7049 TreeEntry *TE =
7050 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7051 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7052
7053 // Keeps the reordered operands to avoid code duplication.
7054 PHIHandler Handler(*DT, PH, VL);
7055 Handler.buildOperands();
7056 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7057 TE->setOperand(I, Handler.getOperands(I));
7058 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7059 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
7060 return;
7061 }
7062 case Instruction::ExtractValue:
7063 case Instruction::ExtractElement: {
7064 if (CurrentOrder.empty()) {
7065 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7066 } else {
7067 LLVM_DEBUG({
7068 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7069 "with order";
7070 for (unsigned Idx : CurrentOrder)
7071 dbgs() << " " << Idx;
7072 dbgs() << "\n";
7073 });
7074 fixupOrderingIndices(CurrentOrder);
7075 }
7076 // Insert new order with initial value 0, if it does not exist,
7077 // otherwise return the iterator to the existing one.
7078 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7079 ReuseShuffleIndices, CurrentOrder);
7080 // This is a special case, as it does not gather, but at the same time
7081 // we are not extending buildTree_rec() towards the operands.
7082 ValueList Op0;
7083 Op0.assign(VL.size(), VL0->getOperand(0));
7084 VectorizableTree.back()->setOperand(0, Op0);
7085 return;
7086 }
7087 case Instruction::InsertElement: {
7088 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089
7090 auto OrdCompare = [](const std::pair<int, int> &P1,
7091 const std::pair<int, int> &P2) {
7092 return P1.first > P2.first;
7093 };
7094 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095 decltype(OrdCompare)>
7096 Indices(OrdCompare);
7097 for (int I = 0, E = VL.size(); I < E; ++I) {
7098 unsigned Idx = *getElementIndex(VL[I]);
7099 Indices.emplace(Idx, I);
7100 }
7101 OrdersType CurrentOrder(VL.size(), VL.size());
7102 bool IsIdentity = true;
7103 for (int I = 0, E = VL.size(); I < E; ++I) {
7104 CurrentOrder[Indices.top().second] = I;
7105 IsIdentity &= Indices.top().second == I;
7106 Indices.pop();
7107 }
7108 if (IsIdentity)
7109 CurrentOrder.clear();
7110 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111 std::nullopt, CurrentOrder);
7112 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113
7114 TE->setOperandsInOrder();
7115 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
7116 return;
7117 }
7118 case Instruction::Load: {
7119 // Check that a vectorized load would load the same memory as a scalar
7120 // load. For example, we don't want to vectorize loads that are smaller
7121 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7122 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7123 // from such a struct, we read/write packed bits disagreeing with the
7124 // unvectorized version.
7125 TreeEntry *TE = nullptr;
7126 fixupOrderingIndices(CurrentOrder);
7127 switch (State) {
7128 case TreeEntry::Vectorize:
7129 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130 ReuseShuffleIndices, CurrentOrder);
7131 if (CurrentOrder.empty())
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133 else
7134 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135 TE->setOperandsInOrder();
7136 break;
7137 case TreeEntry::StridedVectorize:
7138 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7139 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7140 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7141 TE->setOperandsInOrder();
7142 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143 break;
7144 case TreeEntry::ScatterVectorize:
7145 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7147 UserTreeIdx, ReuseShuffleIndices);
7148 TE->setOperandsInOrder();
7149 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7150 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7151 break;
7152 case TreeEntry::NeedToGather:
7153 llvm_unreachable("Unexpected loads state.");
7154 }
7155 return;
7156 }
7157 case Instruction::ZExt:
7158 case Instruction::SExt:
7159 case Instruction::FPToUI:
7160 case Instruction::FPToSI:
7161 case Instruction::FPExt:
7162 case Instruction::PtrToInt:
7163 case Instruction::IntToPtr:
7164 case Instruction::SIToFP:
7165 case Instruction::UIToFP:
7166 case Instruction::Trunc:
7167 case Instruction::FPTrunc:
7168 case Instruction::BitCast: {
7169 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7170 std::make_pair(std::numeric_limits<unsigned>::min(),
7171 std::numeric_limits<unsigned>::max()));
7172 if (ShuffleOrOp == Instruction::ZExt ||
7173 ShuffleOrOp == Instruction::SExt) {
7174 CastMaxMinBWSizes = std::make_pair(
7175 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7176 PrevMaxBW),
7177 std::min<unsigned>(
7178 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7179 PrevMinBW));
7180 } else if (ShuffleOrOp == Instruction::Trunc) {
7181 CastMaxMinBWSizes = std::make_pair(
7182 std::max<unsigned>(
7183 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7184 PrevMaxBW),
7185 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7186 PrevMinBW));
7187 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7188 } else if (ShuffleOrOp == Instruction::SIToFP ||
7189 ShuffleOrOp == Instruction::UIToFP) {
7190 unsigned NumSignBits =
7191 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7192 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7193 APInt Mask = DB->getDemandedBits(OpI);
7194 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7195 }
7196 if (NumSignBits * 2 >=
7197 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7198 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7199 }
7200 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7201 ReuseShuffleIndices);
7202 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7203
7204 TE->setOperandsInOrder();
7205 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7206 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7207 return;
7208 }
7209 case Instruction::ICmp:
7210 case Instruction::FCmp: {
7211 // Check that all of the compares have the same predicate.
7212 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7214 ReuseShuffleIndices);
7215 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7216
7217 ValueList Left, Right;
7218 if (cast<CmpInst>(VL0)->isCommutative()) {
7219 // Commutative predicate - collect + sort operands of the instructions
7220 // so that each side is more likely to have the same opcode.
7221 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7222 "Commutative Predicate mismatch");
7223 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7224 } else {
7225 // Collect operands - commute if it uses the swapped predicate.
7226 for (Value *V : VL) {
7227 auto *Cmp = cast<CmpInst>(V);
7228 Value *LHS = Cmp->getOperand(0);
7229 Value *RHS = Cmp->getOperand(1);
7230 if (Cmp->getPredicate() != P0)
7231 std::swap(LHS, RHS);
7232 Left.push_back(LHS);
7233 Right.push_back(RHS);
7234 }
7235 }
7236 TE->setOperand(0, Left);
7237 TE->setOperand(1, Right);
7238 buildTree_rec(Left, Depth + 1, {TE, 0});
7239 buildTree_rec(Right, Depth + 1, {TE, 1});
7240 if (ShuffleOrOp == Instruction::ICmp) {
7241 unsigned NumSignBits0 =
7242 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7243 if (NumSignBits0 * 2 >=
7244 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7245 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7246 unsigned NumSignBits1 =
7247 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7248 if (NumSignBits1 * 2 >=
7249 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7250 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7251 }
7252 return;
7253 }
7254 case Instruction::Select:
7255 case Instruction::FNeg:
7256 case Instruction::Add:
7257 case Instruction::FAdd:
7258 case Instruction::Sub:
7259 case Instruction::FSub:
7260 case Instruction::Mul:
7261 case Instruction::FMul:
7262 case Instruction::UDiv:
7263 case Instruction::SDiv:
7264 case Instruction::FDiv:
7265 case Instruction::URem:
7266 case Instruction::SRem:
7267 case Instruction::FRem:
7268 case Instruction::Shl:
7269 case Instruction::LShr:
7270 case Instruction::AShr:
7271 case Instruction::And:
7272 case Instruction::Or:
7273 case Instruction::Xor: {
7274 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275 ReuseShuffleIndices);
7276 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7277
7278 // Sort operands of the instructions so that each side is more likely to
7279 // have the same opcode.
7280 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7281 ValueList Left, Right;
7282 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7283 TE->setOperand(0, Left);
7284 TE->setOperand(1, Right);
7285 buildTree_rec(Left, Depth + 1, {TE, 0});
7286 buildTree_rec(Right, Depth + 1, {TE, 1});
7287 return;
7288 }
7289
7290 TE->setOperandsInOrder();
7291 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7292 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7293 return;
7294 }
7295 case Instruction::GetElementPtr: {
7296 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7297 ReuseShuffleIndices);
7298 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7299 SmallVector<ValueList, 2> Operands(2);
7300 // Prepare the operand vector for pointer operands.
7301 for (Value *V : VL) {
7302 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7303 if (!GEP) {
7304 Operands.front().push_back(V);
7305 continue;
7306 }
7307 Operands.front().push_back(GEP->getPointerOperand());
7308 }
7309 TE->setOperand(0, Operands.front());
7310 // Need to cast all indices to the same type before vectorization to
7311 // avoid crash.
7312 // Required to be able to find correct matches between different gather
7313 // nodes and reuse the vectorized values rather than trying to gather them
7314 // again.
7315 int IndexIdx = 1;
7316 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7317 Type *Ty = all_of(VL,
7318 [VL0Ty, IndexIdx](Value *V) {
7319 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7320 if (!GEP)
7321 return true;
7322 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7323 })
7324 ? VL0Ty
7325 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7326 ->getPointerOperandType()
7327 ->getScalarType());
7328 // Prepare the operand vector.
7329 for (Value *V : VL) {
7330 auto *I = dyn_cast<GetElementPtrInst>(V);
7331 if (!I) {
7332 Operands.back().push_back(
7333 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7334 continue;
7335 }
7336 auto *Op = I->getOperand(IndexIdx);
7337 auto *CI = dyn_cast<ConstantInt>(Op);
7338 if (!CI)
7339 Operands.back().push_back(Op);
7340 else
7341 Operands.back().push_back(ConstantFoldIntegerCast(
7342 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7343 }
7344 TE->setOperand(IndexIdx, Operands.back());
7345
7346 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7347 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7348 return;
7349 }
7350 case Instruction::Store: {
7351 bool Consecutive = CurrentOrder.empty();
7352 if (!Consecutive)
7353 fixupOrderingIndices(CurrentOrder);
7354 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7355 ReuseShuffleIndices, CurrentOrder);
7356 TE->setOperandsInOrder();
7357 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
7358 if (Consecutive)
7359 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7360 else
7361 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7362 return;
7363 }
7364 case Instruction::Call: {
7365 // Check if the calls are all to the same vectorizable intrinsic or
7366 // library function.
7367 CallInst *CI = cast<CallInst>(VL0);
7368 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369
7370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7371 ReuseShuffleIndices);
7372 // Sort operands of the instructions so that each side is more likely to
7373 // have the same opcode.
7374 if (isCommutative(VL0)) {
7375 ValueList Left, Right;
7376 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7377 TE->setOperand(0, Left);
7378 TE->setOperand(1, Right);
7379 SmallVector<ValueList> Operands;
7380 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7381 Operands.emplace_back();
7382 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7383 continue;
7384 for (Value *V : VL) {
7385 auto *CI2 = cast<CallInst>(V);
7386 Operands.back().push_back(CI2->getArgOperand(I));
7387 }
7388 TE->setOperand(I, Operands.back());
7389 }
7390 buildTree_rec(Left, Depth + 1, {TE, 0});
7391 buildTree_rec(Right, Depth + 1, {TE, 1});
7392 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7393 if (Operands[I - 2].empty())
7394 continue;
7395 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7396 }
7397 return;
7398 }
7399 TE->setOperandsInOrder();
7400 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7401 // For scalar operands no need to create an entry since no need to
7402 // vectorize it.
7403 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
7404 continue;
7405 ValueList Operands;
7406 // Prepare the operand vector.
7407 for (Value *V : VL) {
7408 auto *CI2 = cast<CallInst>(V);
7409 Operands.push_back(CI2->getArgOperand(I));
7410 }
7411 buildTree_rec(Operands, Depth + 1, {TE, I});
7412 }
7413 return;
7414 }
7415 case Instruction::ShuffleVector: {
7416 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417 ReuseShuffleIndices);
7418 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7419
7420 // Reorder operands if reordering would enable vectorization.
7421 auto *CI = dyn_cast<CmpInst>(VL0);
7422 if (isa<BinaryOperator>(VL0) || CI) {
7423 ValueList Left, Right;
7424 if (!CI || all_of(VL, [](Value *V) {
7425 return cast<CmpInst>(V)->isCommutative();
7426 })) {
7427 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7428 } else {
7429 auto *MainCI = cast<CmpInst>(S.MainOp);
7430 auto *AltCI = cast<CmpInst>(S.AltOp);
7431 CmpInst::Predicate MainP = MainCI->getPredicate();
7432 CmpInst::Predicate AltP = AltCI->getPredicate();
7433 assert(MainP != AltP &&
7434 "Expected different main/alternate predicates.");
7435 // Collect operands - commute if it uses the swapped predicate or
7436 // alternate operation.
7437 for (Value *V : VL) {
7438 auto *Cmp = cast<CmpInst>(V);
7439 Value *LHS = Cmp->getOperand(0);
7440 Value *RHS = Cmp->getOperand(1);
7441
7442 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7443 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7444 std::swap(LHS, RHS);
7445 } else {
7446 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7447 std::swap(LHS, RHS);
7448 }
7449 Left.push_back(LHS);
7450 Right.push_back(RHS);
7451 }
7452 }
7453 TE->setOperand(0, Left);
7454 TE->setOperand(1, Right);
7455 buildTree_rec(Left, Depth + 1, {TE, 0});
7456 buildTree_rec(Right, Depth + 1, {TE, 1});
7457 return;
7458 }
7459
7460 TE->setOperandsInOrder();
7461 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7462 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
7463 return;
7464 }
7465 default:
7466 break;
7467 }
7468 llvm_unreachable("Unexpected vectorization of the instructions.");
7469 }
7470
canMapToVector(Type * T) const7471 unsigned BoUpSLP::canMapToVector(Type *T) const {
7472 unsigned N = 1;
7473 Type *EltTy = T;
7474
7475 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7476 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7477 // Check that struct is homogeneous.
7478 for (const auto *Ty : ST->elements())
7479 if (Ty != *ST->element_begin())
7480 return 0;
7481 N *= ST->getNumElements();
7482 EltTy = *ST->element_begin();
7483 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7484 N *= AT->getNumElements();
7485 EltTy = AT->getElementType();
7486 } else {
7487 auto *VT = cast<FixedVectorType>(EltTy);
7488 N *= VT->getNumElements();
7489 EltTy = VT->getElementType();
7490 }
7491 }
7492
7493 if (!isValidElementType(EltTy))
7494 return 0;
7495 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
7496 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7497 VTSize != DL->getTypeStoreSizeInBits(T))
7498 return 0;
7499 return N;
7500 }
7501
/// Checks whether the extractelement/extractvalue instructions in \p VL all
/// read from one common source vector (or vector-mappable aggregate), so the
/// extracts can be replaced by reusing or shuffling that source.
/// \param CurrentOrder [out] filled with the lane permutation when the
/// extracts form a non-identity rearrangement; left empty for identity order
/// or on failure.
/// \param ResizeAllowed if true, the number of source elements may differ
/// from VL.size().
/// \returns true when the extracts appear in identity order (source can be
/// reused as-is), false otherwise; on hard failure CurrentOrder is cleared
/// and false is returned.
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // extractvalue: the source aggregate must be mappable to a flat vector.
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // Collect the extract indices; PoisonMaskElem marks lanes that are undef
  // values, undef indices, or indices beyond the source's element count.
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    // All extracts must read from the same source vector.
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  // The used indices must fit into a window of at most E elements.
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  // If the window already fits within [0, E), keep absolute indices.
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    // A duplicate target position means the extracts cannot be expressed as
    // a permutation of the source; bail out.
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
7586
areAllUsersVectorized(Instruction * I,const SmallDenseSet<Value * > * VectorizedVals) const7587 bool BoUpSLP::areAllUsersVectorized(
7588 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7590 all_of(I->users(), [this](User *U) {
7591 return ScalarToTreeEntry.contains(U) ||
7592 isVectorLikeInstWithConstOps(U) ||
7593 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7594 });
7595 }
7596
7597 static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst * CI,FixedVectorType * VecTy,TargetTransformInfo * TTI,TargetLibraryInfo * TLI,ArrayRef<Type * > ArgTys)7598 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600 ArrayRef<Type *> ArgTys) {
7601 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602
7603 // Calculate the cost of the scalar and vector calls.
7604 FastMathFlags FMF;
7605 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7606 FMF = FPCI->getFastMathFlags();
7607 SmallVector<const Value *> Arguments(CI->args());
7608 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609 dyn_cast<IntrinsicInst>(CI));
7610 auto IntrinsicCost =
7611 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
7612
7613 auto Shape = VFShape::get(CI->getFunctionType(),
7614 ElementCount::getFixed(VecTy->getNumElements()),
7615 false /*HasGlobalPred*/);
7616 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617 auto LibCost = IntrinsicCost;
7618 if (!CI->isNoBuiltin() && VecFunc) {
7619 // Calculate the cost of the vector library call.
7620 // If the corresponding vector call is cheaper, return its cost.
7621 LibCost =
7622 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7623 }
7624 return {IntrinsicCost, LibCost};
7625 }
7626
buildAltOpShuffleMask(const function_ref<bool (Instruction *)> IsAltOp,SmallVectorImpl<int> & Mask,SmallVectorImpl<Value * > * OpScalars,SmallVectorImpl<Value * > * AltScalars) const7627 void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7628 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7629 SmallVectorImpl<Value *> *OpScalars,
7630 SmallVectorImpl<Value *> *AltScalars) const {
7631 unsigned Sz = Scalars.size();
7632 Mask.assign(Sz, PoisonMaskElem);
7633 SmallVector<int> OrderMask;
7634 if (!ReorderIndices.empty())
7635 inversePermutation(ReorderIndices, OrderMask);
7636 for (unsigned I = 0; I < Sz; ++I) {
7637 unsigned Idx = I;
7638 if (!ReorderIndices.empty())
7639 Idx = OrderMask[I];
7640 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7641 if (IsAltOp(OpInst)) {
7642 Mask[I] = Sz + Idx;
7643 if (AltScalars)
7644 AltScalars->push_back(OpInst);
7645 } else {
7646 Mask[I] = Idx;
7647 if (OpScalars)
7648 OpScalars->push_back(OpInst);
7649 }
7650 }
7651 if (!ReuseShuffleIndices.empty()) {
7652 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7653 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7654 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7655 });
7656 Mask.swap(NewMask);
7657 }
7658 }
7659
isAlternateInstruction(const Instruction * I,const Instruction * MainOp,const Instruction * AltOp,const TargetLibraryInfo & TLI)7660 static bool isAlternateInstruction(const Instruction *I,
7661 const Instruction *MainOp,
7662 const Instruction *AltOp,
7663 const TargetLibraryInfo &TLI) {
7664 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7665 auto *AltCI = cast<CmpInst>(AltOp);
7666 CmpInst::Predicate MainP = MainCI->getPredicate();
7667 CmpInst::Predicate AltP = AltCI->getPredicate();
7668 assert(MainP != AltP && "Expected different main/alternate predicates.");
7669 auto *CI = cast<CmpInst>(I);
7670 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7671 return false;
7672 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7673 return true;
7674 CmpInst::Predicate P = CI->getPredicate();
7675 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
7676
7677 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7678 "CmpInst expected to match either main or alternate predicate or "
7679 "their swap.");
7680 (void)AltP;
7681 return MainP != P && MainP != SwappedP;
7682 }
7683 return I->getOpcode() == AltOp->getOpcode();
7684 }
7685
getOperandInfo(ArrayRef<Value * > Ops)7686 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7687 assert(!Ops.empty());
7688 const auto *Op0 = Ops.front();
7689
7690 const bool IsConstant = all_of(Ops, [](Value *V) {
7691 // TODO: We should allow undef elements here
7692 return isConstant(V) && !isa<UndefValue>(V);
7693 });
7694 const bool IsUniform = all_of(Ops, [=](Value *V) {
7695 // TODO: We should allow undef elements here
7696 return V == Op0;
7697 });
7698 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7699 // TODO: We should allow undef elements here
7700 if (auto *CI = dyn_cast<ConstantInt>(V))
7701 return CI->getValue().isPowerOf2();
7702 return false;
7703 });
7704 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7705 // TODO: We should allow undef elements here
7706 if (auto *CI = dyn_cast<ConstantInt>(V))
7707 return CI->getValue().isNegatedPowerOf2();
7708 return false;
7709 });
7710
7711 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7712 if (IsConstant && IsUniform)
7713 VK = TTI::OK_UniformConstantValue;
7714 else if (IsConstant)
7715 VK = TTI::OK_NonUniformConstantValue;
7716 else if (IsUniform)
7717 VK = TTI::OK_UniformValue;
7718
7719 TTI::OperandValueProperties VP = TTI::OP_None;
7720 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7721 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722
7723 return {VK, VP};
7724 }
7725
7726 namespace {
7727 /// The base class for shuffle instruction emission and shuffle cost estimation.
7728 class BaseShuffleAnalysis {
7729 protected:
7730 /// Checks if the mask is an identity mask.
7731 /// \param IsStrict if is true the function returns false if mask size does
7732 /// not match vector size.
isIdentityMask(ArrayRef<int> Mask,const FixedVectorType * VecTy,bool IsStrict)7733 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734 bool IsStrict) {
7735 int Limit = Mask.size();
7736 int VF = VecTy->getNumElements();
7737 int Index = -1;
7738 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7739 return true;
7740 if (!IsStrict) {
7741 // Consider extract subvector starting from index 0.
7742 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
7743 Index == 0)
7744 return true;
7745 // All VF-size submasks are identity (e.g.
7746 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7748 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7749 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7750 ShuffleVectorInst::isIdentityMask(Slice, VF);
7751 }))
7752 return true;
7753 }
7754 return false;
7755 }
7756
7757 /// Tries to combine 2 different masks into single one.
7758 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759 /// change the size of the vector, \p LocalVF is the original size of the
7760 /// shuffled vector.
combineMasks(unsigned LocalVF,SmallVectorImpl<int> & Mask,ArrayRef<int> ExtMask)7761 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762 ArrayRef<int> ExtMask) {
7763 unsigned VF = Mask.size();
7764 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766 if (ExtMask[I] == PoisonMaskElem)
7767 continue;
7768 int MaskedIdx = Mask[ExtMask[I] % VF];
7769 NewMask[I] =
7770 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771 }
7772 Mask.swap(NewMask);
7773 }
7774
  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. The function looks through the previously emitted shuffle
  /// instructions and properly mark indices in mask as undef.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized and
  /// the mask recalculated properly.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If the better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    // Best non-resizing identity/broadcast candidate seen during the walk and
    // the mask that corresponds to it.
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so we do not lose this
          // info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
      // expensive, the analysis founds out, that the source vector is just a
      // broadcast, this original mask can be transformed to identity mask <0,
      // 1, 2, 3>.
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // \endcode
      // may be transformed to
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      // \endcode
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // Translate the external mask through this shuffle's own mask to find
      // which elements of the shuffle's operands are actually used.
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef =
          isUndefVector(SV->getOperand(0),
                        buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
              .all();
      bool IsOp2Undef =
          isUndefVector(SV->getOperand(1),
                        buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
              .all();
      // If both operands are really used, we cannot look any further - just
      // mark the known-poison lanes in the mask and stop the walk here.
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      // Only one operand is used - fold this shuffle's mask into the running
      // mask and continue the walk with that operand.
      SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
                                   SV->getShuffleMask().end());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      // The walked-to operand is not a plain identity - fall back to the best
      // remembered candidate, if any.
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
7919
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  /// \param V1 First shuffle operand, must be non-null.
  /// \param V2 Second shuffle operand, may be null or (effectively) undef.
  /// \param Mask Shuffle mask to apply to \p V1 / \p V2.
  /// \param Builder Abstraction that emits the final shuffle, an identity
  /// (no-op) value, or a poison vector.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 &&
        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      // Split the common two-source mask into one single-source mask per
      // operand.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      // Iterate to a fixed point: each round may replace an operand by one of
      // its shuffle sources.
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            // Both are resizing shuffles that only really use their first
            // operand, and those operands have the same type - look through to
            // them and fold the resizing masks into the combined masks.
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
                                            SV1->getShuffleMask().end());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
                                            SV2->getShuffleMask().end());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      // Re-merge the per-operand masks back into a single two-source mask.
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // If both sources collapsed to the same value and the mask is a no-op
      // (identity, or re-applying the value's own zero-splat mask), skip the
      // shuffle entirely.
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    // Single-source case: look through shuffles of V1 only.
    SmallVector<int> NewMask(Mask.begin(), Mask.end());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
8038 };
8039 } // namespace
8040
8041 /// Returns the cost of the shuffle instructions with the given \p Kind, vector
8042 /// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
8043 /// subvector pattern.
8044 static InstructionCost
getShuffleCost(const TargetTransformInfo & TTI,TTI::ShuffleKind Kind,VectorType * Tp,ArrayRef<int> Mask=std::nullopt,TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput,int Index=0,VectorType * SubTp=nullptr,ArrayRef<const Value * > Args=std::nullopt)8045 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048 int Index = 0, VectorType *SubTp = nullptr,
8049 ArrayRef<const Value *> Args = std::nullopt) {
8050 if (Kind != TTI::SK_PermuteTwoSrc)
8051 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053 int NumSubElts;
8054 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055 Mask, NumSrcElts, NumSubElts, Index)) {
8056 if (Index + NumSubElts > NumSrcElts &&
8057 Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058 return TTI.getShuffleCost(
8059 TTI::SK_InsertSubvector,
8060 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
8061 TTI::TCK_RecipThroughput, Index, Tp);
8062 }
8063 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064 }
8065
8066 /// Calculate the scalar and the vector costs from vectorizing set of GEPs.
8067 static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo & TTI,ArrayRef<Value * > Ptrs,Value * BasePtr,unsigned Opcode,TTI::TargetCostKind CostKind,Type * ScalarTy,VectorType * VecTy)8068 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
8069 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070 Type *ScalarTy, VectorType *VecTy) {
8071 InstructionCost ScalarCost = 0;
8072 InstructionCost VecCost = 0;
8073 // Here we differentiate two cases: (1) when Ptrs represent a regular
8074 // vectorization tree node (as they are pointer arguments of scattered
8075 // loads) or (2) when Ptrs are the arguments of loads or stores being
8076 // vectorized as plane wide unit-stride load/store since all the
8077 // loads/stores are known to be from/to adjacent locations.
8078 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079 // Case 2: estimate costs for pointer related costs when vectorizing to
8080 // a wide load/store.
8081 // Scalar cost is estimated as a set of pointers with known relationship
8082 // between them.
8083 // For vector code we will use BasePtr as argument for the wide load/store
8084 // but we also need to account all the instructions which are going to
8085 // stay in vectorized code due to uses outside of these scalar
8086 // loads/stores.
8087 ScalarCost = TTI.getPointersChainCost(
8088 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
8089 CostKind);
8090
8091 SmallVector<const Value *> PtrsRetainedInVecCode;
8092 for (Value *V : Ptrs) {
8093 if (V == BasePtr) {
8094 PtrsRetainedInVecCode.push_back(V);
8095 continue;
8096 }
8097 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8098 // For simplicity assume Ptr to stay in vectorized code if it's not a
8099 // GEP instruction. We don't care since it's cost considered free.
8100 // TODO: We should check for any uses outside of vectorizable tree
8101 // rather than just single use.
8102 if (!Ptr || !Ptr->hasOneUse())
8103 PtrsRetainedInVecCode.push_back(V);
8104 }
8105
8106 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107 // If all pointers stay in vectorized code then we don't have
8108 // any savings on that.
8109 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
8110 }
8111 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
8112 TTI::PointersChainInfo::getKnownStride(),
8113 VecTy, CostKind);
8114 } else {
8115 // Case 1: Ptrs are the arguments of loads that we are going to transform
8116 // into masked gather load intrinsic.
8117 // All the scalar GEPs will be removed as a result of vectorization.
8118 // For any external uses of some lanes extract element instructions will
8119 // be generated (which cost is estimated separately).
8120 TTI::PointersChainInfo PtrsInfo =
8121 all_of(Ptrs,
8122 [](const Value *V) {
8123 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
8124 return Ptr && !Ptr->hasAllConstantIndices();
8125 })
8126 ? TTI::PointersChainInfo::getUnknownStride()
8127 : TTI::PointersChainInfo::getKnownStride();
8128
8129 ScalarCost =
8130 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
8131 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
8132 if (!BaseGEP) {
8133 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
8134 if (It != Ptrs.end())
8135 BaseGEP = cast<GEPOperator>(*It);
8136 }
8137 if (BaseGEP) {
8138 SmallVector<const Value *> Indices(BaseGEP->indices());
8139 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
8140 BaseGEP->getPointerOperand(), Indices, VecTy,
8141 CostKind);
8142 }
8143 }
8144
8145 return std::make_pair(ScalarCost, VecCost);
8146 }
8147
// Post-processing of the built vectorizable tree: converts consecutive
// load/store + reverse-order nodes into strided (stride -1) memory nodes when
// the target reports that as cheaper.
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent reverse + consecutive store as
      // strided store with stride -1.
      if (isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
}
8215
8216 /// Merges shuffle masks and emits final shuffle instruction, if required. It
8217 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
8218 /// when the actual shuffle instruction is generated only if this is actually
8219 /// required. Otherwise, the shuffle instruction emission is delayed till the
8220 /// end of the process, to reduce the number of emitted instructions and further
8221 /// analysis/transformations.
8222 class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223 bool IsFinalized = false;
8224 SmallVector<int> CommonMask;
8225 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226 Type *ScalarTy = nullptr;
8227 const TargetTransformInfo &TTI;
8228 InstructionCost Cost = 0;
8229 SmallDenseSet<Value *> VectorizedVals;
8230 BoUpSLP &R;
8231 SmallPtrSetImpl<Value *> &CheckedExtracts;
8232 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233 /// While set, still trying to estimate the cost for the same nodes and we
8234 /// can delay actual cost estimation (virtual shuffle instruction emission).
8235 /// May help better estimate the cost if same nodes must be permuted + allows
8236 /// to move most of the long shuffles cost estimation to TTI.
8237 bool SameNodesEstimated = true;
8238
getAllOnesValue(const DataLayout & DL,Type * Ty)8239 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240 if (Ty->getScalarType()->isPointerTy()) {
8241 Constant *Res = ConstantExpr::getIntToPtr(
8242 ConstantInt::getAllOnesValue(
8243 IntegerType::get(Ty->getContext(),
8244 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8245 Ty->getScalarType());
8246 if (auto *VTy = dyn_cast<VectorType>(Ty))
8247 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8248 return Res;
8249 }
8250 return Constant::getAllOnesValue(Ty);
8251 }
8252
  /// Estimates the cost of building a vector out of the scalars in \p VL
  /// (gathering), recognizing cheaper sub-patterns: groups of loads that can
  /// be vectorized/strided/gathered, and broadcasts of a single scalar.
  /// \param Root Non-null when the gather has a known root/insertion context;
  /// affects constant and splat handling.
  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL.begin(), VL.end());
    // Improve gather cost for gather of loads, if we can group some of the
    // loads into vector loads.
    InstructionsState S = getSameOpcode(VL, *R.TLI);
    const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = R.getMinVF(2 * Sz);
    if (VL.size() > 2 &&
        ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
         (InVectors.empty() &&
          any_of(seq<unsigned>(0, VL.size() / MinVF),
                 [&](unsigned Idx) {
                   ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
                   InstructionsState S = getSameOpcode(SubVL, *R.TLI);
                   return S.getOpcode() == Instruction::Load &&
                          !S.isAltShuffle();
                 }))) &&
        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
        !isSplat(Gathers)) {
      // Baseline: cost of gathering everything scalar-by-scalar.
      InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
      SetVector<Value *> VectorizedLoads;
      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
      SmallVector<unsigned> ScatterVectorized;
      unsigned StartIdx = 0;
      unsigned VF = VL.size() / 2;
      // Try progressively narrower slices (halving VF) until a vectorizable
      // group of loads is found or the minimum profitable width is reached.
      for (; VF >= MinVF; VF /= 2) {
        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
             Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
            InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
            if (SliceS.getOpcode() != Instruction::Load ||
                SliceS.isAltShuffle())
              continue;
          }
          if (!VectorizedLoads.count(Slice.front()) &&
              !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
            SmallVector<Value *> PointerOps;
            OrdersType CurrentOrder;
            LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
                                                CurrentOrder, PointerOps);
            switch (LS) {
            case LoadsState::Vectorize:
            case LoadsState::ScatterVectorize:
            case LoadsState::StridedVectorize:
              // Mark the vectorized loads so that we don't vectorize them
              // again.
              // TODO: better handling of loads with reorders.
              if (((LS == LoadsState::Vectorize ||
                    LS == LoadsState::StridedVectorize) &&
                   CurrentOrder.empty()) ||
                  (LS == LoadsState::StridedVectorize &&
                   isReverseOrder(CurrentOrder)))
                VectorizedStarts.emplace_back(Cnt, LS);
              else
                ScatterVectorized.push_back(Cnt);
              VectorizedLoads.insert(Slice.begin(), Slice.end());
              // If we vectorized initial block, no need to try to vectorize
              // it again.
              if (Cnt == StartIdx)
                StartIdx += VF;
              break;
            case LoadsState::Gather:
              break;
            }
          }
        }
        // Check if the whole array was vectorized already - exit.
        if (StartIdx >= VL.size())
          break;
        // Found vectorizable parts - exit.
        if (!VectorizedLoads.empty())
          break;
      }
      if (!VectorizedLoads.empty()) {
        unsigned NumParts = TTI.getNumberOfParts(VecTy);
        bool NeedInsertSubvectorAnalysis =
            !NumParts || (VL.size() / VF) > NumParts;
        // Get the cost for gathered loads.
        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
          if (VectorizedLoads.contains(VL[I]))
            continue;
          // Recurse for the slices that remained scalar.
          GatherCost +=
              getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
        }
        // Exclude potentially vectorized loads from list of gathered
        // scalars.
        Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
        // The cost for vectorized loads.
        InstructionCost ScalarsCost = 0;
        for (Value *V : VectorizedLoads) {
          auto *LI = cast<LoadInst>(V);
          ScalarsCost +=
              TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
                                  LI->getAlign(), LI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), LI);
        }
        auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
        // Wide (consecutive or strided) vector loads.
        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
          auto *LI = cast<LoadInst>(VL[P.first]);
          Align Alignment = LI->getAlign();
          GatherCost +=
              P.second == LoadsState::Vectorize
                  ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
                                        LI->getPointerAddressSpace(), CostKind,
                                        TTI::OperandValueInfo(), LI)
                  : TTI.getStridedMemoryOpCost(
                        Instruction::Load, LoadTy, LI->getPointerOperand(),
                        /*VariableMask=*/false, Alignment, CostKind, LI);
          // Estimate GEP cost.
          SmallVector<Value *> PointerOps(VF);
          for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          auto [ScalarGEPCost, VectorGEPCost] =
              getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
                          Instruction::Load, CostKind, LI->getType(), LoadTy);
          GatherCost += VectorGEPCost - ScalarGEPCost;
        }
        // Masked-gather (scatter) vectorized loads.
        for (unsigned P : ScatterVectorized) {
          auto *LI0 = cast<LoadInst>(VL[P]);
          ArrayRef<Value *> Slice = VL.slice(P, VF);
          Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
          GatherCost += TTI.getGatherScatterOpCost(
              Instruction::Load, LoadTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
          // Estimate GEP cost.
          SmallVector<Value *> PointerOps(VF);
          for (auto [I, V] : enumerate(Slice))
            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          OrdersType Order;
          if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
                              Order)) {
            // TODO: improve checks if GEPs can be vectorized.
            Value *Ptr0 = PointerOps.front();
            Type *ScalarTy = Ptr0->getType();
            auto *VecTy = getWidenedType(ScalarTy, VF);
            auto [ScalarGEPCost, VectorGEPCost] =
                getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
                            CostKind, ScalarTy, VecTy);
            GatherCost += VectorGEPCost - ScalarGEPCost;
            if (!Order.empty()) {
              // Pointers need reordering - account for the extra shuffle.
              SmallVector<int> Mask;
              inversePermutation(Order, Mask);
              GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                             VecTy, Mask, CostKind);
            }
          } else {
            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
                                          PointerOps.front()->getType());
          }
        }
        if (NeedInsertSubvectorAnalysis) {
          // Add the cost for the subvectors insert.
          SmallVector<int> ShuffleMask(VL.size());
          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
            for (unsigned Idx : seq<unsigned>(0, E))
              ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
            GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
                                             ShuffleMask, CostKind, I, LoadTy);
          }
        }
        // The vectorized loads replace their scalar counterparts.
        GatherCost -= ScalarsCost;
      }
      // Never report more than the plain per-scalar gather cost.
      GatherCost = std::min(BaseCost, GatherCost);
    } else if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle)
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);

      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                                 PoisonValue::get(VecTy), *It);
      return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
                                             VecTy, ShuffleMask, CostKind,
                                             /*Index=*/0, /*SubTp=*/nullptr,
                                             /*Args=*/*It);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };
8453
  /// Compute the cost of creating a vector containing the extracted values from
  /// \p VL.
  /// \param Mask Extraction mask over the source vectors.
  /// \param ShuffleKinds Per-part shuffle kind (or nullopt if the part needs
  /// no shuffle), as precomputed by the caller.
  /// \param NumParts Number of target registers the result is split into.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // Number of elements of the widest source vector the extracts read from;
    // non-extract values contribute nothing.
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    // Checks whether a sub-mask can be performed as a shuffle of at most 2
    // hardware registers; on success rewrites \p Mask into per-register
    // coordinates and records the register offsets in \p Indices.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // Offset (aligned down to the register width) of the first register
      // touched by the mask.
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        // More than 2 distinct registers - cannot do a per-register shuffle.
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            // Find the offset of the second register: minimum mask element
            // that does not belong to the first register.
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        // Rebase the element into the 2-register shuffle coordinate space:
        // [0, EltsPerVector) for the first register, [EltsPerVector, 2 *
        // EltsPerVector) for the second one.
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source vector
    // operand can be re-used directly. If not, add the cost of creating a
    // shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        // No per-register shuffle is possible - pay for a full-width shuffle,
        // unless the slice is a no-op identity over a single source.
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, *RegShuffleKind,
                             getWidenedType(ScalarTy, EltsPerVector), SubMask);
      }
      // Pay for extracting each used source register as a subvector.
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(
            TTI, TTI::SK_ExtractSubvector,
            getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)),
            std::nullopt, CostKind, Idx,
            getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
8578 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8579 /// shuffle emission.
transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,ArrayRef<int> Mask)8580 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8581 ArrayRef<int> Mask) {
8582 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8583 if (Mask[Idx] != PoisonMaskElem)
8584 CommonMask[Idx] = Idx;
8585 }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
  /// mask \p Mask, register number \p Part, that includes \p SliceSize
  /// elements.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before, no
      // need to estimate another cost with the sub-Mask, instead include this
      // sub-Mask into the CommonMask to estimate it later and avoid double cost
      // estimation.
      if ((InVectors.size() == 2 &&
           InVectors.front().get<const TreeEntry *>() == &E1 &&
           InVectors.back().get<const TreeEntry *>() == E2) ||
          (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        // Copy this register's sub-mask into the (currently all-poison) slot
        // of CommonMask; the combined cost is estimated later in one go.
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // and transform mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      // Combine the already-accumulated single input with the new entry E1.
      // The combined vector factor is the max of both operands' widths.
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = InVectors.front().get<const TreeEntry *>();
        VF = std::max(VF, E->getVectorFactor());
      }
      // Lanes coming from E1 are offset by VF in the two-source mask.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      // Estimate the two-entry shuffle directly with the given mask.
      Cost += createShuffle(&E1, E2, Mask);
      transformMaskAfterShuffle(CommonMask, Mask);
    }
  }
8637
8638 class ShuffleCostBuilder {
8639 const TargetTransformInfo &TTI;
8640
isEmptyOrIdentity(ArrayRef<int> Mask,unsigned VF)8641 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8642 int Index = -1;
8643 return Mask.empty() ||
8644 (VF == Mask.size() &&
8645 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8646 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8647 Index == 0);
8648 }
8649
8650 public:
ShuffleCostBuilder(const TargetTransformInfo & TTI)8651 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8652 ~ShuffleCostBuilder() = default;
createShuffleVector(Value * V1,Value *,ArrayRef<int> Mask) const8653 InstructionCost createShuffleVector(Value *V1, Value *,
8654 ArrayRef<int> Mask) const {
8655 // Empty mask or identity mask are free.
8656 unsigned VF =
8657 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8658 if (isEmptyOrIdentity(Mask, VF))
8659 return TTI::TCC_Free;
8660 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8661 cast<VectorType>(V1->getType()), Mask);
8662 }
createShuffleVector(Value * V1,ArrayRef<int> Mask) const8663 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8664 // Empty mask or identity mask are free.
8665 unsigned VF =
8666 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8667 if (isEmptyOrIdentity(Mask, VF))
8668 return TTI::TCC_Free;
8669 return TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
8670 cast<VectorType>(V1->getType()), Mask);
8671 }
createIdentity(Value *) const8672 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
createPoison(Type * Ty,unsigned VF) const8673 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8674 return TTI::TCC_Free;
8675 }
resizeToMatch(Value * &,Value * &) const8676 void resizeToMatch(Value *&, Value *&) const {}
8677 };
8678
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    // Extra cast cost when a tree entry was narrowed (MinBWs) and its scalar
    // type differs from the target ScalarTy: the cast (trunc or s/zext,
    // depending on sizes and signedness) must be paid for.
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    // Same idea as above, but for an already materialized vector value whose
    // element type may differ from ScalarTy.
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // Fold the nodes' own reuse/reorder masks into the common mask so the
        // shuffle is estimated against the compact scalar vectors.
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      // Placeholders (null/all-ones constants) stand in for the not-yet-built
      // vectors; only the type/width matters for cost estimation.
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(CommonMask,
                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // Remap second-source lanes through the node's own common mask.
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle vector and tree node.
      unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      const TreeEntry *E1 = P1.get<const TreeEntry *>();
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        // Remap both sources through the first node's common mask (the entry
        // is the first source here, so first-source lanes map too).
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      CommonVF =
          std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    // Record a placeholder for the shuffled result and collapse to a single
    // pending input.
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }
8890
public:
  /// \p ScalarTy is the element type the shuffles are estimated against;
  /// \p VectorizedVals are scalars already considered vectorized;
  /// \p CheckedExtracts caches extractelements whose cost was adjusted.
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : ScalarTy(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  /// Adjusts the cost model for a gather built from extractelement
  /// instructions: takes cost credit for extracts that will become dead and
  /// adds the cost of representing the gather as a shuffle of the source
  /// vector(s). \returns the common vector base (or a widened placeholder if
  /// several distinct bases are used across parts), or nullptr if no
  /// adjustment applies.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    ArrayRef<Value *> VL = E->Scalars;
    // If the resulting type is scalarized, do not adjust the cost.
    if (NumParts == VL.size())
      return nullptr;
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    // Walk each per-register slice of the scalars and its sub-mask.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
        // Also, avoid adjusting the cost for extractelements with multiple uses
        // in different graph entries.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -=
                TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
                                             EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Found the bunch of extractelement instructions that must be gathered
    // into a vector and can be represented as a permutation elements in a
    // single input vector or of 2 input vectors.
    // Done for reused if same extractelements were vectorized already.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      // Several registers with several distinct source vectors - cannot
      // reuse a single base; hand back a widened placeholder instead.
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  /// \returns std::nullopt always: cost estimation is performed eagerly, so
  /// no entry is ever delayed here (unlike the IR-emission counterpart).
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
add(const TreeEntry & E1,const TreeEntry & E2,ArrayRef<int> Mask)9006 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9007 if (&E1 == &E2) {
9008 assert(all_of(Mask,
9009 [&](int Idx) {
9010 return Idx < static_cast<int>(E1.getVectorFactor());
9011 }) &&
9012 "Expected single vector shuffle mask.");
9013 add(E1, Mask);
9014 return;
9015 }
9016 if (InVectors.empty()) {
9017 CommonMask.assign(Mask.begin(), Mask.end());
9018 InVectors.assign({&E1, &E2});
9019 return;
9020 }
9021 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9022 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9023 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9024 if (NumParts == 0 || NumParts >= Mask.size())
9025 NumParts = 1;
9026 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9027 const auto *It =
9028 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9029 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9030 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9031 }
add(const TreeEntry & E1,ArrayRef<int> Mask)9032 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9033 if (InVectors.empty()) {
9034 CommonMask.assign(Mask.begin(), Mask.end());
9035 InVectors.assign(1, &E1);
9036 return;
9037 }
9038 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9039 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9040 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9041 if (NumParts == 0 || NumParts >= Mask.size())
9042 NumParts = 1;
9043 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9044 const auto *It =
9045 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9046 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9047 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
9048 if (!SameNodesEstimated && InVectors.size() == 1)
9049 InVectors.emplace_back(&E1);
9050 }
9051 /// Adds 2 input vectors and the mask for their shuffling.
add(Value * V1,Value * V2,ArrayRef<int> Mask)9052 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
9053 // May come only for shuffling of 2 vectors with extractelements, already
9054 // handled in adjustExtracts.
9055 assert(InVectors.size() == 1 &&
9056 all_of(enumerate(CommonMask),
9057 [&](auto P) {
9058 if (P.value() == PoisonMaskElem)
9059 return Mask[P.index()] == PoisonMaskElem;
9060 auto *EI =
9061 cast<ExtractElementInst>(InVectors.front()
9062 .get<const TreeEntry *>()
9063 ->Scalars[P.index()]);
9064 return EI->getVectorOperand() == V1 ||
9065 EI->getVectorOperand() == V2;
9066 }) &&
9067 "Expected extractelement vectors.");
9068 }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      // First input - just remember the vector and the mask.
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 &&
             InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = InVectors.front()
                                          .get<const TreeEntry *>()
                                          ->Scalars[P.index()];
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    if (InVectors.size() == 2) {
      // Two inputs already pending - fold them into one shuffle first so V1
      // can become the second source.
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
                  ->getNumElements());
    }
    InVectors.push_back(V1);
    // V1's lanes come after the first source, so offset the incoming mask
    // indices by the combined vector factor.
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
gather(ArrayRef<Value * > VL,unsigned MaskVF=0,Value * Root=nullptr)9118 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9119 Value *Root = nullptr) {
9120 Cost += getBuildVectorCost(VL, Root);
9121 if (!Root) {
9122 // FIXME: Need to find a way to avoid use of getNullValue here.
9123 SmallVector<Constant *> Vals;
9124 unsigned VF = VL.size();
9125 if (MaskVF != 0)
9126 VF = std::min(VF, MaskVF);
9127 for (Value *V : VL.take_front(VF)) {
9128 if (isa<UndefValue>(V)) {
9129 Vals.push_back(cast<Constant>(V));
9130 continue;
9131 }
9132 Vals.push_back(Constant::getNullValue(V->getType()));
9133 }
9134 return ConstantVector::get(Vals);
9135 }
9136 return ConstantVector::getSplat(
9137 ElementCount::getFixed(
9138 cast<FixedVectorType>(Root->getType())->getNumElements()),
9139 getAllOnesValue(*R.DL, ScalarTy));
9140 }
  /// Freeze instructions are modeled as free for cost purposes.
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  /// \param ExtMask External mask applied on top of the accumulated
  ///        CommonMask.
  /// \param VF Expected vector length of the final value; required (must be
  ///        > 0) only when \p Action is provided.
  /// \param Action Optional callback that post-processes the shuffled value
  ///        before the final shuffle (e.g. resizing/subvector insertion).
  /// \returns the total accumulated shuffle cost.
  InstructionCost
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Materialize the cost of everything accumulated so far, then let the
      // action adjust the (placeholder) value and mask.
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      // After that shuffle, each used lane selects its own position.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = Vec.get<Value *>();
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    // Pay for the final shuffle combining the remaining input(s).
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }
9172
  // The estimator must not be dropped with pending (unpaid-for) shuffles.
  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
9177 };
9178
getOperandEntry(const TreeEntry * E,unsigned Idx) const9179 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
9180 unsigned Idx) const {
9181 Value *Op = E->getOperand(Idx).front();
9182 if (const TreeEntry *TE = getTreeEntry(Op)) {
9183 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9184 return EI.EdgeIdx == Idx && EI.UserTE == E;
9185 }) != TE->UserTreeIndices.end())
9186 return TE;
9187 auto MIt = MultiNodeScalars.find(Op);
9188 if (MIt != MultiNodeScalars.end()) {
9189 for (const TreeEntry *TE : MIt->second) {
9190 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9191 return EI.EdgeIdx == Idx && EI.UserTE == E;
9192 }) != TE->UserTreeIndices.end())
9193 return TE;
9194 }
9195 }
9196 }
9197 const auto *It =
9198 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9199 return TE->isGather() &&
9200 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9201 return EI.EdgeIdx == Idx && EI.UserTE == E;
9202 }) != TE->UserTreeIndices.end();
9203 });
9204 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9205 return It->get();
9206 }
9207
getCastContextHint(const TreeEntry & TE) const9208 TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9209 if (TE.State == TreeEntry::ScatterVectorize ||
9210 TE.State == TreeEntry::StridedVectorize)
9211 return TTI::CastContextHint::GatherScatter;
9212 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9213 !TE.isAltShuffle()) {
9214 if (TE.ReorderIndices.empty())
9215 return TTI::CastContextHint::Normal;
9216 SmallVector<int> Mask;
9217 inversePermutation(TE.ReorderIndices, Mask);
9218 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9219 return TTI::CastContextHint::Reversed;
9220 }
9221 return TTI::CastContextHint::None;
9222 }
9223
9224 /// Builds the arguments types vector for the given call instruction with the
9225 /// given \p ID for the specified vector factor.
buildIntrinsicArgTypes(const CallInst * CI,const Intrinsic::ID ID,const unsigned VF,unsigned MinBW)9226 static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9227 const Intrinsic::ID ID,
9228 const unsigned VF,
9229 unsigned MinBW) {
9230 SmallVector<Type *> ArgTys;
9231 for (auto [Idx, Arg] : enumerate(CI->args())) {
9232 if (ID != Intrinsic::not_intrinsic) {
9233 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9234 ArgTys.push_back(Arg->getType());
9235 continue;
9236 }
9237 if (MinBW > 0) {
9238 ArgTys.push_back(
9239 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
9240 continue;
9241 }
9242 }
9243 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
9244 }
9245 return ArgTys;
9246 }
9247
/// Computes the vectorization cost (vector cost minus scalar cost) of the
/// tree entry \p E. A negative result means vectorizing this node is
/// profitable. \p VectorizedVals and \p CheckedExtracts are threaded through
/// to the gather/buildvector cost estimator.
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  // Determine the scalar element type being vectorized. For stores, compares
  // and insertelements the interesting type is the operand's type, not the
  // instruction's own type.
  Type *ScalarTy = VL[0]->getType();
  if (!E->isGather()) {
    if (auto *SI = dyn_cast<StoreInst>(VL[0]))
      ScalarTy = SI->getValueOperand()->getType();
    else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
      ScalarTy = CI->getOperand(0)->getType();
    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
      ScalarTy = IE->getOperand(1)->getType();
  }
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end())
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
  if (E->isGather()) {
    // All-constant gathers are free; gathers of insertelements are not
    // supported. Everything else is costed by the shuffle cost estimator.
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  // Build the combined reorder + reuse shuffle mask for this node. Strided
  // loads/stores in reverse order handle the reversal themselves, so no
  // extra shuffle is needed in that case.
  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  if (!E->ReorderIndices.empty() &&
      (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (NeedToShuffleReuses)
    ::addMask(Mask, E->ReuseShuffleIndices);
  // CommonCost is the price of the (non-identity) reorder/reuse shuffle that
  // the vectorized form of this node must pay.
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  // Scalars that belong to a different tree entry must not have their scalar
  // cost counted against this node.
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  // Pick the cast-context hint for \p V: prefer the hint of its own tree
  // entry; otherwise treat a uniform-load operand as a gather/scatter source.
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  // Generic scalar-vs-vector cost difference: sums ScalarEltCost over the
  // scalars owned by this node and subtracts it from VectorCost(CommonCost),
  // adding any extend/trunc needed to match a min-bitwidth-resized user.
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some of the instructions no need to calculate cost for each
          // particular instruction, we can use the cost of the single
          // instruction x total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is not
        // resized.
        if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
          const EdgeInfo &EI = E->UserTreeIndices.front();
          if ((EI.UserTE->getOpcode() != Instruction::Select ||
               EI.EdgeIdx != 0) &&
              It != MinBWs.end()) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy =
                  getWidenedType(UserScalarTy, E->getVectorFactor());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate cost difference from vectorizing set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));

    return VecCost - ScalarCost;
  };

  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        Value *Op = PHI->getIncomingValue(I);
        Operands[I] = Op;
      }
      // An operand entry with reuse shuffle indices represents extra reuses;
      // credit them to the scalar side (once per operand entry).
      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
        if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
          if (!OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        // Model extractvalue from an aggregate as an extract from a vector
        // with the same number of elements.
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
                                     CostKind, *getExtractIndex(I));
    };
    // The extracts themselves vanish after vectorization; only the common
    // shuffle cost remains on the vector side.
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

    // Compute the range [OffsetBeg, OffsetEnd] of destination lanes touched
    // by the inserts, and the insert mask mapping dest lane -> scalar index.
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    // VecScalarsSz approximates the number of elements per register part.
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    // TODO: Add support for Instruction::InsertValue.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // The scalar inserts disappear, so their scalarization overhead is a
    // credit (negative cost).
    InstructionCost Cost = 0;
    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                          /*Insert*/ true, /*Extract*/ false,
                                          CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
    // Do not calculate the cost if the actual size is the register size and
    // we can merge this shuffle with the following SK_Select.
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                  InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
    // subvector of ActualVecTy.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
                                    std::nullopt, CostKind, OffsetBeg - Offset,
                                    InsertVecTy);
      } else {
        // Build a two-source permute mask blending the original vector with
        // the newly built subvector.
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    // With min-bitwidth demotion on either side, the vector cast opcode may
    // differ from the scalar one (bitcast/trunc/sext/zext as appropriate).
    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      // A demoted unsigned source turns sitofp into uitofp.
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VI), CostKind, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // Try to derive a common (possibly swapped) predicate from the main op;
    // fall back to the BAD_*_PREDICATE sentinel if none.
    CmpInst::Predicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                           ? CmpInst::BAD_FCMP_PREDICATE
                                           : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      // If this scalar's predicate disagrees with the common one, demote the
      // common predicate to the BAD sentinel for the vector cost query too.
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;

      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, VI);
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
      if (MinMaxID != Intrinsic::not_intrinsic) {
        // Pointer min/max is costed on an equally-sized integer type.
        Type *CanonicalType = OrigScalarTy;
        if (CanonicalType->isPtrOrPtrVectorTy())
          CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
              CanonicalType->getContext(),
              DL->getTypeSizeInBits(CanonicalType->getScalarType())));

        IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                          {CanonicalType, CanonicalType});
        InstructionCost IntrinsicCost =
            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
        // If the selects are the only uses of the compares, they will be
        // dead and we can adjust the cost by removing their cost.
        if (SelectOnly) {
          auto *CI = cast<CmpInst>(VI->getOperand(0));
          IntrinsicCost -= TTI->getCmpSelInstrCost(
              CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
              CI->getPredicate(), CostKind, CI);
        }
        ScalarCost = std::min(ScalarCost, IntrinsicCost);
      }

      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());

      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
      // Check if it is possible and profitable to use min/max for selects
      // in VL.
      //
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
      if (MinMaxID != Intrinsic::not_intrinsic) {
        Type *CanonicalType = VecTy;
        if (CanonicalType->isPtrOrPtrVectorTy())
          CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
              CanonicalType->getContext(),
              DL->getTypeSizeInBits(CanonicalType->getScalarType())));
        IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                          {CanonicalType, CanonicalType});
        InstructionCost IntrinsicCost =
            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
        // If the selects are the only uses of the compares, they will be
        // dead and we can adjust the cost by removing their cost.
        if (SelectOnly) {
          auto *CI =
              cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
          IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
                                                   MaskTy, VecPred, CostKind);
        }
        VecCost = std::min(VecCost, IntrinsicCost);
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      SmallVector<const Value *> Operands(VI->operand_values());
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, Operands, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // An 'and' with a constant mask that keeps at least the demoted bits is
      // a no-op under min-bitwidth analysis; only the common cost remains.
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, std::nullopt, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    // Vector cost depends on the load form: consecutive, strided, or gather.
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      if (E->State == TreeEntry::Vectorize) {
        VecLdCost = TTI->getMemoryOpCost(
            Instruction::Load, VecTy, LI0->getAlign(),
            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal node.
    // Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
        VecStCost = TTI->getMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getAlign(),
            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }

    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
                                 It != MinBWs.end() ? It->second.first : 0);
      // Use the cheaper of the intrinsic and library-call lowering.
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      (void)E;
      return TTI->getInstructionCost(VI, CostKind);
    };
    // Need to clear CommonCost since the final shuffle cost is included into
    // vector cost.
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      // VecCost is equal to sum of the cost of creating 2 vectors
      // and the cost of creating shuffle.
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse
        // same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
                                            CI0->getPredicate(), CostKind, VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            E->getAltOp());
      } else {
        // Alternate casts. With min-bitwidth demotion the pair may collapse
        // into a single truncation (or nothing at all).
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return I->getOpcode() == E->getAltOpcode();
          },
          Mask);
      VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
                                  FinalVecTy, Mask);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
      // need to take into account their order when looking for the most used
      // order.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the
      // order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // TODO: Check the reverse order too.
      return VecCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  default:
    llvm_unreachable("Unknown instruction");
  }
}
9997
isFullyVectorizableTinyTree(bool ForReduction) const9998 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000 << VectorizableTree.size() << " is fully vectorizable .\n");
10001
10002 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003 SmallVector<int> Mask;
10004 return TE->isGather() &&
10005 !any_of(TE->Scalars,
10006 [this](Value *V) { return EphValues.contains(V); }) &&
10007 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
10008 TE->Scalars.size() < Limit ||
10009 ((TE->getOpcode() == Instruction::ExtractElement ||
10010 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
10011 isFixedVectorShuffle(TE->Scalars, Mask)) ||
10012 (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013 !TE->isAltShuffle()));
10014 };
10015
10016 // We only handle trees of heights 1 and 2.
10017 if (VectorizableTree.size() == 1 &&
10018 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019 (ForReduction &&
10020 AreVectorizableGathers(VectorizableTree[0].get(),
10021 VectorizableTree[0]->Scalars.size()) &&
10022 VectorizableTree[0]->getVectorFactor() > 2)))
10023 return true;
10024
10025 if (VectorizableTree.size() != 2)
10026 return false;
10027
10028 // Handle splat and all-constants stores. Also try to vectorize tiny trees
10029 // with the second gather nodes if they have less scalar operands rather than
10030 // the initial tree element (may be profitable to shuffle the second gather)
10031 // or they are extractelements, which form shuffle.
10032 SmallVector<int> Mask;
10033 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034 AreVectorizableGathers(VectorizableTree[1].get(),
10035 VectorizableTree[0]->Scalars.size()))
10036 return true;
10037
10038 // Gathering cost would be too much for tiny trees.
10039 if (VectorizableTree[0]->isGather() ||
10040 (VectorizableTree[1]->isGather() &&
10041 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043 return false;
10044
10045 return true;
10046 }
10047
isLoadCombineCandidateImpl(Value * Root,unsigned NumElts,TargetTransformInfo * TTI,bool MustMatchOrInst)10048 static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049 TargetTransformInfo *TTI,
10050 bool MustMatchOrInst) {
10051 // Look past the root to find a source value. Arbitrarily follow the
10052 // path through operand 0 of any 'or'. Also, peek through optional
10053 // shift-left-by-multiple-of-8-bits.
10054 Value *ZextLoad = Root;
10055 const APInt *ShAmtC;
10056 bool FoundOr = false;
10057 while (!isa<ConstantExpr>(ZextLoad) &&
10058 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
10059 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
10060 ShAmtC->urem(8) == 0))) {
10061 auto *BinOp = cast<BinaryOperator>(ZextLoad);
10062 ZextLoad = BinOp->getOperand(0);
10063 if (BinOp->getOpcode() == Instruction::Or)
10064 FoundOr = true;
10065 }
10066 // Check if the input is an extended load of the required or/shift expression.
10067 Value *Load;
10068 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
10070 return false;
10071
10072 // Require that the total load bit width is a legal integer type.
10073 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10075 Type *SrcTy = Load->getType();
10076 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10077 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
10078 return false;
10079
10080 // Everything matched - assume that we can fold the whole sequence using
10081 // load combining.
10082 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10083 << *(cast<Instruction>(Root)) << "\n");
10084
10085 return true;
10086 }
10087
isLoadCombineReductionCandidate(RecurKind RdxKind) const10088 bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10089 if (RdxKind != RecurKind::Or)
10090 return false;
10091
10092 unsigned NumElts = VectorizableTree[0]->Scalars.size();
10093 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10094 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
10095 /* MatchOr */ false);
10096 }
10097
isLoadCombineCandidate(ArrayRef<Value * > Stores) const10098 bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10099 // Peek through a final sequence of stores and check if all operations are
10100 // likely to be load-combined.
10101 unsigned NumElts = Stores.size();
10102 for (Value *Scalar : Stores) {
10103 Value *X;
10104 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
10105 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
10106 return false;
10107 }
10108 return true;
10109 }
10110
isTreeTinyAndNotFullyVectorizable(bool ForReduction) const10111 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10112 // No need to vectorize inserts of gathered values.
10113 if (VectorizableTree.size() == 2 &&
10114 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
10115 VectorizableTree[1]->isGather() &&
10116 (VectorizableTree[1]->getVectorFactor() <= 2 ||
10117 !(isSplat(VectorizableTree[1]->Scalars) ||
10118 allConstant(VectorizableTree[1]->Scalars))))
10119 return true;
10120
10121 // If the graph includes only PHI nodes and gathers, it is defnitely not
10122 // profitable for the vectorization, we can skip it, if the cost threshold is
10123 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
10124 // gathers/buildvectors.
10125 constexpr int Limit = 4;
10126 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10127 !VectorizableTree.empty() &&
10128 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10129 return (TE->isGather() &&
10130 TE->getOpcode() != Instruction::ExtractElement &&
10131 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
10132 TE->getOpcode() == Instruction::PHI;
10133 }))
10134 return true;
10135
10136 // We can vectorize the tree if its size is greater than or equal to the
10137 // minimum size specified by the MinTreeSize command line option.
10138 if (VectorizableTree.size() >= MinTreeSize)
10139 return false;
10140
10141 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10142 // can vectorize it if we can prove it fully vectorizable.
10143 if (isFullyVectorizableTinyTree(ForReduction))
10144 return false;
10145
10146 // Check if any of the gather node forms an insertelement buildvector
10147 // somewhere.
10148 bool IsAllowedSingleBVNode =
10149 VectorizableTree.size() > 1 ||
10150 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10151 !VectorizableTree.front()->isAltShuffle() &&
10152 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10153 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10154 allSameBlock(VectorizableTree.front()->Scalars));
10155 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10156 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
10157 return isa<ExtractElementInst, UndefValue>(V) ||
10158 (IsAllowedSingleBVNode &&
10159 !V->hasNUsesOrMore(UsesLimit) &&
10160 any_of(V->users(), IsaPred<InsertElementInst>));
10161 });
10162 }))
10163 return false;
10164
10165 assert(VectorizableTree.empty()
10166 ? ExternalUses.empty()
10167 : true && "We shouldn't have any external users");
10168
10169 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10170 // vectorizable.
10171 return true;
10172 }
10173
getSpillCost() const10174 InstructionCost BoUpSLP::getSpillCost() const {
10175 // Walk from the bottom of the tree to the top, tracking which values are
10176 // live. When we see a call instruction that is not part of our tree,
10177 // query TTI to see if there is a cost to keeping values live over it
10178 // (for example, if spills and fills are required).
10179 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10180 InstructionCost Cost = 0;
10181
10182 SmallPtrSet<Instruction *, 4> LiveValues;
10183 Instruction *PrevInst = nullptr;
10184
10185 // The entries in VectorizableTree are not necessarily ordered by their
10186 // position in basic blocks. Collect them and order them by dominance so later
10187 // instructions are guaranteed to be visited first. For instructions in
10188 // different basic blocks, we only scan to the beginning of the block, so
10189 // their order does not matter, as long as all instructions in a basic block
10190 // are grouped together. Using dominance ensures a deterministic order.
10191 SmallVector<Instruction *, 16> OrderedScalars;
10192 for (const auto &TEPtr : VectorizableTree) {
10193 if (TEPtr->State != TreeEntry::Vectorize)
10194 continue;
10195 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10196 if (!Inst)
10197 continue;
10198 OrderedScalars.push_back(Inst);
10199 }
10200 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
10201 auto *NodeA = DT->getNode(A->getParent());
10202 auto *NodeB = DT->getNode(B->getParent());
10203 assert(NodeA && "Should only process reachable instructions");
10204 assert(NodeB && "Should only process reachable instructions");
10205 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10206 "Different nodes should have different DFS numbers");
10207 if (NodeA != NodeB)
10208 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10209 return B->comesBefore(A);
10210 });
10211
10212 for (Instruction *Inst : OrderedScalars) {
10213 if (!PrevInst) {
10214 PrevInst = Inst;
10215 continue;
10216 }
10217
10218 // Update LiveValues.
10219 LiveValues.erase(PrevInst);
10220 for (auto &J : PrevInst->operands()) {
10221 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10222 LiveValues.insert(cast<Instruction>(&*J));
10223 }
10224
10225 LLVM_DEBUG({
10226 dbgs() << "SLP: #LV: " << LiveValues.size();
10227 for (auto *X : LiveValues)
10228 dbgs() << " " << X->getName();
10229 dbgs() << ", Looking at ";
10230 Inst->dump();
10231 });
10232
10233 // Now find the sequence of instructions between PrevInst and Inst.
10234 unsigned NumCalls = 0;
10235 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10236 PrevInstIt =
10237 PrevInst->getIterator().getReverse();
10238 while (InstIt != PrevInstIt) {
10239 if (PrevInstIt == PrevInst->getParent()->rend()) {
10240 PrevInstIt = Inst->getParent()->rbegin();
10241 continue;
10242 }
10243
10244 auto NoCallIntrinsic = [this](Instruction *I) {
10245 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10246 if (II->isAssumeLikeIntrinsic())
10247 return true;
10248 FastMathFlags FMF;
10249 SmallVector<Type *, 4> Tys;
10250 for (auto &ArgOp : II->args())
10251 Tys.push_back(ArgOp->getType());
10252 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10253 FMF = FPMO->getFastMathFlags();
10254 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10255 FMF);
10256 InstructionCost IntrCost =
10257 TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10258 InstructionCost CallCost = TTI->getCallInstrCost(
10259 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10260 if (IntrCost < CallCost)
10261 return true;
10262 }
10263 return false;
10264 };
10265
10266 // Debug information does not impact spill cost.
10267 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10268 &*PrevInstIt != PrevInst)
10269 NumCalls++;
10270
10271 ++PrevInstIt;
10272 }
10273
10274 if (NumCalls) {
10275 SmallVector<Type *, 4> V;
10276 for (auto *II : LiveValues) {
10277 auto *ScalarTy = II->getType();
10278 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10279 ScalarTy = VectorTy->getElementType();
10280 V.push_back(getWidenedType(ScalarTy, BundleWidth));
10281 }
10282 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10283 }
10284
10285 PrevInst = Inst;
10286 }
10287
10288 return Cost;
10289 }
10290
10291 /// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
10292 /// buildvector sequence.
isFirstInsertElement(const InsertElementInst * IE1,const InsertElementInst * IE2)10293 static bool isFirstInsertElement(const InsertElementInst *IE1,
10294 const InsertElementInst *IE2) {
10295 if (IE1 == IE2)
10296 return false;
10297 const auto *I1 = IE1;
10298 const auto *I2 = IE2;
10299 const InsertElementInst *PrevI1;
10300 const InsertElementInst *PrevI2;
10301 unsigned Idx1 = *getElementIndex(IE1);
10302 unsigned Idx2 = *getElementIndex(IE2);
10303 do {
10304 if (I2 == IE1)
10305 return true;
10306 if (I1 == IE2)
10307 return false;
10308 PrevI1 = I1;
10309 PrevI2 = I2;
10310 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10311 getElementIndex(I1).value_or(Idx2) != Idx2)
10312 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10313 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10314 getElementIndex(I2).value_or(Idx1) != Idx1)
10315 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10316 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10317 llvm_unreachable("Two different buildvectors not expected.");
10318 }
10319
10320 namespace {
10321 /// Returns incoming Value *, if the requested type is Value * too, or a default
10322 /// value, otherwise.
10323 struct ValueSelect {
10324 template <typename U>
get__anon5824d949b311::ValueSelect10325 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10326 return V;
10327 }
10328 template <typename U>
get__anon5824d949b311::ValueSelect10329 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10330 return U();
10331 }
10332 };
10333 } // namespace
10334
10335 /// Does the analysis of the provided shuffle masks and performs the requested
10336 /// actions on the vectors with the given shuffle masks. It tries to do it in
10337 /// several steps.
10338 /// 1. If the Base vector is not undef vector, resizing the very first mask to
10339 /// have common VF and perform action for 2 input vectors (including non-undef
10340 /// Base). Other shuffle masks are combined with the resulting after the 1 stage
10341 /// and processed as a shuffle of 2 elements.
10342 /// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
10343 /// action only for 1 vector with the given mask, if it is not the identity
10344 /// mask.
10345 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10346 /// vectors, combing the masks properly between the steps.
10347 template <typename T>
performExtractsShuffleAction(MutableArrayRef<std::pair<T *,SmallVector<int>>> ShuffleMask,Value * Base,function_ref<unsigned (T *)> GetVF,function_ref<std::pair<T *,bool> (T *,ArrayRef<int>,bool)> ResizeAction,function_ref<T * (ArrayRef<int>,ArrayRef<T * >)> Action)10348 static T *performExtractsShuffleAction(
10349 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10350 function_ref<unsigned(T *)> GetVF,
10351 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10352 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10353 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10354 SmallVector<int> Mask(ShuffleMask.begin()->second);
10355 auto VMIt = std::next(ShuffleMask.begin());
10356 T *Prev = nullptr;
10357 SmallBitVector UseMask =
10358 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10359 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10360 if (!IsBaseUndef.all()) {
10361 // Base is not undef, need to combine it with the next subvectors.
10362 std::pair<T *, bool> Res =
10363 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10364 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10365 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10366 if (Mask[Idx] == PoisonMaskElem)
10367 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10368 else
10369 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10370 }
10371 auto *V = ValueSelect::get<T *>(Base);
10372 (void)V;
10373 assert((!V || GetVF(V) == Mask.size()) &&
10374 "Expected base vector of VF number of elements.");
10375 Prev = Action(Mask, {nullptr, Res.first});
10376 } else if (ShuffleMask.size() == 1) {
10377 // Base is undef and only 1 vector is shuffled - perform the action only for
10378 // single vector, if the mask is not the identity mask.
10379 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10380 /*ForSingleMask=*/true);
10381 if (Res.second)
10382 // Identity mask is found.
10383 Prev = Res.first;
10384 else
10385 Prev = Action(Mask, {ShuffleMask.begin()->first});
10386 } else {
10387 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10388 // shuffles step by step, combining shuffle between the steps.
10389 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10390 unsigned Vec2VF = GetVF(VMIt->first);
10391 if (Vec1VF == Vec2VF) {
10392 // No need to resize the input vectors since they are of the same size, we
10393 // can shuffle them directly.
10394 ArrayRef<int> SecMask = VMIt->second;
10395 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10396 if (SecMask[I] != PoisonMaskElem) {
10397 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10398 Mask[I] = SecMask[I] + Vec1VF;
10399 }
10400 }
10401 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10402 } else {
10403 // Vectors of different sizes - resize and reshuffle.
10404 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10405 /*ForSingleMask=*/false);
10406 std::pair<T *, bool> Res2 =
10407 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10408 ArrayRef<int> SecMask = VMIt->second;
10409 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10410 if (Mask[I] != PoisonMaskElem) {
10411 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10412 if (Res1.second)
10413 Mask[I] = I;
10414 } else if (SecMask[I] != PoisonMaskElem) {
10415 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10416 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10417 }
10418 }
10419 Prev = Action(Mask, {Res1.first, Res2.first});
10420 }
10421 VMIt = std::next(VMIt);
10422 }
10423 bool IsBaseNotUndef = !IsBaseUndef.all();
10424 (void)IsBaseNotUndef;
10425 // Perform requested actions for the remaining masks/vectors.
10426 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10427 // Shuffle other input vectors, if any.
10428 std::pair<T *, bool> Res =
10429 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10430 ArrayRef<int> SecMask = VMIt->second;
10431 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10432 if (SecMask[I] != PoisonMaskElem) {
10433 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10434 "Multiple uses of scalars.");
10435 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10436 } else if (Mask[I] != PoisonMaskElem) {
10437 Mask[I] = I;
10438 }
10439 }
10440 Prev = Action(Mask, {Prev, Res.first});
10441 }
10442 return Prev;
10443 }
10444
getTreeCost(ArrayRef<Value * > VectorizedVals)10445 InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10446 InstructionCost Cost = 0;
10447 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10448 << VectorizableTree.size() << ".\n");
10449
10450 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10451
10452 SmallPtrSet<Value *, 4> CheckedExtracts;
10453 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10454 TreeEntry &TE = *VectorizableTree[I];
10455 if (TE.isGather()) {
10456 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10457 E && E->getVectorFactor() == TE.getVectorFactor() &&
10458 E->isSame(TE.Scalars)) {
10459 // Some gather nodes might be absolutely the same as some vectorizable
10460 // nodes after reordering, need to handle it.
10461 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10462 << shortBundleName(TE.Scalars) << ".\n"
10463 << "SLP: Current total cost = " << Cost << "\n");
10464 continue;
10465 }
10466 }
10467
10468 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10469 Cost += C;
10470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10471 << shortBundleName(TE.Scalars) << ".\n"
10472 << "SLP: Current total cost = " << Cost << "\n");
10473 }
10474
10475 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10476 InstructionCost ExtractCost = 0;
10477 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10478 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10479 SmallVector<APInt> DemandedElts;
10480 SmallDenseSet<Value *, 4> UsedInserts;
10481 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10482 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10483 for (ExternalUser &EU : ExternalUses) {
10484 // We only add extract cost once for the same scalar.
10485 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10486 !ExtractCostCalculated.insert(EU.Scalar).second)
10487 continue;
10488
10489 // Uses by ephemeral values are free (because the ephemeral value will be
10490 // removed prior to code generation, and so the extraction will be
10491 // removed as well).
10492 if (EphValues.count(EU.User))
10493 continue;
10494
10495 // No extract cost for vector "scalar"
10496 if (isa<FixedVectorType>(EU.Scalar->getType()))
10497 continue;
10498
10499 // If found user is an insertelement, do not calculate extract cost but try
10500 // to detect it as a final shuffled/identity match.
10501 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10502 VU && VU->getOperand(1) == EU.Scalar) {
10503 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10504 if (!UsedInserts.insert(VU).second)
10505 continue;
10506 std::optional<unsigned> InsertIdx = getElementIndex(VU);
10507 if (InsertIdx) {
10508 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10509 auto *It = find_if(
10510 FirstUsers,
10511 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10512 return areTwoInsertFromSameBuildVector(
10513 VU, cast<InsertElementInst>(Pair.first),
10514 [this](InsertElementInst *II) -> Value * {
10515 Value *Op0 = II->getOperand(0);
10516 if (getTreeEntry(II) && !getTreeEntry(Op0))
10517 return nullptr;
10518 return Op0;
10519 });
10520 });
10521 int VecId = -1;
10522 if (It == FirstUsers.end()) {
10523 (void)ShuffleMasks.emplace_back();
10524 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10525 if (Mask.empty())
10526 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10527 // Find the insertvector, vectorized in tree, if any.
10528 Value *Base = VU;
10529 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10530 if (IEBase != EU.User &&
10531 (!IEBase->hasOneUse() ||
10532 getElementIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10533 break;
10534 // Build the mask for the vectorized insertelement instructions.
10535 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10536 VU = IEBase;
10537 do {
10538 IEBase = cast<InsertElementInst>(Base);
10539 int Idx = *getElementIndex(IEBase);
10540 assert(Mask[Idx] == PoisonMaskElem &&
10541 "InsertElementInstruction used already.");
10542 Mask[Idx] = Idx;
10543 Base = IEBase->getOperand(0);
10544 } while (E == getTreeEntry(Base));
10545 break;
10546 }
10547 Base = cast<InsertElementInst>(Base)->getOperand(0);
10548 }
10549 FirstUsers.emplace_back(VU, ScalarTE);
10550 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10551 VecId = FirstUsers.size() - 1;
10552 auto It = MinBWs.find(ScalarTE);
10553 if (It != MinBWs.end() &&
10554 VectorCasts
10555 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10556 .second) {
10557 unsigned BWSz = It->second.first;
10558 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10559 unsigned VecOpcode;
10560 if (DstBWSz < BWSz)
10561 VecOpcode = Instruction::Trunc;
10562 else
10563 VecOpcode =
10564 It->second.second ? Instruction::SExt : Instruction::ZExt;
10565 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10566 InstructionCost C = TTI->getCastInstrCost(
10567 VecOpcode, FTy,
10568 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
10569 FTy->getNumElements()),
10570 TTI::CastContextHint::None, CostKind);
10571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10572 << " for extending externally used vector with "
10573 "non-equal minimum bitwidth.\n");
10574 Cost += C;
10575 }
10576 } else {
10577 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10578 It->first = VU;
10579 VecId = std::distance(FirstUsers.begin(), It);
10580 }
10581 int InIdx = *InsertIdx;
10582 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10583 if (Mask.empty())
10584 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10585 Mask[InIdx] = EU.Lane;
10586 DemandedElts[VecId].setBit(InIdx);
10587 continue;
10588 }
10589 }
10590 }
10591 // Leave the GEPs as is, they are free in most cases and better to keep them
10592 // as GEPs.
10593 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10594 if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10595 if (!ValueToExtUses) {
10596 ValueToExtUses.emplace();
10597 for_each(enumerate(ExternalUses), [&](const auto &P) {
10598 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10599 });
10600 }
10601 // Can use original GEP, if no operands vectorized or they are marked as
10602 // externally used already.
10603 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10604 if (!getTreeEntry(V))
10605 return true;
10606 auto It = ValueToExtUses->find(V);
10607 if (It != ValueToExtUses->end()) {
10608 // Replace all uses to avoid compiler crash.
10609 ExternalUses[It->second].User = nullptr;
10610 return true;
10611 }
10612 return false;
10613 });
10614 if (CanBeUsedAsGEP) {
10615 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10616 ExternalUsesAsGEPs.insert(EU.Scalar);
10617 continue;
10618 }
10619 }
10620
10621 // If we plan to rewrite the tree in a smaller type, we will need to sign
10622 // extend the extracted value back to the original type. Here, we account
10623 // for the extract and the added cost of the sign extend if needed.
10624 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10625 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10626 if (It != MinBWs.end()) {
10627 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10628 unsigned Extend =
10629 It->second.second ? Instruction::SExt : Instruction::ZExt;
10630 VecTy = getWidenedType(MinTy, BundleWidth);
10631 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10632 VecTy, EU.Lane);
10633 } else {
10634 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10635 CostKind, EU.Lane);
10636 }
10637 }
10638 // Add reduced value cost, if resized.
10639 if (!VectorizedVals.empty()) {
10640 const TreeEntry &Root = *VectorizableTree.front();
10641 auto BWIt = MinBWs.find(&Root);
10642 if (BWIt != MinBWs.end()) {
10643 Type *DstTy = Root.Scalars.front()->getType();
10644 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10645 unsigned SrcSz =
10646 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10647 if (OriginalSz != SrcSz) {
10648 unsigned Opcode = Instruction::Trunc;
10649 if (OriginalSz > SrcSz)
10650 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10651 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10652 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10653 TTI::CastContextHint::None,
10654 TTI::TCK_RecipThroughput);
10655 }
10656 }
10657 }
10658
10659 InstructionCost SpillCost = getSpillCost();
10660 Cost += SpillCost + ExtractCost;
10661 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10662 bool) {
10663 InstructionCost C = 0;
10664 unsigned VF = Mask.size();
10665 unsigned VecVF = TE->getVectorFactor();
10666 if (VF != VecVF &&
10667 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10668 !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10669 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10670 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10671 OrigMask.begin());
10672 C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc,
10673 getWidenedType(TE->getMainOp()->getType(), VecVF),
10674 OrigMask);
10675 LLVM_DEBUG(
10676 dbgs() << "SLP: Adding cost " << C
10677 << " for final shuffle of insertelement external users.\n";
10678 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10679 Cost += C;
10680 return std::make_pair(TE, true);
10681 }
10682 return std::make_pair(TE, false);
10683 };
10684 // Calculate the cost of the reshuffled vectors, if any.
10685 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10686 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10687 auto Vector = ShuffleMasks[I].takeVector();
10688 unsigned VF = 0;
10689 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10690 ArrayRef<const TreeEntry *> TEs) {
10691 assert((TEs.size() == 1 || TEs.size() == 2) &&
10692 "Expected exactly 1 or 2 tree entries.");
10693 if (TEs.size() == 1) {
10694 if (VF == 0)
10695 VF = TEs.front()->getVectorFactor();
10696 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10697 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10698 !all_of(enumerate(Mask), [=](const auto &Data) {
10699 return Data.value() == PoisonMaskElem ||
10700 (Data.index() < VF &&
10701 static_cast<int>(Data.index()) == Data.value());
10702 })) {
10703 InstructionCost C =
10704 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
10705 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10706 << " for final shuffle of insertelement "
10707 "external users.\n";
10708 TEs.front()->dump();
10709 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10710 Cost += C;
10711 }
10712 } else {
10713 if (VF == 0) {
10714 if (TEs.front() &&
10715 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10716 VF = TEs.front()->getVectorFactor();
10717 else
10718 VF = Mask.size();
10719 }
10720 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
10721 InstructionCost C =
10722 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10723 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10724 << " for final shuffle of vector node and external "
10725 "insertelement users.\n";
10726 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10727 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10728 Cost += C;
10729 }
10730 VF = Mask.size();
10731 return TEs.back();
10732 };
10733 (void)performExtractsShuffleAction<const TreeEntry>(
10734 MutableArrayRef(Vector.data(), Vector.size()), Base,
10735 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10736 EstimateShufflesCost);
10737 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10738 cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10739 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10740 Cost -= InsertCost;
10741 }
10742
10743 // Add the cost for reduced value resize (if required).
10744 if (ReductionBitWidth != 0) {
10745 assert(UserIgnoreList && "Expected reduction tree.");
10746 const TreeEntry &E = *VectorizableTree.front();
10747 auto It = MinBWs.find(&E);
10748 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10749 unsigned SrcSize = It->second.first;
10750 unsigned DstSize = ReductionBitWidth;
10751 unsigned Opcode = Instruction::Trunc;
10752 if (SrcSize < DstSize)
10753 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10754 auto *SrcVecTy =
10755 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10756 auto *DstVecTy =
10757 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
10758 TTI::CastContextHint CCH = getCastContextHint(E);
10759 InstructionCost CastCost;
10760 switch (E.getOpcode()) {
10761 case Instruction::SExt:
10762 case Instruction::ZExt:
10763 case Instruction::Trunc: {
10764 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10765 CCH = getCastContextHint(*OpTE);
10766 break;
10767 }
10768 default:
10769 break;
10770 }
10771 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10772 TTI::TCK_RecipThroughput);
10773 Cost += CastCost;
10774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10775 << " for final resize for reduction from " << SrcVecTy
10776 << " to " << DstVecTy << "\n";
10777 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10778 }
10779 }
10780
10781 #ifndef NDEBUG
10782 SmallString<256> Str;
10783 {
10784 raw_svector_ostream OS(Str);
10785 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10786 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10787 << "SLP: Total Cost = " << Cost << ".\n";
10788 }
10789 LLVM_DEBUG(dbgs() << Str);
10790 if (ViewSLPTree)
10791 ViewGraph(this, "SLP" + F->getName(), false, Str);
10792 #endif
10793
10794 return Cost;
10795 }
10796
10797 /// Tries to find extractelement instructions with constant indices from fixed
10798 /// vector type and gather such instructions into a bunch, which highly likely
10799 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
10800 /// successful, the matched scalars are replaced by poison values in \p VL for
10801 /// future analysis.
10802 std::optional<TTI::ShuffleKind>
tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value * > VL,SmallVectorImpl<int> & Mask) const10803 BoUpSLP::tryToGatherSingleRegisterExtractElements(
10804 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10805 // Scan list of gathered scalars for extractelements that can be represented
10806 // as shuffles.
10807 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10808 SmallVector<int> UndefVectorExtracts;
10809 for (int I = 0, E = VL.size(); I < E; ++I) {
10810 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10811 if (!EI) {
10812 if (isa<UndefValue>(VL[I]))
10813 UndefVectorExtracts.push_back(I);
10814 continue;
10815 }
10816 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10817 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10818 continue;
10819 std::optional<unsigned> Idx = getExtractIndex(EI);
10820 // Undefined index.
10821 if (!Idx) {
10822 UndefVectorExtracts.push_back(I);
10823 continue;
10824 }
10825 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10826 ExtractMask.reset(*Idx);
10827 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10828 UndefVectorExtracts.push_back(I);
10829 continue;
10830 }
10831 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10832 }
10833 // Sort the vector operands by the maximum number of uses in extractelements.
10834 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10835 VectorOpToIdx.takeVector();
10836 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
10837 return P1.second.size() > P2.second.size();
10838 });
10839 // Find the best pair of the vectors or a single vector.
10840 const int UndefSz = UndefVectorExtracts.size();
10841 unsigned SingleMax = 0;
10842 unsigned PairMax = 0;
10843 if (!Vectors.empty()) {
10844 SingleMax = Vectors.front().second.size() + UndefSz;
10845 if (Vectors.size() > 1) {
10846 auto *ItNext = std::next(Vectors.begin());
10847 PairMax = SingleMax + ItNext->second.size();
10848 }
10849 }
10850 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10851 return std::nullopt;
10852 // Check if better to perform a shuffle of 2 vectors or just of a single
10853 // vector.
10854 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10855 SmallVector<Value *> GatheredExtracts(
10856 VL.size(), PoisonValue::get(VL.front()->getType()));
10857 if (SingleMax >= PairMax && SingleMax) {
10858 for (int Idx : Vectors.front().second)
10859 std::swap(GatheredExtracts[Idx], VL[Idx]);
10860 } else if (!Vectors.empty()) {
10861 for (unsigned Idx : {0, 1})
10862 for (int Idx : Vectors[Idx].second)
10863 std::swap(GatheredExtracts[Idx], VL[Idx]);
10864 }
10865 // Add extracts from undefs too.
10866 for (int Idx : UndefVectorExtracts)
10867 std::swap(GatheredExtracts[Idx], VL[Idx]);
10868 // Check that gather of extractelements can be represented as just a
10869 // shuffle of a single/two vectors the scalars are extracted from.
10870 std::optional<TTI::ShuffleKind> Res =
10871 isFixedVectorShuffle(GatheredExtracts, Mask);
10872 if (!Res) {
10873 // TODO: try to check other subsets if possible.
10874 // Restore the original VL if attempt was not successful.
10875 copy(SavedVL, VL.begin());
10876 return std::nullopt;
10877 }
10878 // Restore unused scalars from mask, if some of the extractelements were not
10879 // selected for shuffle.
10880 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10881 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10882 isa<UndefValue>(GatheredExtracts[I])) {
10883 std::swap(VL[I], GatheredExtracts[I]);
10884 continue;
10885 }
10886 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10887 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10888 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10889 is_contained(UndefVectorExtracts, I))
10890 continue;
10891 }
10892 return Res;
10893 }
10894
10895 /// Tries to find extractelement instructions with constant indices from fixed
10896 /// vector type and gather such instructions into a bunch, which highly likely
10897 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
10898 /// successful, the matched scalars are replaced by poison values in \p VL for
10899 /// future analysis.
10900 SmallVector<std::optional<TTI::ShuffleKind>>
tryToGatherExtractElements(SmallVectorImpl<Value * > & VL,SmallVectorImpl<int> & Mask,unsigned NumParts) const10901 BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10902 SmallVectorImpl<int> &Mask,
10903 unsigned NumParts) const {
10904 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10905 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10906 Mask.assign(VL.size(), PoisonMaskElem);
10907 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10908 for (unsigned Part : seq<unsigned>(NumParts)) {
10909 // Scan list of gathered scalars for extractelements that can be represented
10910 // as shuffles.
10911 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10912 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
10913 SmallVector<int> SubMask;
10914 std::optional<TTI::ShuffleKind> Res =
10915 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10916 ShufflesRes[Part] = Res;
10917 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10918 }
10919 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10920 return Res.has_value();
10921 }))
10922 ShufflesRes.clear();
10923 return ShufflesRes;
10924 }
10925
10926 std::optional<TargetTransformInfo::ShuffleKind>
isGatherShuffledSingleRegisterEntry(const TreeEntry * TE,ArrayRef<Value * > VL,MutableArrayRef<int> Mask,SmallVectorImpl<const TreeEntry * > & Entries,unsigned Part,bool ForOrder)10927 BoUpSLP::isGatherShuffledSingleRegisterEntry(
10928 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10929 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10930 Entries.clear();
10931 // TODO: currently checking only for Scalars in the tree entry, need to count
10932 // reused elements too for better cost estimation.
10933 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10934 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10935 const BasicBlock *TEInsertBlock = nullptr;
10936 // Main node of PHI entries keeps the correct order of operands/incoming
10937 // blocks.
10938 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10939 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10940 TEInsertPt = TEInsertBlock->getTerminator();
10941 } else {
10942 TEInsertBlock = TEInsertPt->getParent();
10943 }
10944 if (!DT->isReachableFromEntry(TEInsertBlock))
10945 return std::nullopt;
10946 auto *NodeUI = DT->getNode(TEInsertBlock);
10947 assert(NodeUI && "Should only process reachable instructions");
10948 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10949 auto CheckOrdering = [&](const Instruction *InsertPt) {
10950 // Argument InsertPt is an instruction where vector code for some other
10951 // tree entry (one that shares one or more scalars with TE) is going to be
10952 // generated. This lambda returns true if insertion point of vector code
10953 // for the TE dominates that point (otherwise dependency is the other way
10954 // around). The other node is not limited to be of a gather kind. Gather
10955 // nodes are not scheduled and their vector code is inserted before their
10956 // first user. If user is PHI, that is supposed to be at the end of a
10957 // predecessor block. Otherwise it is the last instruction among scalars of
10958 // the user node. So, instead of checking dependency between instructions
10959 // themselves, we check dependency between their insertion points for vector
10960 // code (since each scalar instruction ends up as a lane of a vector
10961 // instruction).
10962 const BasicBlock *InsertBlock = InsertPt->getParent();
10963 auto *NodeEUI = DT->getNode(InsertBlock);
10964 if (!NodeEUI)
10965 return false;
10966 assert((NodeUI == NodeEUI) ==
10967 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10968 "Different nodes should have different DFS numbers");
10969 // Check the order of the gather nodes users.
10970 if (TEInsertPt->getParent() != InsertBlock &&
10971 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10972 return false;
10973 if (TEInsertPt->getParent() == InsertBlock &&
10974 TEInsertPt->comesBefore(InsertPt))
10975 return false;
10976 return true;
10977 };
10978 // Find all tree entries used by the gathered values. If no common entries
10979 // found - not a shuffle.
10980 // Here we build a set of tree nodes for each gathered value and trying to
10981 // find the intersection between these sets. If we have at least one common
10982 // tree node for each gathered value - we have just a permutation of the
10983 // single vector. If we have 2 different sets, we're in situation where we
10984 // have a permutation of 2 input vectors.
10985 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10986 DenseMap<Value *, int> UsedValuesEntry;
10987 for (Value *V : VL) {
10988 if (isConstant(V))
10989 continue;
10990 // Build a list of tree entries where V is used.
10991 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10992 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10993 if (TEPtr == TE)
10994 continue;
10995 assert(any_of(TEPtr->Scalars,
10996 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10997 "Must contain at least single gathered value.");
10998 assert(TEPtr->UserTreeIndices.size() == 1 &&
10999 "Expected only single user of a gather node.");
11000 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11001
11002 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
11003 const Instruction *InsertPt =
11004 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
11005 : &getLastInstructionInBundle(UseEI.UserTE);
11006 if (TEInsertPt == InsertPt) {
11007 // If 2 gathers are operands of the same entry (regardless of whether
11008 // user is PHI or else), compare operands indices, use the earlier one
11009 // as the base.
11010 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11011 continue;
11012 // If the user instruction is used for some reason in different
11013 // vectorized nodes - make it depend on index.
11014 if (TEUseEI.UserTE != UseEI.UserTE &&
11015 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11016 continue;
11017 }
11018
11019 // Check if the user node of the TE comes after user node of TEPtr,
11020 // otherwise TEPtr depends on TE.
11021 if ((TEInsertBlock != InsertPt->getParent() ||
11022 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11023 !CheckOrdering(InsertPt))
11024 continue;
11025 VToTEs.insert(TEPtr);
11026 }
11027 if (const TreeEntry *VTE = getTreeEntry(V)) {
11028 if (ForOrder) {
11029 if (VTE->State != TreeEntry::Vectorize) {
11030 auto It = MultiNodeScalars.find(V);
11031 if (It == MultiNodeScalars.end())
11032 continue;
11033 VTE = *It->getSecond().begin();
11034 // Iterate through all vectorized nodes.
11035 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
11036 return MTE->State == TreeEntry::Vectorize;
11037 });
11038 if (MIt == It->getSecond().end())
11039 continue;
11040 VTE = *MIt;
11041 }
11042 }
11043 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
11044 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11045 continue;
11046 VToTEs.insert(VTE);
11047 }
11048 if (VToTEs.empty())
11049 continue;
11050 if (UsedTEs.empty()) {
11051 // The first iteration, just insert the list of nodes to vector.
11052 UsedTEs.push_back(VToTEs);
11053 UsedValuesEntry.try_emplace(V, 0);
11054 } else {
11055 // Need to check if there are any previously used tree nodes which use V.
11056 // If there are no such nodes, consider that we have another one input
11057 // vector.
11058 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11059 unsigned Idx = 0;
11060 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11061 // Do we have a non-empty intersection of previously listed tree entries
11062 // and tree entries using current V?
11063 set_intersect(VToTEs, Set);
11064 if (!VToTEs.empty()) {
11065 // Yes, write the new subset and continue analysis for the next
11066 // scalar.
11067 Set.swap(VToTEs);
11068 break;
11069 }
11070 VToTEs = SavedVToTEs;
11071 ++Idx;
11072 }
11073 // No non-empty intersection found - need to add a second set of possible
11074 // source vectors.
11075 if (Idx == UsedTEs.size()) {
11076 // If the number of input vectors is greater than 2 - not a permutation,
11077 // fallback to the regular gather.
11078 // TODO: support multiple reshuffled nodes.
11079 if (UsedTEs.size() == 2)
11080 continue;
11081 UsedTEs.push_back(SavedVToTEs);
11082 Idx = UsedTEs.size() - 1;
11083 }
11084 UsedValuesEntry.try_emplace(V, Idx);
11085 }
11086 }
11087
11088 if (UsedTEs.empty()) {
11089 Entries.clear();
11090 return std::nullopt;
11091 }
11092
11093 unsigned VF = 0;
11094 if (UsedTEs.size() == 1) {
11095 // Keep the order to avoid non-determinism.
11096 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11097 UsedTEs.front().end());
11098 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11099 return TE1->Idx < TE2->Idx;
11100 });
11101 // Try to find the perfect match in another gather node at first.
11102 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
11103 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
11104 });
11105 if (It != FirstEntries.end() &&
11106 ((*It)->getVectorFactor() == VL.size() ||
11107 ((*It)->getVectorFactor() == TE->Scalars.size() &&
11108 TE->ReuseShuffleIndices.size() == VL.size() &&
11109 (*It)->isSame(TE->Scalars)))) {
11110 Entries.push_back(*It);
11111 if ((*It)->getVectorFactor() == VL.size()) {
11112 std::iota(std::next(Mask.begin(), Part * VL.size()),
11113 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
11114 } else {
11115 SmallVector<int> CommonMask = TE->getCommonMask();
11116 copy(CommonMask, Mask.begin());
11117 }
11118 // Clear undef scalars.
11119 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11120 if (isa<PoisonValue>(VL[I]))
11121 Mask[I] = PoisonMaskElem;
11122 return TargetTransformInfo::SK_PermuteSingleSrc;
11123 }
11124 // No perfect match, just shuffle, so choose the first tree node from the
11125 // tree.
11126 Entries.push_back(FirstEntries.front());
11127 } else {
11128 // Try to find nodes with the same vector factor.
11129 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11130 // Keep the order of tree nodes to avoid non-determinism.
11131 DenseMap<int, const TreeEntry *> VFToTE;
11132 for (const TreeEntry *TE : UsedTEs.front()) {
11133 unsigned VF = TE->getVectorFactor();
11134 auto It = VFToTE.find(VF);
11135 if (It != VFToTE.end()) {
11136 if (It->second->Idx > TE->Idx)
11137 It->getSecond() = TE;
11138 continue;
11139 }
11140 VFToTE.try_emplace(VF, TE);
11141 }
11142 // Same, keep the order to avoid non-determinism.
11143 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11144 UsedTEs.back().end());
11145 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
11146 return TE1->Idx < TE2->Idx;
11147 });
11148 for (const TreeEntry *TE : SecondEntries) {
11149 auto It = VFToTE.find(TE->getVectorFactor());
11150 if (It != VFToTE.end()) {
11151 VF = It->first;
11152 Entries.push_back(It->second);
11153 Entries.push_back(TE);
11154 break;
11155 }
11156 }
11157 // No 2 source vectors with the same vector factor - just choose 2 with max
11158 // index.
11159 if (Entries.empty()) {
11160 Entries.push_back(*llvm::max_element(
11161 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
11162 return TE1->Idx < TE2->Idx;
11163 }));
11164 Entries.push_back(SecondEntries.front());
11165 VF = std::max(Entries.front()->getVectorFactor(),
11166 Entries.back()->getVectorFactor());
11167 }
11168 }
11169
11170 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
11171 // Checks if the 2 PHIs are compatible in terms of high possibility to be
11172 // vectorized.
11173 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11174 auto *PHI = cast<PHINode>(V);
11175 auto *PHI1 = cast<PHINode>(V1);
11176 // Check that all incoming values are compatible/from same parent (if they
11177 // are instructions).
11178 // The incoming values are compatible if they all are constants, or
11179 // instruction with the same/alternate opcodes from the same basic block.
11180 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11181 Value *In = PHI->getIncomingValue(I);
11182 Value *In1 = PHI1->getIncomingValue(I);
11183 if (isConstant(In) && isConstant(In1))
11184 continue;
11185 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
11186 return false;
11187 if (cast<Instruction>(In)->getParent() !=
11188 cast<Instruction>(In1)->getParent())
11189 return false;
11190 }
11191 return true;
11192 };
11193 // Check if the value can be ignored during analysis for shuffled gathers.
11194 // We suppose it is better to ignore instruction, which do not form splats,
11195 // are not vectorized/not extractelements (these instructions will be handled
11196 // by extractelements processing) or may form vector node in future.
11197 auto MightBeIgnored = [=](Value *V) {
11198 auto *I = dyn_cast<Instruction>(V);
11199 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
11200 !isVectorLikeInstWithConstOps(I) &&
11201 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
11202 };
11203 // Check that the neighbor instruction may form a full vector node with the
11204 // current instruction V. It is possible, if they have same/alternate opcode
11205 // and same parent basic block.
11206 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11207 Value *V1 = VL[Idx];
11208 bool UsedInSameVTE = false;
11209 auto It = UsedValuesEntry.find(V1);
11210 if (It != UsedValuesEntry.end())
11211 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11212 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11213 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11214 cast<Instruction>(V)->getParent() ==
11215 cast<Instruction>(V1)->getParent() &&
11216 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11217 };
11218 // Build a shuffle mask for better cost estimation and vector emission.
11219 SmallBitVector UsedIdxs(Entries.size());
11220 SmallVector<std::pair<unsigned, int>> EntryLanes;
11221 for (int I = 0, E = VL.size(); I < E; ++I) {
11222 Value *V = VL[I];
11223 auto It = UsedValuesEntry.find(V);
11224 if (It == UsedValuesEntry.end())
11225 continue;
11226 // Do not try to shuffle scalars, if they are constants, or instructions
11227 // that can be vectorized as a result of the following vector build
11228 // vectorization.
11229 if (isConstant(V) || (MightBeIgnored(V) &&
11230 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11231 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11232 continue;
11233 unsigned Idx = It->second;
11234 EntryLanes.emplace_back(Idx, I);
11235 UsedIdxs.set(Idx);
11236 }
11237 // Iterate through all shuffled scalars and select entries, which can be used
11238 // for final shuffle.
11239 SmallVector<const TreeEntry *> TempEntries;
11240 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11241 if (!UsedIdxs.test(I))
11242 continue;
11243 // Fix the entry number for the given scalar. If it is the first entry, set
11244 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
11245 // These indices are used when calculating final shuffle mask as the vector
11246 // offset.
11247 for (std::pair<unsigned, int> &Pair : EntryLanes)
11248 if (Pair.first == I)
11249 Pair.first = TempEntries.size();
11250 TempEntries.push_back(Entries[I]);
11251 }
11252 Entries.swap(TempEntries);
11253 if (EntryLanes.size() == Entries.size() &&
11254 !VL.equals(ArrayRef(TE->Scalars)
11255 .slice(Part * VL.size(),
11256 std::min<int>(VL.size(), TE->Scalars.size())))) {
11257 // We may have here 1 or 2 entries only. If the number of scalars is equal
11258 // to the number of entries, no need to do the analysis, it is not very
11259 // profitable. Since VL is not the same as TE->Scalars, it means we already
11260 // have some shuffles before. Cut off not profitable case.
11261 Entries.clear();
11262 return std::nullopt;
11263 }
11264 // Build the final mask, check for the identity shuffle, if possible.
11265 bool IsIdentity = Entries.size() == 1;
11266 // Pair.first is the offset to the vector, while Pair.second is the index of
11267 // scalar in the list.
11268 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11269 unsigned Idx = Part * VL.size() + Pair.second;
11270 Mask[Idx] =
11271 Pair.first * VF +
11272 (ForOrder ? std::distance(
11273 Entries[Pair.first]->Scalars.begin(),
11274 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11275 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11276 IsIdentity &= Mask[Idx] == Pair.second;
11277 }
11278 switch (Entries.size()) {
11279 case 1:
11280 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11281 return TargetTransformInfo::SK_PermuteSingleSrc;
11282 break;
11283 case 2:
11284 if (EntryLanes.size() > 2 || VL.size() <= 2)
11285 return TargetTransformInfo::SK_PermuteTwoSrc;
11286 break;
11287 default:
11288 break;
11289 }
11290 Entries.clear();
11291 // Clear the corresponding mask elements.
11292 std::fill(std::next(Mask.begin(), Part * VL.size()),
11293 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11294 return std::nullopt;
11295 }
11296
11297 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
isGatherShuffledEntry(const TreeEntry * TE,ArrayRef<Value * > VL,SmallVectorImpl<int> & Mask,SmallVectorImpl<SmallVector<const TreeEntry * >> & Entries,unsigned NumParts,bool ForOrder)11298 BoUpSLP::isGatherShuffledEntry(
11299 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11300 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11301 bool ForOrder) {
11302 assert(NumParts > 0 && NumParts < VL.size() &&
11303 "Expected positive number of registers.");
11304 Entries.clear();
11305 // No need to check for the topmost gather node.
11306 if (TE == VectorizableTree.front().get())
11307 return {};
11308 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11309 if (TE->isNonPowOf2Vec())
11310 return {};
11311 Mask.assign(VL.size(), PoisonMaskElem);
11312 assert(TE->UserTreeIndices.size() == 1 &&
11313 "Expected only single user of the gather node.");
11314 assert(VL.size() % NumParts == 0 &&
11315 "Number of scalars must be divisible by NumParts.");
11316 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
11317 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11318 for (unsigned Part : seq<unsigned>(NumParts)) {
11319 ArrayRef<Value *> SubVL =
11320 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
11321 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11322 std::optional<TTI::ShuffleKind> SubRes =
11323 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11324 ForOrder);
11325 if (!SubRes)
11326 SubEntries.clear();
11327 Res.push_back(SubRes);
11328 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11329 SubEntries.front()->getVectorFactor() == VL.size() &&
11330 (SubEntries.front()->isSame(TE->Scalars) ||
11331 SubEntries.front()->isSame(VL))) {
11332 SmallVector<const TreeEntry *> LocalSubEntries;
11333 LocalSubEntries.swap(SubEntries);
11334 Entries.clear();
11335 Res.clear();
11336 std::iota(Mask.begin(), Mask.end(), 0);
11337 // Clear undef scalars.
11338 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11339 if (isa<PoisonValue>(VL[I]))
11340 Mask[I] = PoisonMaskElem;
11341 Entries.emplace_back(1, LocalSubEntries.front());
11342 Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
11343 return Res;
11344 }
11345 }
11346 if (all_of(Res,
11347 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11348 Entries.clear();
11349 return {};
11350 }
11351 return Res;
11352 }
11353
getGatherCost(ArrayRef<Value * > VL,bool ForPoisonSrc,Type * ScalarTy) const11354 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11355 Type *ScalarTy) const {
11356 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11357 bool DuplicateNonConst = false;
11358 // Find the cost of inserting/extracting values from the vector.
11359 // Check if the same elements are inserted several times and count them as
11360 // shuffle candidates.
11361 APInt ShuffledElements = APInt::getZero(VL.size());
11362 DenseMap<Value *, unsigned> UniqueElements;
11363 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11364 InstructionCost Cost;
11365 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11366 if (V->getType() != ScalarTy) {
11367 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11368 TTI::CastContextHint::None, CostKind);
11369 V = nullptr;
11370 }
11371 if (!ForPoisonSrc)
11372 Cost +=
11373 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11374 I, Constant::getNullValue(VecTy), V);
11375 };
11376 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11377 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11378 Value *V = VL[I];
11379 // No need to shuffle duplicates for constants.
11380 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11381 ShuffledElements.setBit(I);
11382 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11383 continue;
11384 }
11385
11386 auto Res = UniqueElements.try_emplace(V, I);
11387 if (Res.second) {
11388 EstimateInsertCost(I, V);
11389 ShuffleMask[I] = I;
11390 continue;
11391 }
11392
11393 DuplicateNonConst = true;
11394 ShuffledElements.setBit(I);
11395 ShuffleMask[I] = Res.first->second;
11396 }
11397 if (ForPoisonSrc)
11398 Cost =
11399 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11400 /*Extract*/ false, CostKind);
11401 if (DuplicateNonConst)
11402 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
11403 VecTy, ShuffleMask);
11404 return Cost;
11405 }
11406
11407 // Perform operand reordering on the instructions in VL and return the reordered
11408 // operands in Left and Right.
reorderInputsAccordingToOpcode(ArrayRef<Value * > VL,SmallVectorImpl<Value * > & Left,SmallVectorImpl<Value * > & Right,const BoUpSLP & R)11409 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11410 SmallVectorImpl<Value *> &Left,
11411 SmallVectorImpl<Value *> &Right,
11412 const BoUpSLP &R) {
11413 if (VL.empty())
11414 return;
11415 VLOperands Ops(VL, R);
11416 // Reorder the operands in place.
11417 Ops.reorder();
11418 Left = Ops.getVL(0);
11419 Right = Ops.getVL(1);
11420 }
11421
getLastInstructionInBundle(const TreeEntry * E)11422 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11423 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11424 if (Res.second)
11425 return *Res.second;
11426 // Get the basic block this bundle is in. All instructions in the bundle
11427 // should be in this block (except for extractelement-like instructions with
11428 // constant indeces).
11429 auto *Front = E->getMainOp();
11430 auto *BB = Front->getParent();
11431 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11432 if (E->getOpcode() == Instruction::GetElementPtr &&
11433 !isa<GetElementPtrInst>(V))
11434 return true;
11435 auto *I = cast<Instruction>(V);
11436 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11437 isVectorLikeInstWithConstOps(I);
11438 }));
11439
11440 auto FindLastInst = [&]() {
11441 Instruction *LastInst = Front;
11442 for (Value *V : E->Scalars) {
11443 auto *I = dyn_cast<Instruction>(V);
11444 if (!I)
11445 continue;
11446 if (LastInst->getParent() == I->getParent()) {
11447 if (LastInst->comesBefore(I))
11448 LastInst = I;
11449 continue;
11450 }
11451 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11452 !isa<GetElementPtrInst>(I)) ||
11453 (isVectorLikeInstWithConstOps(LastInst) &&
11454 isVectorLikeInstWithConstOps(I))) &&
11455 "Expected vector-like or non-GEP in GEP node insts only.");
11456 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11457 LastInst = I;
11458 continue;
11459 }
11460 if (!DT->isReachableFromEntry(I->getParent()))
11461 continue;
11462 auto *NodeA = DT->getNode(LastInst->getParent());
11463 auto *NodeB = DT->getNode(I->getParent());
11464 assert(NodeA && "Should only process reachable instructions");
11465 assert(NodeB && "Should only process reachable instructions");
11466 assert((NodeA == NodeB) ==
11467 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11468 "Different nodes should have different DFS numbers");
11469 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11470 LastInst = I;
11471 }
11472 BB = LastInst->getParent();
11473 return LastInst;
11474 };
11475
11476 auto FindFirstInst = [&]() {
11477 Instruction *FirstInst = Front;
11478 for (Value *V : E->Scalars) {
11479 auto *I = dyn_cast<Instruction>(V);
11480 if (!I)
11481 continue;
11482 if (FirstInst->getParent() == I->getParent()) {
11483 if (I->comesBefore(FirstInst))
11484 FirstInst = I;
11485 continue;
11486 }
11487 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11488 !isa<GetElementPtrInst>(I)) ||
11489 (isVectorLikeInstWithConstOps(FirstInst) &&
11490 isVectorLikeInstWithConstOps(I))) &&
11491 "Expected vector-like or non-GEP in GEP node insts only.");
11492 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11493 FirstInst = I;
11494 continue;
11495 }
11496 if (!DT->isReachableFromEntry(I->getParent()))
11497 continue;
11498 auto *NodeA = DT->getNode(FirstInst->getParent());
11499 auto *NodeB = DT->getNode(I->getParent());
11500 assert(NodeA && "Should only process reachable instructions");
11501 assert(NodeB && "Should only process reachable instructions");
11502 assert((NodeA == NodeB) ==
11503 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11504 "Different nodes should have different DFS numbers");
11505 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11506 FirstInst = I;
11507 }
11508 return FirstInst;
11509 };
11510
11511 // Set the insert point to the beginning of the basic block if the entry
11512 // should not be scheduled.
11513 if (doesNotNeedToSchedule(E->Scalars) ||
11514 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11515 if ((E->getOpcode() == Instruction::GetElementPtr &&
11516 any_of(E->Scalars,
11517 [](Value *V) {
11518 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11519 })) ||
11520 all_of(E->Scalars,
11521 [](Value *V) {
11522 return !isVectorLikeInstWithConstOps(V) &&
11523 isUsedOutsideBlock(V);
11524 }) ||
11525 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
11526 return isa<ExtractElementInst, UndefValue>(V) ||
11527 areAllOperandsNonInsts(V);
11528 })))
11529 Res.second = FindLastInst();
11530 else
11531 Res.second = FindFirstInst();
11532 return *Res.second;
11533 }
11534
11535 // Find the last instruction. The common case should be that BB has been
11536 // scheduled, and the last instruction is VL.back(). So we start with
11537 // VL.back() and iterate over schedule data until we reach the end of the
11538 // bundle. The end of the bundle is marked by null ScheduleData.
11539 if (BlocksSchedules.count(BB)) {
11540 Value *V = E->isOneOf(E->Scalars.back());
11541 if (doesNotNeedToBeScheduled(V))
11542 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11543 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11544 if (Bundle && Bundle->isPartOfBundle())
11545 for (; Bundle; Bundle = Bundle->NextInBundle)
11546 if (Bundle->OpValue == Bundle->Inst)
11547 Res.second = Bundle->Inst;
11548 }
11549
11550 // LastInst can still be null at this point if there's either not an entry
11551 // for BB in BlocksSchedules or there's no ScheduleData available for
11552 // VL.back(). This can be the case if buildTree_rec aborts for various
11553 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11554 // size is reached, etc.). ScheduleData is initialized in the scheduling
11555 // "dry-run".
11556 //
11557 // If this happens, we can still find the last instruction by brute force. We
11558 // iterate forwards from Front (inclusive) until we either see all
11559 // instructions in the bundle or reach the end of the block. If Front is the
11560 // last instruction in program order, LastInst will be set to Front, and we
11561 // will visit all the remaining instructions in the block.
11562 //
11563 // One of the reasons we exit early from buildTree_rec is to place an upper
11564 // bound on compile-time. Thus, taking an additional compile-time hit here is
11565 // not ideal. However, this should be exceedingly rare since it requires that
11566 // we both exit early from buildTree_rec and that the bundle be out-of-order
11567 // (causing us to iterate all the way to the end of the block).
11568 if (!Res.second)
11569 Res.second = FindLastInst();
11570 assert(Res.second && "Failed to find last instruction in bundle");
11571 return *Res.second;
11572 }
11573
setInsertPointAfterBundle(const TreeEntry * E)11574 void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11575 auto *Front = E->getMainOp();
11576 Instruction *LastInst = &getLastInstructionInBundle(E);
11577 assert(LastInst && "Failed to find last instruction in bundle");
11578 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11579 // If the instruction is PHI, set the insert point after all the PHIs.
11580 bool IsPHI = isa<PHINode>(LastInst);
11581 if (IsPHI)
11582 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11583 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
11584 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11585 } else {
11586 // Set the insertion point after the last instruction in the bundle. Set the
11587 // debug location to Front.
11588 Builder.SetInsertPoint(
11589 LastInst->getParent(),
11590 LastInst->getNextNonDebugInstruction()->getIterator());
11591 }
11592 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11593 }
11594
gather(ArrayRef<Value * > VL,Value * Root,Type * ScalarTy)11595 Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11596 // List of instructions/lanes from current block and/or the blocks which are
11597 // part of the current loop. These instructions will be inserted at the end to
11598 // make it possible to optimize loops and hoist invariant instructions out of
11599 // the loops body with better chances for success.
11600 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11601 SmallSet<int, 4> PostponedIndices;
11602 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11603 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11604 SmallPtrSet<BasicBlock *, 4> Visited;
11605 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11606 InsertBB = InsertBB->getSinglePredecessor();
11607 return InsertBB && InsertBB == InstBB;
11608 };
11609 for (int I = 0, E = VL.size(); I < E; ++I) {
11610 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11611 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11612 getTreeEntry(Inst) ||
11613 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11614 PostponedIndices.insert(I).second)
11615 PostponedInsts.emplace_back(Inst, I);
11616 }
11617
11618 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11619 Type *Ty) {
11620 Value *Scalar = V;
11621 if (Scalar->getType() != Ty) {
11622 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11623 "Expected integer types only.");
11624 Value *V = Scalar;
11625 if (auto *CI = dyn_cast<CastInst>(Scalar);
11626 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11627 Value *Op = CI->getOperand(0);
11628 if (auto *IOp = dyn_cast<Instruction>(Op);
11629 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11630 V = Op;
11631 }
11632 Scalar = Builder.CreateIntCast(
11633 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11634 }
11635
11636 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11637 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11638 if (!InsElt)
11639 return Vec;
11640 GatherShuffleExtractSeq.insert(InsElt);
11641 CSEBlocks.insert(InsElt->getParent());
11642 // Add to our 'need-to-extract' list.
11643 if (isa<Instruction>(V)) {
11644 if (TreeEntry *Entry = getTreeEntry(V)) {
11645 // Find which lane we need to extract.
11646 User *UserOp = nullptr;
11647 if (Scalar != V) {
11648 if (auto *SI = dyn_cast<Instruction>(Scalar))
11649 UserOp = SI;
11650 } else {
11651 UserOp = InsElt;
11652 }
11653 if (UserOp) {
11654 unsigned FoundLane = Entry->findLaneForValue(V);
11655 ExternalUses.emplace_back(V, UserOp, FoundLane);
11656 }
11657 }
11658 }
11659 return Vec;
11660 };
11661 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11662 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11663 SmallVector<int> NonConsts;
11664 // Insert constant values at first.
11665 for (int I = 0, E = VL.size(); I < E; ++I) {
11666 if (PostponedIndices.contains(I))
11667 continue;
11668 if (!isConstant(VL[I])) {
11669 NonConsts.push_back(I);
11670 continue;
11671 }
11672 if (Root) {
11673 if (!isa<UndefValue>(VL[I])) {
11674 NonConsts.push_back(I);
11675 continue;
11676 }
11677 if (isa<PoisonValue>(VL[I]))
11678 continue;
11679 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11680 if (SV->getMaskValue(I) == PoisonMaskElem)
11681 continue;
11682 }
11683 }
11684 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11685 }
11686 // Insert non-constant values.
11687 for (int I : NonConsts)
11688 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11689 // Append instructions, which are/may be part of the loop, in the end to make
11690 // it possible to hoist non-loop-based instructions.
11691 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11692 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11693
11694 return Vec;
11695 }
11696
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
/// The class also will look through the previously emitted shuffle instructions
/// and properly mark indices in mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
/// look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If 2 operands are of different size, the smallest one will be resized and
/// the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
/// look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
11729 class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11730 bool IsFinalized = false;
11731 /// Combined mask for all applied operands and masks. It is built during
11732 /// analysis and actual emission of shuffle vector instructions.
11733 SmallVector<int> CommonMask;
11734 /// List of operands for the shuffle vector instruction. It hold at max 2
11735 /// operands, if the 3rd is going to be added, the first 2 are combined into
11736 /// shuffle with \p CommonMask mask, the first operand sets to be the
11737 /// resulting shuffle and the second operand sets to be the newly added
11738 /// operand. The \p CommonMask is transformed in the proper way after that.
11739 SmallVector<Value *, 2> InVectors;
11740 Type *ScalarTy = nullptr;
11741 IRBuilderBase &Builder;
11742 BoUpSLP &R;
11743
11744 class ShuffleIRBuilder {
11745 IRBuilderBase &Builder;
11746 /// Holds all of the instructions that we gathered.
11747 SetVector<Instruction *> &GatherShuffleExtractSeq;
11748 /// A list of blocks that we are going to CSE.
11749 DenseSet<BasicBlock *> &CSEBlocks;
11750 /// Data layout.
11751 const DataLayout &DL;
11752
11753 public:
ShuffleIRBuilder(IRBuilderBase & Builder,SetVector<Instruction * > & GatherShuffleExtractSeq,DenseSet<BasicBlock * > & CSEBlocks,const DataLayout & DL)11754 ShuffleIRBuilder(IRBuilderBase &Builder,
11755 SetVector<Instruction *> &GatherShuffleExtractSeq,
11756 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11757 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11758 CSEBlocks(CSEBlocks), DL(DL) {}
11759 ~ShuffleIRBuilder() = default;
11760 /// Creates shufflevector for the 2 operands with the given mask.
createShuffleVector(Value * V1,Value * V2,ArrayRef<int> Mask)11761 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11762 if (V1->getType() != V2->getType()) {
11763 assert(V1->getType()->isIntOrIntVectorTy() &&
11764 V1->getType()->isIntOrIntVectorTy() &&
11765 "Expected integer vector types only.");
11766 if (V1->getType() != V2->getType()) {
11767 if (cast<VectorType>(V2->getType())
11768 ->getElementType()
11769 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11770 ->getElementType()
11771 ->getIntegerBitWidth())
11772 V2 = Builder.CreateIntCast(
11773 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11774 else
11775 V1 = Builder.CreateIntCast(
11776 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11777 }
11778 }
11779 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11780 if (auto *I = dyn_cast<Instruction>(Vec)) {
11781 GatherShuffleExtractSeq.insert(I);
11782 CSEBlocks.insert(I->getParent());
11783 }
11784 return Vec;
11785 }
11786 /// Creates permutation of the single vector operand with the given mask, if
11787 /// it is not identity mask.
createShuffleVector(Value * V1,ArrayRef<int> Mask)11788 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11789 if (Mask.empty())
11790 return V1;
11791 unsigned VF = Mask.size();
11792 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11793 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11794 return V1;
11795 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11796 if (auto *I = dyn_cast<Instruction>(Vec)) {
11797 GatherShuffleExtractSeq.insert(I);
11798 CSEBlocks.insert(I->getParent());
11799 }
11800 return Vec;
11801 }
createIdentity(Value * V)11802 Value *createIdentity(Value *V) { return V; }
createPoison(Type * Ty,unsigned VF)11803 Value *createPoison(Type *Ty, unsigned VF) {
11804 return PoisonValue::get(getWidenedType(Ty, VF));
11805 }
11806 /// Resizes 2 input vector to match the sizes, if the they are not equal
11807 /// yet. The smallest vector is resized to the size of the larger vector.
resizeToMatch(Value * & V1,Value * & V2)11808 void resizeToMatch(Value *&V1, Value *&V2) {
11809 if (V1->getType() == V2->getType())
11810 return;
11811 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11812 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11813 int VF = std::max(V1VF, V2VF);
11814 int MinVF = std::min(V1VF, V2VF);
11815 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11816 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11817 0);
11818 Value *&Op = MinVF == V1VF ? V1 : V2;
11819 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11820 if (auto *I = dyn_cast<Instruction>(Op)) {
11821 GatherShuffleExtractSeq.insert(I);
11822 CSEBlocks.insert(I->getParent());
11823 }
11824 if (MinVF == V1VF)
11825 V1 = Op;
11826 else
11827 V2 = Op;
11828 }
11829 };
11830
11831 /// Smart shuffle instruction emission, walks through shuffles trees and
11832 /// tries to find the best matching vector for the actual shuffle
11833 /// instruction.
createShuffle(Value * V1,Value * V2,ArrayRef<int> Mask)11834 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11835 assert(V1 && "Expected at least one vector value.");
11836 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11837 R.CSEBlocks, *R.DL);
11838 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11839 ShuffleBuilder);
11840 }
11841
11842 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11843 /// shuffle emission.
transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,ArrayRef<int> Mask)11844 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11845 ArrayRef<int> Mask) {
11846 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11847 if (Mask[Idx] != PoisonMaskElem)
11848 CommonMask[Idx] = Idx;
11849 }
11850
11851 /// Cast value \p V to the vector type with the same number of elements, but
11852 /// the base type \p ScalarTy.
castToScalarTyElem(Value * V,std::optional<bool> IsSigned=std::nullopt)11853 Value *castToScalarTyElem(Value *V,
11854 std::optional<bool> IsSigned = std::nullopt) {
11855 auto *VecTy = cast<VectorType>(V->getType());
11856 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
11857 if (VecTy->getElementType() == ScalarTy->getScalarType())
11858 return V;
11859 return Builder.CreateIntCast(
11860 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
11861 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11862 }
11863
11864 public:
ShuffleInstructionBuilder(Type * ScalarTy,IRBuilderBase & Builder,BoUpSLP & R)11865 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11866 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11867
11868 /// Adjusts extractelements after reusing them.
adjustExtracts(const TreeEntry * E,MutableArrayRef<int> Mask,ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,unsigned NumParts,bool & UseVecBaseAsInput)11869 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11870 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11871 unsigned NumParts, bool &UseVecBaseAsInput) {
11872 UseVecBaseAsInput = false;
11873 SmallPtrSet<Value *, 4> UniqueBases;
11874 Value *VecBase = nullptr;
11875 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11876 int Idx = Mask[I];
11877 if (Idx == PoisonMaskElem)
11878 continue;
11879 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11880 VecBase = EI->getVectorOperand();
11881 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11882 VecBase = TE->VectorizedValue;
11883 assert(VecBase && "Expected vectorized value.");
11884 UniqueBases.insert(VecBase);
11885 // If the only one use is vectorized - can delete the extractelement
11886 // itself.
11887 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11888 any_of(EI->users(), [&](User *U) {
11889 const TreeEntry *UTE = R.getTreeEntry(U);
11890 return !UTE || R.MultiNodeScalars.contains(U) ||
11891 (isa<GetElementPtrInst>(U) &&
11892 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11893 count_if(R.VectorizableTree,
11894 [&](const std::unique_ptr<TreeEntry> &TE) {
11895 return any_of(TE->UserTreeIndices,
11896 [&](const EdgeInfo &Edge) {
11897 return Edge.UserTE == UTE;
11898 }) &&
11899 is_contained(TE->Scalars, EI);
11900 }) != 1;
11901 }))
11902 continue;
11903 R.eraseInstruction(EI);
11904 }
11905 if (NumParts == 1 || UniqueBases.size() == 1) {
11906 assert(VecBase && "Expected vectorized value.");
11907 return castToScalarTyElem(VecBase);
11908 }
11909 UseVecBaseAsInput = true;
11910 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11911 for (auto [I, Idx] : enumerate(Mask))
11912 if (Idx != PoisonMaskElem)
11913 Idx = I;
11914 };
11915 // Perform multi-register vector shuffle, joining them into a single virtual
11916 // long vector.
11917 // Need to shuffle each part independently and then insert all this parts
11918 // into a long virtual vector register, forming the original vector.
11919 Value *Vec = nullptr;
11920 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11921 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
11922 for (unsigned Part : seq<unsigned>(NumParts)) {
11923 unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
11924 ArrayRef<Value *> VL =
11925 ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
11926 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
11927 constexpr int MaxBases = 2;
11928 SmallVector<Value *, MaxBases> Bases(MaxBases);
11929 auto VLMask = zip(VL, SubMask);
11930 const unsigned VF = std::accumulate(
11931 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
11932 if (std::get<1>(D) == PoisonMaskElem)
11933 return S;
11934 Value *VecOp =
11935 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
11936 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11937 VecOp = TE->VectorizedValue;
11938 assert(VecOp && "Expected vectorized value.");
11939 const unsigned Size =
11940 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11941 return std::max(S, Size);
11942 });
11943 for (const auto [V, I] : VLMask) {
11944 if (I == PoisonMaskElem)
11945 continue;
11946 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11947 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11948 VecOp = TE->VectorizedValue;
11949 assert(VecOp && "Expected vectorized value.");
11950 VecOp = castToScalarTyElem(VecOp);
11951 Bases[I / VF] = VecOp;
11952 }
11953 if (!Bases.front())
11954 continue;
11955 Value *SubVec;
11956 if (Bases.back()) {
11957 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11958 TransformToIdentity(SubMask);
11959 } else {
11960 SubVec = Bases.front();
11961 }
11962 if (!Vec) {
11963 Vec = SubVec;
11964 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11965 [&](unsigned P) {
11966 ArrayRef<int> SubMask =
11967 Mask.slice(P * SliceSize,
11968 getNumElems(Mask.size(),
11969 SliceSize, P));
11970 return all_of(SubMask, [](int Idx) {
11971 return Idx == PoisonMaskElem;
11972 });
11973 })) &&
11974 "Expected first part or all previous parts masked.");
11975 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11976 } else {
11977 unsigned NewVF =
11978 cast<FixedVectorType>(Vec->getType())->getNumElements();
11979 if (Vec->getType() != SubVec->getType()) {
11980 unsigned SubVecVF =
11981 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11982 NewVF = std::max(NewVF, SubVecVF);
11983 }
11984 // Adjust SubMask.
11985 for (int &Idx : SubMask)
11986 if (Idx != PoisonMaskElem)
11987 Idx += NewVF;
11988 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11989 Vec = createShuffle(Vec, SubVec, VecMask);
11990 TransformToIdentity(VecMask);
11991 }
11992 }
11993 copy(VecMask, Mask.begin());
11994 return Vec;
11995 }
11996 /// Checks if the specified entry \p E needs to be delayed because of its
11997 /// dependency nodes.
11998 std::optional<Value *>
needToDelay(const TreeEntry * E,ArrayRef<SmallVector<const TreeEntry * >> Deps) const11999 needToDelay(const TreeEntry *E,
12000 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12001 // No need to delay emission if all deps are ready.
12002 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
12003 return all_of(
12004 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
12005 }))
12006 return std::nullopt;
12007 // Postpone gather emission, will be emitted after the end of the
12008 // process to keep correct order.
12009 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
12010 return Builder.CreateAlignedLoad(
12011 ResVecTy,
12012 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
12013 MaybeAlign());
12014 }
12015 /// Adds 2 input vectors (in form of tree entries) and the mask for their
12016 /// shuffling.
add(const TreeEntry & E1,const TreeEntry & E2,ArrayRef<int> Mask)12017 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12018 Value *V1 = E1.VectorizedValue;
12019 if (V1->getType()->isIntOrIntVectorTy())
12020 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12021 return !isKnownNonNegative(
12022 V, SimplifyQuery(*R.DL));
12023 }));
12024 Value *V2 = E2.VectorizedValue;
12025 if (V2->getType()->isIntOrIntVectorTy())
12026 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
12027 return !isKnownNonNegative(
12028 V, SimplifyQuery(*R.DL));
12029 }));
12030 add(V1, V2, Mask);
12031 }
12032 /// Adds single input vector (in form of tree entry) and the mask for its
12033 /// shuffling.
add(const TreeEntry & E1,ArrayRef<int> Mask)12034 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12035 Value *V1 = E1.VectorizedValue;
12036 if (V1->getType()->isIntOrIntVectorTy())
12037 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
12038 return !isKnownNonNegative(
12039 V, SimplifyQuery(*R.DL));
12040 }));
12041 add(V1, Mask);
12042 }
12043 /// Adds 2 input vectors and the mask for their shuffling.
add(Value * V1,Value * V2,ArrayRef<int> Mask)12044 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12045 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12046 V1 = castToScalarTyElem(V1);
12047 V2 = castToScalarTyElem(V2);
12048 if (InVectors.empty()) {
12049 InVectors.push_back(V1);
12050 InVectors.push_back(V2);
12051 CommonMask.assign(Mask.begin(), Mask.end());
12052 return;
12053 }
12054 Value *Vec = InVectors.front();
12055 if (InVectors.size() == 2) {
12056 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12057 transformMaskAfterShuffle(CommonMask, CommonMask);
12058 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
12059 Mask.size()) {
12060 Vec = createShuffle(Vec, nullptr, CommonMask);
12061 transformMaskAfterShuffle(CommonMask, CommonMask);
12062 }
12063 V1 = createShuffle(V1, V2, Mask);
12064 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12065 if (Mask[Idx] != PoisonMaskElem)
12066 CommonMask[Idx] = Idx + Sz;
12067 InVectors.front() = Vec;
12068 if (InVectors.size() == 2)
12069 InVectors.back() = V1;
12070 else
12071 InVectors.push_back(V1);
12072 }
12073 /// Adds another one input vector and the mask for the shuffling.
add(Value * V1,ArrayRef<int> Mask,bool=false)12074 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12075 V1 = castToScalarTyElem(V1);
12076 if (InVectors.empty()) {
12077 if (!isa<FixedVectorType>(V1->getType())) {
12078 V1 = createShuffle(V1, nullptr, CommonMask);
12079 CommonMask.assign(Mask.size(), PoisonMaskElem);
12080 transformMaskAfterShuffle(CommonMask, Mask);
12081 }
12082 InVectors.push_back(V1);
12083 CommonMask.assign(Mask.begin(), Mask.end());
12084 return;
12085 }
12086 const auto *It = find(InVectors, V1);
12087 if (It == InVectors.end()) {
12088 if (InVectors.size() == 2 ||
12089 InVectors.front()->getType() != V1->getType() ||
12090 !isa<FixedVectorType>(V1->getType())) {
12091 Value *V = InVectors.front();
12092 if (InVectors.size() == 2) {
12093 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12094 transformMaskAfterShuffle(CommonMask, CommonMask);
12095 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
12096 CommonMask.size()) {
12097 V = createShuffle(InVectors.front(), nullptr, CommonMask);
12098 transformMaskAfterShuffle(CommonMask, CommonMask);
12099 }
12100 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12101 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12102 CommonMask[Idx] =
12103 V->getType() != V1->getType()
12104 ? Idx + Sz
12105 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
12106 ->getNumElements();
12107 if (V->getType() != V1->getType())
12108 V1 = createShuffle(V1, nullptr, Mask);
12109 InVectors.front() = V;
12110 if (InVectors.size() == 2)
12111 InVectors.back() = V1;
12112 else
12113 InVectors.push_back(V1);
12114 return;
12115 }
12116 // Check if second vector is required if the used elements are already
12117 // used from the first one.
12118 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12119 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12120 InVectors.push_back(V1);
12121 break;
12122 }
12123 }
12124 int VF = CommonMask.size();
12125 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
12126 VF = FTy->getNumElements();
12127 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12128 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12129 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12130 }
12131 /// Adds another one input vector and the mask for the shuffling.
addOrdered(Value * V1,ArrayRef<unsigned> Order)12132 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12133 SmallVector<int> NewMask;
12134 inversePermutation(Order, NewMask);
12135 add(V1, NewMask);
12136 }
gather(ArrayRef<Value * > VL,unsigned MaskVF=0,Value * Root=nullptr)12137 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12138 Value *Root = nullptr) {
12139 return R.gather(VL, Root, ScalarTy);
12140 }
createFreeze(Value * V)12141 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12142 /// Finalize emission of the shuffles.
12143 /// \param Action the action (if any) to be performed before final applying of
12144 /// the \p ExtMask mask.
12145 Value *
finalize(ArrayRef<int> ExtMask,unsigned VF=0,function_ref<void (Value * &,SmallVectorImpl<int> &)> Action={})12146 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12147 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12148 IsFinalized = true;
12149 if (Action) {
12150 Value *Vec = InVectors.front();
12151 if (InVectors.size() == 2) {
12152 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
12153 InVectors.pop_back();
12154 } else {
12155 Vec = createShuffle(Vec, nullptr, CommonMask);
12156 }
12157 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12158 if (CommonMask[Idx] != PoisonMaskElem)
12159 CommonMask[Idx] = Idx;
12160 assert(VF > 0 &&
12161 "Expected vector length for the final value before action.");
12162 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
12163 if (VecVF < VF) {
12164 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12165 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
12166 Vec = createShuffle(Vec, nullptr, ResizeMask);
12167 }
12168 Action(Vec, CommonMask);
12169 InVectors.front() = Vec;
12170 }
12171 if (!ExtMask.empty()) {
12172 if (CommonMask.empty()) {
12173 CommonMask.assign(ExtMask.begin(), ExtMask.end());
12174 } else {
12175 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12176 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12177 if (ExtMask[I] == PoisonMaskElem)
12178 continue;
12179 NewMask[I] = CommonMask[ExtMask[I]];
12180 }
12181 CommonMask.swap(NewMask);
12182 }
12183 }
12184 if (CommonMask.empty()) {
12185 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12186 return InVectors.front();
12187 }
12188 if (InVectors.size() == 2)
12189 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
12190 return createShuffle(InVectors.front(), nullptr, CommonMask);
12191 }
12192
~ShuffleInstructionBuilder()12193 ~ShuffleInstructionBuilder() {
12194 assert((IsFinalized || CommonMask.empty()) &&
12195 "Shuffle construction must be finalized.");
12196 }
12197 };
12198
vectorizeOperand(TreeEntry * E,unsigned NodeIdx,bool PostponedPHIs)12199 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12200 bool PostponedPHIs) {
12201 ValueList &VL = E->getOperand(NodeIdx);
12202 const unsigned VF = VL.size();
12203 InstructionsState S = getSameOpcode(VL, *TLI);
12204 // Special processing for GEPs bundle, which may include non-gep values.
12205 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12206 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
12207 if (It != VL.end())
12208 S = getSameOpcode(*It, *TLI);
12209 }
12210 if (S.getOpcode()) {
12211 auto CheckSameVE = [&](const TreeEntry *VE) {
12212 return VE->isSame(VL) &&
12213 (any_of(VE->UserTreeIndices,
12214 [E, NodeIdx](const EdgeInfo &EI) {
12215 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12216 }) ||
12217 any_of(VectorizableTree,
12218 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12219 return TE->isOperandGatherNode({E, NodeIdx}) &&
12220 VE->isSame(TE->Scalars);
12221 }));
12222 };
12223 TreeEntry *VE = getTreeEntry(S.OpValue);
12224 bool IsSameVE = VE && CheckSameVE(VE);
12225 if (!IsSameVE) {
12226 auto It = MultiNodeScalars.find(S.OpValue);
12227 if (It != MultiNodeScalars.end()) {
12228 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12229 return TE != VE && CheckSameVE(TE);
12230 });
12231 if (I != It->getSecond().end()) {
12232 VE = *I;
12233 IsSameVE = true;
12234 }
12235 }
12236 }
12237 if (IsSameVE) {
12238 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12239 ShuffleInstructionBuilder ShuffleBuilder(
12240 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12241 ShuffleBuilder.add(V, Mask);
12242 return ShuffleBuilder.finalize(std::nullopt);
12243 };
12244 Value *V = vectorizeTree(VE, PostponedPHIs);
12245 if (VF * getNumElements(VL[0]->getType()) !=
12246 cast<FixedVectorType>(V->getType())->getNumElements()) {
12247 if (!VE->ReuseShuffleIndices.empty()) {
12248 // Reshuffle to get only unique values.
12249 // If some of the scalars are duplicated in the vectorization
12250 // tree entry, we do not vectorize them but instead generate a
12251 // mask for the reuses. But if there are several users of the
12252 // same entry, they may have different vectorization factors.
12253 // This is especially important for PHI nodes. In this case, we
12254 // need to adapt the resulting instruction for the user
12255 // vectorization factor and have to reshuffle it again to take
12256 // only unique elements of the vector. Without this code the
12257 // function incorrectly returns reduced vector instruction with
12258 // the same elements, not with the unique ones.
12259
12260 // block:
12261 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12262 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12263 // ... (use %2)
12264 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12265 // br %block
12266 SmallVector<int> Mask(VF, PoisonMaskElem);
12267 for (auto [I, V] : enumerate(VL)) {
12268 if (isa<PoisonValue>(V))
12269 continue;
12270 Mask[I] = VE->findLaneForValue(V);
12271 }
12272 V = FinalShuffle(V, Mask);
12273 } else {
12274 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12275 "Expected vectorization factor less "
12276 "than original vector size.");
12277 SmallVector<int> UniformMask(VF, 0);
12278 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12279 V = FinalShuffle(V, UniformMask);
12280 }
12281 }
12282 // Need to update the operand gather node, if actually the operand is not a
12283 // vectorized node, but the buildvector/gather node, which matches one of
12284 // the vectorized nodes.
12285 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12286 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12287 }) == VE->UserTreeIndices.end()) {
12288 auto *It = find_if(
12289 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12290 return TE->isGather() &&
12291 TE->UserTreeIndices.front().UserTE == E &&
12292 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12293 });
12294 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12295 (*It)->VectorizedValue = V;
12296 }
12297 return V;
12298 }
12299 }
12300
12301 // Find the corresponding gather entry and vectorize it.
12302 // Allows to be more accurate with tree/graph transformations, checks for the
12303 // correctness of the transformations in many cases.
12304 auto *I = find_if(VectorizableTree,
12305 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12306 return TE->isOperandGatherNode({E, NodeIdx});
12307 });
12308 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12309 assert(I->get()->UserTreeIndices.size() == 1 &&
12310 "Expected only single user for the gather node.");
12311 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12312 return vectorizeTree(I->get(), PostponedPHIs);
12313 }
12314
// Produces the result (IR value or cost, depending on the BVTy/ResTy
// instantiation) for a gather/buildvector node E with element type ScalarTy.
// Extra Params are forwarded to the BVTy shuffle-builder constructor (see
// createBuildVector, which instantiates this with ShuffleInstructionBuilder
// and Value *). Strategy: prefer reusing vectors already available — the
// source vectors of gathered extractelements and previously vectorized tree
// entries whose scalars match — and only gather the scalars that remain;
// all pieces are fed to the shuffle builder and combined in finalize().
12315 template <typename BVTy, typename ResTy, typename... Args>
processBuildVector(const TreeEntry * E,Type * ScalarTy,Args &...Params)12316 ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12317 Args &...Params) {
12318 assert(E->isGather() && "Expected gather node.");
12319 unsigned VF = E->getVectorFactor();
12320
12321 bool NeedFreeze = false;
12322 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12323 E->ReuseShuffleIndices.end());
12324 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12325 // Build a mask out of the reorder indices and reorder scalars per this
12326 // mask.
12327 SmallVector<int> ReorderMask;
12328 inversePermutation(E->ReorderIndices, ReorderMask);
12329 if (!ReorderMask.empty())
12330 reorderScalars(GatheredScalars, ReorderMask);
// FindReusedSplat: for a splat node (that still has non-poison undef lanes)
// which is one of exactly two operands of its user node, rewrite the slice
// [I*SliceSize, I*SliceSize+Limit) of Mask so the existing input vector can
// be reused directly — an iota (identity) sequence when the mask already
// selects an identity/leading subvector, otherwise a broadcast of the single
// used lane. Returns true if the mask was adjusted, false if the pattern
// does not apply.
12331 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12332 unsigned I, unsigned SliceSize) {
12333 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12334 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12335 }))
12336 return false;
12337 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12338 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12339 if (UserTE->getNumOperands() != 2)
12340 return false;
// Require that the user's other operand is also a tree entry; otherwise the
// splat reuse is not applied.
12341 auto *It =
12342 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12343 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12344 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12345 }) != TE->UserTreeIndices.end();
12346 });
12347 if (It == VectorizableTree.end())
12348 return false;
12349 int Idx;
12350 if ((Mask.size() < InputVF &&
12351 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12352 Idx == 0) ||
12353 (Mask.size() == InputVF &&
12354 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12355 std::iota(
12356 std::next(Mask.begin(), I * SliceSize),
12357 std::next(Mask.begin(),
12358 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12359 0);
12360 } else {
// Broadcast: fill the slice with the first non-poison lane index.
12361 unsigned IVal =
12362 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12363 std::fill(
12364 std::next(Mask.begin(), I * SliceSize),
12365 std::next(Mask.begin(),
12366 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
12367 IVal);
12368 }
12369 return true;
12370 };
// The builder accumulates inputs/masks via add() and combines them when
// finalize() is called.
12371 BVTy ShuffleBuilder(ScalarTy, Params...);
12372 ResTy Res = ResTy();
12373 SmallVector<int> Mask;
12374 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12375 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12376 Value *ExtractVecBase = nullptr;
12377 bool UseVecBaseAsInput = false;
12378 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
12379 SmallVector<SmallVector<const TreeEntry *>> Entries;
12380 Type *OrigScalarTy = GatheredScalars.front()->getType();
12381 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
// Number of target registers ("parts") this vector type splits into; clamp
// to 1 when unknown (0) or when there would be fewer scalars than parts.
12382 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12383 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12384 NumParts = 1;
12385 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12386 // Check for gathered extracts.
12387 bool Resized = false;
12388 ExtractShuffles =
12389 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12390 if (!ExtractShuffles.empty()) {
12391 SmallVector<const TreeEntry *> ExtractEntries;
12392 for (auto [Idx, I] : enumerate(ExtractMask)) {
12393 if (I == PoisonMaskElem)
12394 continue;
12395 if (const auto *TE = getTreeEntry(
12396 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12397 ExtractEntries.push_back(TE);
12398 }
12399 if (std::optional<ResTy> Delayed =
12400 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12401 // Delay emission of gathers which are not ready yet.
12402 PostponedGathers.insert(E);
12403 // Postpone gather emission, will be emitted after the end of the
12404 // process to keep correct order.
12405 return *Delayed;
12406 }
12407 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12408 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12409 ExtractVecBase = VecBase;
12410 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12411 if (VF == VecBaseTy->getNumElements() &&
12412 GatheredScalars.size() != VF) {
// Pad the scalars with poison up to the vectorization factor so masks and
// scalars stay the same length.
12413 Resized = true;
12414 GatheredScalars.append(VF - GatheredScalars.size(),
12415 PoisonValue::get(OrigScalarTy));
12416 }
12417 }
12418 }
12419 // Gather extracts after we check for full matched gathers only.
12420 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12421 E->isAltShuffle() ||
12422 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12423 isSplat(E->Scalars) ||
12424 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12425 GatherShuffles =
12426 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12427 }
12428 if (!GatherShuffles.empty()) {
12429 if (std::optional<ResTy> Delayed =
12430 ShuffleBuilder.needToDelay(E, Entries)) {
12431 // Delay emission of gathers which are not ready yet.
12432 PostponedGathers.insert(E);
12433 // Postpone gather emission, will be emitted after the end of the
12434 // process to keep correct order.
12435 return *Delayed;
12436 }
12437 if (GatherShuffles.size() == 1 &&
12438 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12439 Entries.front().front()->isSame(E->Scalars)) {
12440 // Perfect match in the graph, will reuse the previously vectorized
12441 // node. Cost is 0.
12442 LLVM_DEBUG(
12443 dbgs()
12444 << "SLP: perfect diamond match for gather bundle "
12445 << shortBundleName(E->Scalars) << ".\n");
12446 // Restore the mask for previous partially matched values.
12447 Mask.resize(E->Scalars.size());
12448 const TreeEntry *FrontTE = Entries.front().front();
12449 if (FrontTE->ReorderIndices.empty() &&
12450 ((FrontTE->ReuseShuffleIndices.empty() &&
12451 E->Scalars.size() == FrontTE->Scalars.size()) ||
12452 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12453 std::iota(Mask.begin(), Mask.end(), 0);
12454 } else {
12455 for (auto [I, V] : enumerate(E->Scalars)) {
12456 if (isa<PoisonValue>(V)) {
12457 Mask[I] = PoisonMaskElem;
12458 continue;
12459 }
12460 Mask[I] = FrontTE->findLaneForValue(V);
12461 }
12462 }
12463 ShuffleBuilder.add(*FrontTE, Mask);
12464 Res = ShuffleBuilder.finalize(E->getCommonMask());
12465 return Res;
12466 }
12467 if (!Resized) {
12468 if (GatheredScalars.size() != VF &&
12469 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12470 return any_of(TEs, [&](const TreeEntry *TE) {
12471 return TE->getVectorFactor() == VF;
12472 });
12473 }))
12474 GatheredScalars.append(VF - GatheredScalars.size(),
12475 PoisonValue::get(OrigScalarTy));
12476 }
12477 // Remove shuffled elements from list of gathers.
12478 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12479 if (Mask[I] != PoisonMaskElem)
12480 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12481 }
12482 }
12483 }
// TryPackScalars: compact Scalars in place for a single buildvector —
// deduplicate repeated non-constant values (or move a splat value to lane 0)
// — and fill ReuseMask so a shuffle can restore the original lane layout.
// May set NeedFreeze when undef lanes had to be replaced by poison and the
// broadcast result must be frozen to stay defined.
12484 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12485 SmallVectorImpl<int> &ReuseMask,
12486 bool IsRootPoison) {
12487 // For splats we can emit broadcasts instead of gathers, so try to find
12488 // such sequences.
12489 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12490 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12491 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12492 SmallVector<int> UndefPos;
12493 DenseMap<Value *, unsigned> UniquePositions;
12494 // Gather unique non-const values and all constant values.
12495 // For repeated values, just shuffle them.
12496 int NumNonConsts = 0;
12497 int SinglePos = 0;
12498 for (auto [I, V] : enumerate(Scalars)) {
12499 if (isa<UndefValue>(V)) {
12500 if (!isa<PoisonValue>(V)) {
12501 ReuseMask[I] = I;
12502 UndefPos.push_back(I);
12503 }
12504 continue;
12505 }
12506 if (isConstant(V)) {
12507 ReuseMask[I] = I;
12508 continue;
12509 }
12510 ++NumNonConsts;
12511 SinglePos = I;
12512 Value *OrigV = V;
12513 Scalars[I] = PoisonValue::get(OrigScalarTy);
12514 if (IsSplat) {
12515 Scalars.front() = OrigV;
12516 ReuseMask[I] = 0;
12517 } else {
12518 const auto Res = UniquePositions.try_emplace(OrigV, I);
12519 Scalars[Res.first->second] = OrigV;
12520 ReuseMask[I] = Res.first->second;
12521 }
12522 }
12523 if (NumNonConsts == 1) {
12524 // Restore single insert element.
12525 if (IsSplat) {
12526 ReuseMask.assign(VF, PoisonMaskElem);
12527 std::swap(Scalars.front(), Scalars[SinglePos]);
12528 if (!UndefPos.empty() && UndefPos.front() == 0)
12529 Scalars.front() = UndefValue::get(OrigScalarTy);
12530 }
12531 ReuseMask[SinglePos] = SinglePos;
12532 } else if (!UndefPos.empty() && IsSplat) {
12533 // For undef values, try to replace them with the simple broadcast.
12534 // We can do it if the broadcasted value is guaranteed to be
12535 // non-poisonous, or by freezing the incoming scalar value first.
12536 auto *It = find_if(Scalars, [this, E](Value *V) {
12537 return !isa<UndefValue>(V) &&
12538 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12539 (E->UserTreeIndices.size() == 1 &&
12540 any_of(V->uses(), [E](const Use &U) {
12541 // Check if the value is already used in the same operation
12542 // in one of the nodes already.
12543 return E->UserTreeIndices.front().EdgeIdx !=
12544 U.getOperandNo() &&
12545 is_contained(
12546 E->UserTreeIndices.front().UserTE->Scalars,
12547 U.getUser());
12548 })));
12549 });
12550 if (It != Scalars.end()) {
12551 // Replace undefs by the non-poisoned scalars and emit broadcast.
12552 int Pos = std::distance(Scalars.begin(), It);
12553 for (int I : UndefPos) {
12554 // Set the undef position to the non-poisoned scalar.
12555 ReuseMask[I] = Pos;
12556 // Replace the undef by the poison, in the mask it is replaced by
12557 // non-poisoned scalar already.
12558 if (I != Pos)
12559 Scalars[I] = PoisonValue::get(OrigScalarTy);
12560 }
12561 } else {
12562 // Replace undefs by the poisons, emit broadcast and then emit
12563 // freeze.
12564 for (int I : UndefPos) {
12565 ReuseMask[I] = PoisonMaskElem;
12566 if (isa<UndefValue>(Scalars[I]))
12567 Scalars[I] = PoisonValue::get(OrigScalarTy);
12568 }
12569 NeedFreeze = true;
12570 }
12571 }
12572 };
// At least one reusable input was found: feed the extract-based and/or
// tree-entry-based inputs to the builder, then gather only what remains.
12573 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12574 bool IsNonPoisoned = true;
12575 bool IsUsedInExpr = true;
12576 Value *Vec1 = nullptr;
12577 if (!ExtractShuffles.empty()) {
12578 // Gather of extractelements can be represented as just a shuffle of
12579 // a single/two vectors the scalars are extracted from.
12580 // Find input vectors.
12581 Value *Vec2 = nullptr;
12582 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12583 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12584 ExtractMask[I] = PoisonMaskElem;
12585 }
12586 if (UseVecBaseAsInput) {
12587 Vec1 = ExtractVecBase;
12588 } else {
12589 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12590 if (ExtractMask[I] == PoisonMaskElem)
12591 continue;
12592 if (isa<UndefValue>(E->Scalars[I]))
12593 continue;
12594 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12595 Value *VecOp = EI->getVectorOperand();
12596 if (const auto *TE = getTreeEntry(VecOp))
12597 if (TE->VectorizedValue)
12598 VecOp = TE->VectorizedValue;
12599 if (!Vec1) {
12600 Vec1 = VecOp;
12601 } else if (Vec1 != VecOp) {
12602 assert((!Vec2 || Vec2 == VecOp) &&
12603 "Expected only 1 or 2 vectors shuffle.");
12604 Vec2 = VecOp;
12605 }
12606 }
12607 }
12608 if (Vec2) {
12609 IsUsedInExpr = false;
12610 IsNonPoisoned &=
12611 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12612 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12613 } else if (Vec1) {
12614 IsUsedInExpr &= FindReusedSplat(
12615 ExtractMask,
12616 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12617 ExtractMask.size());
12618 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12619 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12620 } else {
12621 IsUsedInExpr = false;
12622 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12623 /*ForExtracts=*/true);
12624 }
12625 }
12626 if (!GatherShuffles.empty()) {
// Process matched tree entries part by part; each part gets its own slice
// of the gather mask.
12627 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
12628 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12629 for (const auto [I, TEs] : enumerate(Entries)) {
12630 if (TEs.empty()) {
12631 assert(!GatherShuffles[I] &&
12632 "No shuffles with empty entries list expected.");
12633 continue;
12634 }
12635 assert((TEs.size() == 1 || TEs.size() == 2) &&
12636 "Expected shuffle of 1 or 2 entries.");
12637 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
12638 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
12639 VecMask.assign(VecMask.size(), PoisonMaskElem);
12640 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12641 if (TEs.size() == 1) {
12642 IsUsedInExpr &= FindReusedSplat(
12643 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12644 ShuffleBuilder.add(*TEs.front(), VecMask);
12645 if (TEs.front()->VectorizedValue)
12646 IsNonPoisoned &=
12647 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12648 } else {
12649 IsUsedInExpr = false;
12650 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12651 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12652 IsNonPoisoned &=
12653 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12654 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12655 }
12656 }
12657 }
12658 // Try to figure out best way to combine values: build a shuffle and insert
12659 // elements or just build several shuffles.
12660 // Insert non-constant scalars.
12661 SmallVector<Value *> NonConstants(GatheredScalars);
12662 int EMSz = ExtractMask.size();
12663 int MSz = Mask.size();
12664 // Try to build constant vector and shuffle with it only if currently we
12665 // have a single permutation and more than 1 scalar constants.
12666 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12667 bool IsIdentityShuffle =
12668 ((UseVecBaseAsInput ||
12669 all_of(ExtractShuffles,
12670 [](const std::optional<TTI::ShuffleKind> &SK) {
12671 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12672 TTI::SK_PermuteSingleSrc;
12673 })) &&
12674 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12675 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12676 (!GatherShuffles.empty() &&
12677 all_of(GatherShuffles,
12678 [](const std::optional<TTI::ShuffleKind> &SK) {
12679 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12680 TTI::SK_PermuteSingleSrc;
12681 }) &&
12682 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12683 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12684 bool EnoughConstsForShuffle =
12685 IsSingleShuffle &&
12686 (none_of(GatheredScalars,
12687 [](Value *V) {
12688 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12689 }) ||
12690 any_of(GatheredScalars,
12691 [](Value *V) {
12692 return isa<Constant>(V) && !isa<UndefValue>(V);
12693 })) &&
12694 (!IsIdentityShuffle ||
12695 (GatheredScalars.size() == 2 &&
12696 any_of(GatheredScalars,
12697 [](Value *V) { return !isa<UndefValue>(V); })) ||
12698 count_if(GatheredScalars, [](Value *V) {
12699 return isa<Constant>(V) && !isa<PoisonValue>(V);
12700 }) > 1);
12701 // NonConstants array contains just non-constant values, GatheredScalars
12702 // contains only constant to build final vector and then shuffle.
12703 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12704 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12705 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12706 else
12707 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12708 }
12709 // Generate constants for final shuffle and build a mask for them.
12710 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12711 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12712 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12713 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12714 ShuffleBuilder.add(BV, BVMask);
12715 }
12716 if (all_of(NonConstants, [=](Value *V) {
12717 return isa<PoisonValue>(V) ||
12718 (IsSingleShuffle && ((IsIdentityShuffle &&
12719 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12720 }))
12721 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12722 else
12723 Res = ShuffleBuilder.finalize(
12724 E->ReuseShuffleIndices, E->Scalars.size(),
12725 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12726 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12727 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12728 });
12729 } else if (!allConstant(GatheredScalars)) {
12730 // Gather unique scalars and all constants.
12731 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12732 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12733 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12734 ShuffleBuilder.add(BV, ReuseMask);
12735 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12736 } else {
12737 // Gather all constants.
12738 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12739 for (auto [I, V] : enumerate(E->Scalars)) {
12740 if (!isa<PoisonValue>(V))
12741 Mask[I] = I;
12742 }
12743 Value *BV = ShuffleBuilder.gather(E->Scalars);
12744 ShuffleBuilder.add(BV, Mask);
12745 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12746 }
12747
12748 if (NeedFreeze)
12749 Res = ShuffleBuilder.createFreeze(Res);
12750 return Res;
12751 }
12752
createBuildVector(const TreeEntry * E,Type * ScalarTy)12753 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12754 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12755 Builder, *this);
12756 }
12757
vectorizeTree(TreeEntry * E,bool PostponedPHIs)12758 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12759 IRBuilderBase::InsertPointGuard Guard(Builder);
12760
12761 if (E->VectorizedValue &&
12762 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12763 E->isAltShuffle())) {
12764 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12765 return E->VectorizedValue;
12766 }
12767
12768 Value *V = E->Scalars.front();
12769 Type *ScalarTy = V->getType();
12770 if (auto *Store = dyn_cast<StoreInst>(V))
12771 ScalarTy = Store->getValueOperand()->getType();
12772 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12773 ScalarTy = IE->getOperand(1)->getType();
12774 auto It = MinBWs.find(E);
12775 if (It != MinBWs.end())
12776 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12777 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
12778 if (E->isGather()) {
12779 // Set insert point for non-reduction initial nodes.
12780 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12781 setInsertPointAfterBundle(E);
12782 Value *Vec = createBuildVector(E, ScalarTy);
12783 E->VectorizedValue = Vec;
12784 return Vec;
12785 }
12786
12787 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12788 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12789 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12790 if (E->getOpcode() == Instruction::Store &&
12791 E->State == TreeEntry::Vectorize) {
12792 ArrayRef<int> Mask =
12793 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12794 E->ReorderIndices.size());
12795 ShuffleBuilder.add(V, Mask);
12796 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12797 ShuffleBuilder.addOrdered(V, std::nullopt);
12798 } else {
12799 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12800 }
12801 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12802 };
12803
12804 assert((E->State == TreeEntry::Vectorize ||
12805 E->State == TreeEntry::ScatterVectorize ||
12806 E->State == TreeEntry::StridedVectorize) &&
12807 "Unhandled state");
12808 unsigned ShuffleOrOp =
12809 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12810 Instruction *VL0 = E->getMainOp();
12811 auto GetOperandSignedness = [&](unsigned Idx) {
12812 const TreeEntry *OpE = getOperandEntry(E, Idx);
12813 bool IsSigned = false;
12814 auto It = MinBWs.find(OpE);
12815 if (It != MinBWs.end())
12816 IsSigned = It->second.second;
12817 else
12818 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12819 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12820 });
12821 return IsSigned;
12822 };
12823 switch (ShuffleOrOp) {
12824 case Instruction::PHI: {
12825 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12826 E != VectorizableTree.front().get() ||
12827 !E->UserTreeIndices.empty()) &&
12828 "PHI reordering is free.");
12829 if (PostponedPHIs && E->VectorizedValue)
12830 return E->VectorizedValue;
12831 auto *PH = cast<PHINode>(VL0);
12832 Builder.SetInsertPoint(PH->getParent(),
12833 PH->getParent()->getFirstNonPHIIt());
12834 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12835 if (PostponedPHIs || !E->VectorizedValue) {
12836 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12837 E->PHI = NewPhi;
12838 Value *V = NewPhi;
12839
12840 // Adjust insertion point once all PHI's have been generated.
12841 Builder.SetInsertPoint(PH->getParent(),
12842 PH->getParent()->getFirstInsertionPt());
12843 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12844
12845 V = FinalShuffle(V, E, VecTy);
12846
12847 E->VectorizedValue = V;
12848 if (PostponedPHIs)
12849 return V;
12850 }
12851 PHINode *NewPhi = cast<PHINode>(E->PHI);
12852 // If phi node is fully emitted - exit.
12853 if (NewPhi->getNumIncomingValues() != 0)
12854 return NewPhi;
12855
12856 // PHINodes may have multiple entries from the same block. We want to
12857 // visit every block once.
12858 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12859
12860 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12861 ValueList Operands;
12862 BasicBlock *IBB = PH->getIncomingBlock(I);
12863
12864 // Stop emission if all incoming values are generated.
12865 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12866 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12867 return NewPhi;
12868 }
12869
12870 if (!VisitedBBs.insert(IBB).second) {
12871 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12872 continue;
12873 }
12874
12875 Builder.SetInsertPoint(IBB->getTerminator());
12876 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12877 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12878 if (VecTy != Vec->getType()) {
12879 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
12880 MinBWs.contains(getOperandEntry(E, I))) &&
12881 "Expected item in MinBWs.");
12882 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12883 }
12884 NewPhi->addIncoming(Vec, IBB);
12885 }
12886
12887 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12888 "Invalid number of incoming values");
12889 return NewPhi;
12890 }
12891
12892 case Instruction::ExtractElement: {
12893 Value *V = E->getSingleOperand(0);
12894 if (const TreeEntry *TE = getTreeEntry(V))
12895 V = TE->VectorizedValue;
12896 setInsertPointAfterBundle(E);
12897 V = FinalShuffle(V, E, VecTy);
12898 E->VectorizedValue = V;
12899 return V;
12900 }
12901 case Instruction::ExtractValue: {
12902 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12903 Builder.SetInsertPoint(LI);
12904 Value *Ptr = LI->getPointerOperand();
12905 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12906 Value *NewV = propagateMetadata(V, E->Scalars);
12907 NewV = FinalShuffle(NewV, E, VecTy);
12908 E->VectorizedValue = NewV;
12909 return NewV;
12910 }
12911 case Instruction::InsertElement: {
12912 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12913 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12914 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12915 ArrayRef<Value *> Op = E->getOperand(1);
12916 Type *ScalarTy = Op.front()->getType();
12917 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12918 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12919 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12920 assert(Res.first > 0 && "Expected item in MinBWs.");
12921 V = Builder.CreateIntCast(
12922 V,
12923 getWidenedType(
12924 ScalarTy,
12925 cast<FixedVectorType>(V->getType())->getNumElements()),
12926 Res.second);
12927 }
12928
12929 // Create InsertVector shuffle if necessary
12930 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12931 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12932 }));
12933 const unsigned NumElts =
12934 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12935 const unsigned NumScalars = E->Scalars.size();
12936
12937 unsigned Offset = *getElementIndex(VL0);
12938 assert(Offset < NumElts && "Failed to find vector index offset");
12939
12940 // Create shuffle to resize vector
12941 SmallVector<int> Mask;
12942 if (!E->ReorderIndices.empty()) {
12943 inversePermutation(E->ReorderIndices, Mask);
12944 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12945 } else {
12946 Mask.assign(NumElts, PoisonMaskElem);
12947 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12948 }
12949 // Create InsertVector shuffle if necessary
12950 bool IsIdentity = true;
12951 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12952 Mask.swap(PrevMask);
12953 for (unsigned I = 0; I < NumScalars; ++I) {
12954 Value *Scalar = E->Scalars[PrevMask[I]];
12955 unsigned InsertIdx = *getElementIndex(Scalar);
12956 IsIdentity &= InsertIdx - Offset == I;
12957 Mask[InsertIdx - Offset] = I;
12958 }
12959 if (!IsIdentity || NumElts != NumScalars) {
12960 Value *V2 = nullptr;
12961 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12962 SmallVector<int> InsertMask(Mask);
12963 if (NumElts != NumScalars && Offset == 0) {
12964 // Follow all insert element instructions from the current buildvector
12965 // sequence.
12966 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12967 do {
12968 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
12969 if (!InsertIdx)
12970 break;
12971 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12972 InsertMask[*InsertIdx] = *InsertIdx;
12973 if (!Ins->hasOneUse())
12974 break;
12975 Ins = dyn_cast_or_null<InsertElementInst>(
12976 Ins->getUniqueUndroppableUser());
12977 } while (Ins);
12978 SmallBitVector UseMask =
12979 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12980 SmallBitVector IsFirstPoison =
12981 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12982 SmallBitVector IsFirstUndef =
12983 isUndefVector(FirstInsert->getOperand(0), UseMask);
12984 if (!IsFirstPoison.all()) {
12985 unsigned Idx = 0;
12986 for (unsigned I = 0; I < NumElts; I++) {
12987 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12988 IsFirstUndef.test(I)) {
12989 if (IsVNonPoisonous) {
12990 InsertMask[I] = I < NumScalars ? I : 0;
12991 continue;
12992 }
12993 if (!V2)
12994 V2 = UndefValue::get(V->getType());
12995 if (Idx >= NumScalars)
12996 Idx = NumScalars - 1;
12997 InsertMask[I] = NumScalars + Idx;
12998 ++Idx;
12999 } else if (InsertMask[I] != PoisonMaskElem &&
13000 Mask[I] == PoisonMaskElem) {
13001 InsertMask[I] = PoisonMaskElem;
13002 }
13003 }
13004 } else {
13005 InsertMask = Mask;
13006 }
13007 }
13008 if (!V2)
13009 V2 = PoisonValue::get(V->getType());
13010 V = Builder.CreateShuffleVector(V, V2, InsertMask);
13011 if (auto *I = dyn_cast<Instruction>(V)) {
13012 GatherShuffleExtractSeq.insert(I);
13013 CSEBlocks.insert(I->getParent());
13014 }
13015 }
13016
13017 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13018 for (unsigned I = 0; I < NumElts; I++) {
13019 if (Mask[I] != PoisonMaskElem)
13020 InsertMask[Offset + I] = I;
13021 }
13022 SmallBitVector UseMask =
13023 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
13024 SmallBitVector IsFirstUndef =
13025 isUndefVector(FirstInsert->getOperand(0), UseMask);
13026 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13027 NumElts != NumScalars) {
13028 if (IsFirstUndef.all()) {
13029 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
13030 SmallBitVector IsFirstPoison =
13031 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13032 if (!IsFirstPoison.all()) {
13033 for (unsigned I = 0; I < NumElts; I++) {
13034 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
13035 InsertMask[I] = I + NumElts;
13036 }
13037 }
13038 V = Builder.CreateShuffleVector(
13039 V,
13040 IsFirstPoison.all() ? PoisonValue::get(V->getType())
13041 : FirstInsert->getOperand(0),
13042 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
13043 if (auto *I = dyn_cast<Instruction>(V)) {
13044 GatherShuffleExtractSeq.insert(I);
13045 CSEBlocks.insert(I->getParent());
13046 }
13047 }
13048 } else {
13049 SmallBitVector IsFirstPoison =
13050 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
13051 for (unsigned I = 0; I < NumElts; I++) {
13052 if (InsertMask[I] == PoisonMaskElem)
13053 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
13054 else
13055 InsertMask[I] += NumElts;
13056 }
13057 V = Builder.CreateShuffleVector(
13058 FirstInsert->getOperand(0), V, InsertMask,
13059 cast<Instruction>(E->Scalars.back())->getName());
13060 if (auto *I = dyn_cast<Instruction>(V)) {
13061 GatherShuffleExtractSeq.insert(I);
13062 CSEBlocks.insert(I->getParent());
13063 }
13064 }
13065 }
13066
13067 ++NumVectorInstructions;
13068 E->VectorizedValue = V;
13069 return V;
13070 }
13071 case Instruction::ZExt:
13072 case Instruction::SExt:
13073 case Instruction::FPToUI:
13074 case Instruction::FPToSI:
13075 case Instruction::FPExt:
13076 case Instruction::PtrToInt:
13077 case Instruction::IntToPtr:
13078 case Instruction::SIToFP:
13079 case Instruction::UIToFP:
13080 case Instruction::Trunc:
13081 case Instruction::FPTrunc:
13082 case Instruction::BitCast: {
13083 setInsertPointAfterBundle(E);
13084
13085 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
13086 if (E->VectorizedValue) {
13087 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13088 return E->VectorizedValue;
13089 }
13090
13091 auto *CI = cast<CastInst>(VL0);
13092 Instruction::CastOps VecOpcode = CI->getOpcode();
13093 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
13094 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
13095 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13096 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13097 SrcScalarTy != CI->getOperand(0)->getType())) {
13098 // Check if the values are candidates to demote.
13099 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
13100 if (SrcIt != MinBWs.end())
13101 SrcBWSz = SrcIt->second.first;
13102 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13103 if (BWSz == SrcBWSz) {
13104 VecOpcode = Instruction::BitCast;
13105 } else if (BWSz < SrcBWSz) {
13106 VecOpcode = Instruction::Trunc;
13107 } else if (It != MinBWs.end()) {
13108 assert(BWSz > SrcBWSz && "Invalid cast!");
13109 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13110 } else if (SrcIt != MinBWs.end()) {
13111 assert(BWSz > SrcBWSz && "Invalid cast!");
13112 VecOpcode =
13113 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13114 }
13115 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13116 !SrcIt->second.second) {
13117 VecOpcode = Instruction::UIToFP;
13118 }
13119 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13120 ? InVec
13121 : Builder.CreateCast(VecOpcode, InVec, VecTy);
13122 V = FinalShuffle(V, E, VecTy);
13123
13124 E->VectorizedValue = V;
13125 ++NumVectorInstructions;
13126 return V;
13127 }
13128 case Instruction::FCmp:
13129 case Instruction::ICmp: {
13130 setInsertPointAfterBundle(E);
13131
13132 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
13133 if (E->VectorizedValue) {
13134 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13135 return E->VectorizedValue;
13136 }
13137 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
13138 if (E->VectorizedValue) {
13139 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13140 return E->VectorizedValue;
13141 }
13142 if (L->getType() != R->getType()) {
13143 assert((getOperandEntry(E, 0)->isGather() ||
13144 getOperandEntry(E, 1)->isGather() ||
13145 MinBWs.contains(getOperandEntry(E, 0)) ||
13146 MinBWs.contains(getOperandEntry(E, 1))) &&
13147 "Expected item in MinBWs.");
13148 if (cast<VectorType>(L->getType())
13149 ->getElementType()
13150 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
13151 ->getElementType()
13152 ->getIntegerBitWidth()) {
13153 Type *CastTy = R->getType();
13154 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
13155 } else {
13156 Type *CastTy = L->getType();
13157 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
13158 }
13159 }
13160
13161 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
13162 Value *V = Builder.CreateCmp(P0, L, R);
13163 propagateIRFlags(V, E->Scalars, VL0);
13164 // Do not cast for cmps.
13165 VecTy = cast<FixedVectorType>(V->getType());
13166 V = FinalShuffle(V, E, VecTy);
13167
13168 E->VectorizedValue = V;
13169 ++NumVectorInstructions;
13170 return V;
13171 }
13172 case Instruction::Select: {
13173 setInsertPointAfterBundle(E);
13174
13175 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
13176 if (E->VectorizedValue) {
13177 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13178 return E->VectorizedValue;
13179 }
13180 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13181 if (E->VectorizedValue) {
13182 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13183 return E->VectorizedValue;
13184 }
13185 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13186 if (E->VectorizedValue) {
13187 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13188 return E->VectorizedValue;
13189 }
13190 if (True->getType() != VecTy || False->getType() != VecTy) {
13191 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13192 getOperandEntry(E, 2)->isGather() ||
13193 MinBWs.contains(getOperandEntry(E, 1)) ||
13194 MinBWs.contains(getOperandEntry(E, 2))) &&
13195 "Expected item in MinBWs.");
13196 if (True->getType() != VecTy)
13197 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
13198 if (False->getType() != VecTy)
13199 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
13200 }
13201
13202 Value *V = Builder.CreateSelect(Cond, True, False);
13203 V = FinalShuffle(V, E, VecTy);
13204
13205 E->VectorizedValue = V;
13206 ++NumVectorInstructions;
13207 return V;
13208 }
13209 case Instruction::FNeg: {
13210 setInsertPointAfterBundle(E);
13211
13212 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
13213
13214 if (E->VectorizedValue) {
13215 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13216 return E->VectorizedValue;
13217 }
13218
13219 Value *V = Builder.CreateUnOp(
13220 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13221 propagateIRFlags(V, E->Scalars, VL0);
13222 if (auto *I = dyn_cast<Instruction>(V))
13223 V = propagateMetadata(I, E->Scalars);
13224
13225 V = FinalShuffle(V, E, VecTy);
13226
13227 E->VectorizedValue = V;
13228 ++NumVectorInstructions;
13229
13230 return V;
13231 }
13232 case Instruction::Add:
13233 case Instruction::FAdd:
13234 case Instruction::Sub:
13235 case Instruction::FSub:
13236 case Instruction::Mul:
13237 case Instruction::FMul:
13238 case Instruction::UDiv:
13239 case Instruction::SDiv:
13240 case Instruction::FDiv:
13241 case Instruction::URem:
13242 case Instruction::SRem:
13243 case Instruction::FRem:
13244 case Instruction::Shl:
13245 case Instruction::LShr:
13246 case Instruction::AShr:
13247 case Instruction::And:
13248 case Instruction::Or:
13249 case Instruction::Xor: {
13250 setInsertPointAfterBundle(E);
13251
13252 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13253 if (E->VectorizedValue) {
13254 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13255 return E->VectorizedValue;
13256 }
13257 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13258 if (E->VectorizedValue) {
13259 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13260 return E->VectorizedValue;
13261 }
13262 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13263 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13264 ArrayRef<Value *> Ops = E->getOperand(I);
13265 if (all_of(Ops, [&](Value *Op) {
13266 auto *CI = dyn_cast<ConstantInt>(Op);
13267 return CI && CI->getValue().countr_one() >= It->second.first;
13268 })) {
13269 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13270 E->VectorizedValue = V;
13271 ++NumVectorInstructions;
13272 return V;
13273 }
13274 }
13275 }
13276 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13277 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13278 getOperandEntry(E, 1)->isGather() ||
13279 MinBWs.contains(getOperandEntry(E, 0)) ||
13280 MinBWs.contains(getOperandEntry(E, 1))) &&
13281 "Expected item in MinBWs.");
13282 if (LHS->getType() != VecTy)
13283 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13284 if (RHS->getType() != VecTy)
13285 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13286 }
13287
13288 Value *V = Builder.CreateBinOp(
13289 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13290 RHS);
13291 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13292 if (auto *I = dyn_cast<Instruction>(V)) {
13293 V = propagateMetadata(I, E->Scalars);
13294 // Drop nuw flags for abs(sub(commutative), true).
13295 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13296 any_of(E->Scalars, [](Value *V) {
13297 return isCommutative(cast<Instruction>(V));
13298 }))
13299 I->setHasNoUnsignedWrap(/*b=*/false);
13300 }
13301
13302 V = FinalShuffle(V, E, VecTy);
13303
13304 E->VectorizedValue = V;
13305 ++NumVectorInstructions;
13306
13307 return V;
13308 }
13309 case Instruction::Load: {
13310 // Loads are inserted at the head of the tree because we don't want to
13311 // sink them all the way down past store instructions.
13312 setInsertPointAfterBundle(E);
13313
13314 LoadInst *LI = cast<LoadInst>(VL0);
13315 Instruction *NewLI;
13316 Value *PO = LI->getPointerOperand();
13317 if (E->State == TreeEntry::Vectorize) {
13318 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13319 } else if (E->State == TreeEntry::StridedVectorize) {
13320 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13321 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13322 PO = IsReverseOrder ? PtrN : Ptr0;
13323 std::optional<int> Diff = getPointersDiff(
13324 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13325 Type *StrideTy = DL->getIndexType(PO->getType());
13326 Value *StrideVal;
13327 if (Diff) {
13328 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13329 StrideVal =
13330 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13331 DL->getTypeAllocSize(ScalarTy));
13332 } else {
13333 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13334 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13335 return cast<LoadInst>(V)->getPointerOperand();
13336 });
13337 OrdersType Order;
13338 std::optional<Value *> Stride =
13339 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13340 &*Builder.GetInsertPoint());
13341 Value *NewStride =
13342 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13343 StrideVal = Builder.CreateMul(
13344 NewStride,
13345 ConstantInt::get(
13346 StrideTy,
13347 (IsReverseOrder ? -1 : 1) *
13348 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13349 }
13350 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13351 auto *Inst = Builder.CreateIntrinsic(
13352 Intrinsic::experimental_vp_strided_load,
13353 {VecTy, PO->getType(), StrideTy},
13354 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13355 Builder.getInt32(E->Scalars.size())});
13356 Inst->addParamAttr(
13357 /*ArgNo=*/0,
13358 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13359 NewLI = Inst;
13360 } else {
13361 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13362 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13363 if (E->VectorizedValue) {
13364 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13365 return E->VectorizedValue;
13366 }
13367 // Use the minimum alignment of the gathered loads.
13368 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13369 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13370 }
13371 Value *V = propagateMetadata(NewLI, E->Scalars);
13372
13373 V = FinalShuffle(V, E, VecTy);
13374 E->VectorizedValue = V;
13375 ++NumVectorInstructions;
13376 return V;
13377 }
13378 case Instruction::Store: {
13379 auto *SI = cast<StoreInst>(VL0);
13380
13381 setInsertPointAfterBundle(E);
13382
13383 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13384 if (VecValue->getType() != VecTy)
13385 VecValue =
13386 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13387 VecValue = FinalShuffle(VecValue, E, VecTy);
13388
13389 Value *Ptr = SI->getPointerOperand();
13390 Instruction *ST;
13391 if (E->State == TreeEntry::Vectorize) {
13392 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13393 } else {
13394 assert(E->State == TreeEntry::StridedVectorize &&
13395 "Expected either strided or conseutive stores.");
13396 if (!E->ReorderIndices.empty()) {
13397 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13398 Ptr = SI->getPointerOperand();
13399 }
13400 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13401 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13402 auto *Inst = Builder.CreateIntrinsic(
13403 Intrinsic::experimental_vp_strided_store,
13404 {VecTy, Ptr->getType(), StrideTy},
13405 {VecValue, Ptr,
13406 ConstantInt::get(
13407 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13408 Builder.getAllOnesMask(VecTy->getElementCount()),
13409 Builder.getInt32(E->Scalars.size())});
13410 Inst->addParamAttr(
13411 /*ArgNo=*/1,
13412 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13413 ST = Inst;
13414 }
13415
13416 Value *V = propagateMetadata(ST, E->Scalars);
13417
13418 E->VectorizedValue = V;
13419 ++NumVectorInstructions;
13420 return V;
13421 }
13422 case Instruction::GetElementPtr: {
13423 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13424 setInsertPointAfterBundle(E);
13425
13426 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13427 if (E->VectorizedValue) {
13428 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13429 return E->VectorizedValue;
13430 }
13431
13432 SmallVector<Value *> OpVecs;
13433 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13434 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13435 if (E->VectorizedValue) {
13436 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13437 return E->VectorizedValue;
13438 }
13439 OpVecs.push_back(OpVec);
13440 }
13441
13442 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13443 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13444 SmallVector<Value *> GEPs;
13445 for (Value *V : E->Scalars) {
13446 if (isa<GetElementPtrInst>(V))
13447 GEPs.push_back(V);
13448 }
13449 V = propagateMetadata(I, GEPs);
13450 }
13451
13452 V = FinalShuffle(V, E, VecTy);
13453
13454 E->VectorizedValue = V;
13455 ++NumVectorInstructions;
13456
13457 return V;
13458 }
13459 case Instruction::Call: {
13460 CallInst *CI = cast<CallInst>(VL0);
13461 setInsertPointAfterBundle(E);
13462
13463 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13464
13465 SmallVector<Type *> ArgTys =
13466 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13467 It != MinBWs.end() ? It->second.first : 0);
13468 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13469 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13470 VecCallCosts.first <= VecCallCosts.second;
13471
13472 Value *ScalarArg = nullptr;
13473 SmallVector<Value *> OpVecs;
13474 SmallVector<Type *, 2> TysForDecl;
13475 // Add return type if intrinsic is overloaded on it.
13476 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13477 TysForDecl.push_back(VecTy);
13478 auto *CEI = cast<CallInst>(VL0);
13479 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13480 ValueList OpVL;
13481 // Some intrinsics have scalar arguments. This argument should not be
13482 // vectorized.
13483 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13484 ScalarArg = CEI->getArgOperand(I);
13485 // if decided to reduce bitwidth of abs intrinsic, it second argument
13486 // must be set false (do not return poison, if value issigned min).
13487 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13488 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13489 ScalarArg = Builder.getFalse();
13490 OpVecs.push_back(ScalarArg);
13491 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13492 TysForDecl.push_back(ScalarArg->getType());
13493 continue;
13494 }
13495
13496 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13497 if (E->VectorizedValue) {
13498 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13499 return E->VectorizedValue;
13500 }
13501 ScalarArg = CEI->getArgOperand(I);
13502 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13503 ScalarArg->getType()->getScalarType() &&
13504 It == MinBWs.end()) {
13505 auto *CastTy =
13506 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
13507 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13508 } else if (It != MinBWs.end()) {
13509 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13510 }
13511 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13512 OpVecs.push_back(OpVec);
13513 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13514 TysForDecl.push_back(OpVec->getType());
13515 }
13516
13517 Function *CF;
13518 if (!UseIntrinsic) {
13519 VFShape Shape =
13520 VFShape::get(CI->getFunctionType(),
13521 ElementCount::getFixed(
13522 static_cast<unsigned>(VecTy->getNumElements())),
13523 false /*HasGlobalPred*/);
13524 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13525 } else {
13526 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13527 }
13528
13529 SmallVector<OperandBundleDef, 1> OpBundles;
13530 CI->getOperandBundlesAsDefs(OpBundles);
13531 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13532
13533 propagateIRFlags(V, E->Scalars, VL0);
13534 V = FinalShuffle(V, E, VecTy);
13535
13536 E->VectorizedValue = V;
13537 ++NumVectorInstructions;
13538 return V;
13539 }
13540 case Instruction::ShuffleVector: {
13541 assert(E->isAltShuffle() &&
13542 ((Instruction::isBinaryOp(E->getOpcode()) &&
13543 Instruction::isBinaryOp(E->getAltOpcode())) ||
13544 (Instruction::isCast(E->getOpcode()) &&
13545 Instruction::isCast(E->getAltOpcode())) ||
13546 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13547 "Invalid Shuffle Vector Operand");
13548
13549 Value *LHS = nullptr, *RHS = nullptr;
13550 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13551 setInsertPointAfterBundle(E);
13552 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13553 if (E->VectorizedValue) {
13554 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13555 return E->VectorizedValue;
13556 }
13557 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13558 } else {
13559 setInsertPointAfterBundle(E);
13560 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13561 }
13562 if (E->VectorizedValue) {
13563 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13564 return E->VectorizedValue;
13565 }
13566 if (LHS && RHS &&
13567 ((Instruction::isBinaryOp(E->getOpcode()) &&
13568 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13569 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13570 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13571 getOperandEntry(E, 1)->isGather() ||
13572 MinBWs.contains(getOperandEntry(E, 0)) ||
13573 MinBWs.contains(getOperandEntry(E, 1))) &&
13574 "Expected item in MinBWs.");
13575 Type *CastTy = VecTy;
13576 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13577 if (cast<VectorType>(LHS->getType())
13578 ->getElementType()
13579 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13580 ->getElementType()
13581 ->getIntegerBitWidth())
13582 CastTy = RHS->getType();
13583 else
13584 CastTy = LHS->getType();
13585 }
13586 if (LHS->getType() != CastTy)
13587 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13588 if (RHS->getType() != CastTy)
13589 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13590 }
13591
13592 Value *V0, *V1;
13593 if (Instruction::isBinaryOp(E->getOpcode())) {
13594 V0 = Builder.CreateBinOp(
13595 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13596 V1 = Builder.CreateBinOp(
13597 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13598 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13599 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13600 auto *AltCI = cast<CmpInst>(E->getAltOp());
13601 CmpInst::Predicate AltPred = AltCI->getPredicate();
13602 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13603 } else {
13604 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13605 unsigned SrcBWSz = DL->getTypeSizeInBits(
13606 cast<VectorType>(LHS->getType())->getElementType());
13607 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13608 if (BWSz <= SrcBWSz) {
13609 if (BWSz < SrcBWSz)
13610 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13611 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13612 if (auto *I = dyn_cast<Instruction>(LHS))
13613 LHS = propagateMetadata(I, E->Scalars);
13614 E->VectorizedValue = LHS;
13615 ++NumVectorInstructions;
13616 return LHS;
13617 }
13618 }
13619 V0 = Builder.CreateCast(
13620 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13621 V1 = Builder.CreateCast(
13622 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13623 }
13624 // Add V0 and V1 to later analysis to try to find and remove matching
13625 // instruction, if any.
13626 for (Value *V : {V0, V1}) {
13627 if (auto *I = dyn_cast<Instruction>(V)) {
13628 GatherShuffleExtractSeq.insert(I);
13629 CSEBlocks.insert(I->getParent());
13630 }
13631 }
13632
13633 // Create shuffle to take alternate operations from the vector.
13634 // Also, gather up main and alt scalar ops to propagate IR flags to
13635 // each vector operation.
13636 ValueList OpScalars, AltScalars;
13637 SmallVector<int> Mask;
13638 E->buildAltOpShuffleMask(
13639 [E, this](Instruction *I) {
13640 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13641 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13642 *TLI);
13643 },
13644 Mask, &OpScalars, &AltScalars);
13645
13646 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13647 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13648 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13649 // Drop nuw flags for abs(sub(commutative), true).
13650 if (auto *I = dyn_cast<Instruction>(Vec);
13651 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13652 any_of(E->Scalars, [](Value *V) {
13653 auto *IV = cast<Instruction>(V);
13654 return IV->getOpcode() == Instruction::Sub &&
13655 isCommutative(cast<Instruction>(IV));
13656 }))
13657 I->setHasNoUnsignedWrap(/*b=*/false);
13658 };
13659 DropNuwFlag(V0, E->getOpcode());
13660 DropNuwFlag(V1, E->getAltOpcode());
13661
13662 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13663 if (auto *I = dyn_cast<Instruction>(V)) {
13664 V = propagateMetadata(I, E->Scalars);
13665 GatherShuffleExtractSeq.insert(I);
13666 CSEBlocks.insert(I->getParent());
13667 }
13668
13669 E->VectorizedValue = V;
13670 ++NumVectorInstructions;
13671
13672 return V;
13673 }
13674 default:
13675 llvm_unreachable("unknown inst");
13676 }
13677 return nullptr;
13678 }
13679
vectorizeTree()13680 Value *BoUpSLP::vectorizeTree() {
13681 ExtraValueToDebugLocsMap ExternallyUsedValues;
13682 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13683 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13684 }
13685
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries. Groups a chain of insertelement instructions whose
/// inserted scalars are already produced by vectorized tree entries, so the
/// whole chain can later be replaced by shuffles of those parent vectors.
struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  /// Maps each source vector to the shuffle mask that selects the lanes taken
  /// from it.
  MapVector<Value *, SmallVector<int>> ValueMasks;
};
} // namespace
13696
vectorizeTree(const ExtraValueToDebugLocsMap & ExternallyUsedValues,SmallVectorImpl<std::pair<Value *,Value * >> & ReplacedExternals,Instruction * ReductionRoot)13697 Value *BoUpSLP::vectorizeTree(
13698 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13699 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13700 Instruction *ReductionRoot) {
13701 // All blocks must be scheduled before any instructions are inserted.
13702 for (auto &BSIter : BlocksSchedules) {
13703 scheduleBlock(BSIter.second.get());
13704 }
13705 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
13706 // need to rebuild it.
13707 EntryToLastInstruction.clear();
13708
13709 if (ReductionRoot)
13710 Builder.SetInsertPoint(ReductionRoot->getParent(),
13711 ReductionRoot->getIterator());
13712 else
13713 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13714
13715 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13716 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13717 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13718 if (TE->State == TreeEntry::Vectorize &&
13719 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13720 TE->VectorizedValue)
13721 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13722 // Run through the list of postponed gathers and emit them, replacing the temp
13723 // emitted allocas with actual vector instructions.
13724 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13725 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13726 for (const TreeEntry *E : PostponedNodes) {
13727 auto *TE = const_cast<TreeEntry *>(E);
13728 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13729 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13730 TE->UserTreeIndices.front().EdgeIdx)) &&
13731 VecTE->isSame(TE->Scalars))
13732 // Found gather node which is absolutely the same as one of the
13733 // vectorized nodes. It may happen after reordering.
13734 continue;
13735 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13736 TE->VectorizedValue = nullptr;
13737 auto *UserI =
13738 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13739 // If user is a PHI node, its vector code have to be inserted right before
13740 // block terminator. Since the node was delayed, there were some unresolved
13741 // dependencies at the moment when stab instruction was emitted. In a case
13742 // when any of these dependencies turn out an operand of another PHI, coming
13743 // from this same block, position of a stab instruction will become invalid.
13744 // The is because source vector that supposed to feed this gather node was
13745 // inserted at the end of the block [after stab instruction]. So we need
13746 // to adjust insertion point again to the end of block.
13747 if (isa<PHINode>(UserI)) {
13748 // Insert before all users.
13749 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13750 for (User *U : PrevVec->users()) {
13751 if (U == UserI)
13752 continue;
13753 auto *UI = dyn_cast<Instruction>(U);
13754 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13755 continue;
13756 if (UI->comesBefore(InsertPt))
13757 InsertPt = UI;
13758 }
13759 Builder.SetInsertPoint(InsertPt);
13760 } else {
13761 Builder.SetInsertPoint(PrevVec);
13762 }
13763 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13764 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13765 if (Vec->getType() != PrevVec->getType()) {
13766 assert(Vec->getType()->isIntOrIntVectorTy() &&
13767 PrevVec->getType()->isIntOrIntVectorTy() &&
13768 "Expected integer vector types only.");
13769 std::optional<bool> IsSigned;
13770 for (Value *V : TE->Scalars) {
13771 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13772 auto It = MinBWs.find(BaseTE);
13773 if (It != MinBWs.end()) {
13774 IsSigned = IsSigned.value_or(false) || It->second.second;
13775 if (*IsSigned)
13776 break;
13777 }
13778 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13779 auto It = MinBWs.find(MNTE);
13780 if (It != MinBWs.end()) {
13781 IsSigned = IsSigned.value_or(false) || It->second.second;
13782 if (*IsSigned)
13783 break;
13784 }
13785 }
13786 if (IsSigned.value_or(false))
13787 break;
13788 // Scan through gather nodes.
13789 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13790 auto It = MinBWs.find(BVE);
13791 if (It != MinBWs.end()) {
13792 IsSigned = IsSigned.value_or(false) || It->second.second;
13793 if (*IsSigned)
13794 break;
13795 }
13796 }
13797 if (IsSigned.value_or(false))
13798 break;
13799 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13800 IsSigned =
13801 IsSigned.value_or(false) ||
13802 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13803 continue;
13804 }
13805 if (IsSigned.value_or(false))
13806 break;
13807 }
13808 }
13809 if (IsSigned.value_or(false)) {
13810 // Final attempt - check user node.
13811 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13812 if (It != MinBWs.end())
13813 IsSigned = It->second.second;
13814 }
13815 assert(IsSigned &&
13816 "Expected user node or perfect diamond match in MinBWs.");
13817 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13818 }
13819 PrevVec->replaceAllUsesWith(Vec);
13820 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13821 // Replace the stub vector node, if it was used before for one of the
13822 // buildvector nodes already.
13823 auto It = PostponedValues.find(PrevVec);
13824 if (It != PostponedValues.end()) {
13825 for (TreeEntry *VTE : It->getSecond())
13826 VTE->VectorizedValue = Vec;
13827 }
13828 eraseInstruction(PrevVec);
13829 }
13830
13831 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13832 << " values .\n");
13833
13834 SmallVector<ShuffledInsertData> ShuffledInserts;
13835 // Maps vector instruction to original insertelement instruction
13836 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13837 // Maps extract Scalar to the corresponding extractelement instruction in the
13838 // basic block. Only one extractelement per block should be emitted.
13839 DenseMap<Value *,
13840 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13841 ScalarToEEs;
13842 SmallDenseSet<Value *, 4> UsedInserts;
13843 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13844 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13845 // Extract all of the elements with the external uses.
13846 for (const auto &ExternalUse : ExternalUses) {
13847 Value *Scalar = ExternalUse.Scalar;
13848 llvm::User *User = ExternalUse.User;
13849
13850 // Skip users that we already RAUW. This happens when one instruction
13851 // has multiple uses of the same value.
13852 if (User && !is_contained(Scalar->users(), User))
13853 continue;
13854 TreeEntry *E = getTreeEntry(Scalar);
13855 assert(E && "Invalid scalar");
13856 assert(!E->isGather() && "Extracting from a gather list");
13857 // Non-instruction pointers are not deleted, just skip them.
13858 if (E->getOpcode() == Instruction::GetElementPtr &&
13859 !isa<GetElementPtrInst>(Scalar))
13860 continue;
13861
13862 Value *Vec = E->VectorizedValue;
13863 assert(Vec && "Can't find vectorizable value");
13864
13865 Value *Lane = Builder.getInt32(ExternalUse.Lane);
13866 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13867 if (Scalar->getType() != Vec->getType()) {
13868 Value *Ex = nullptr;
13869 Value *ExV = nullptr;
13870 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13871 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13872 auto It = ScalarToEEs.find(Scalar);
13873 if (It != ScalarToEEs.end()) {
13874 // No need to emit many extracts, just move the only one in the
13875 // current block.
13876 auto EEIt = It->second.find(Builder.GetInsertBlock());
13877 if (EEIt != It->second.end()) {
13878 Instruction *I = EEIt->second.first;
13879 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13880 Builder.GetInsertPoint()->comesBefore(I)) {
13881 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13882 Builder.GetInsertPoint());
13883 if (auto *CI = EEIt->second.second)
13884 CI->moveAfter(I);
13885 }
13886 Ex = I;
13887 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13888 }
13889 }
13890 if (!Ex) {
13891 // "Reuse" the existing extract to improve final codegen.
13892 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13893 Value *V = ES->getVectorOperand();
13894 if (const TreeEntry *ETE = getTreeEntry(V))
13895 V = ETE->VectorizedValue;
13896 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13897 } else if (ReplaceGEP) {
13898 // Leave the GEPs as is, they are free in most cases and better to
13899 // keep them as GEPs.
13900 auto *CloneGEP = GEP->clone();
13901 if (isa<Instruction>(Vec))
13902 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13903 Builder.GetInsertPoint());
13904 else
13905 CloneGEP->insertBefore(GEP);
13906 if (GEP->hasName())
13907 CloneGEP->takeName(GEP);
13908 Ex = CloneGEP;
13909 } else {
13910 Ex = Builder.CreateExtractElement(Vec, Lane);
13911 }
13912 // If necessary, sign-extend or zero-extend ScalarRoot
13913 // to the larger type.
13914 ExV = Ex;
13915 if (Scalar->getType() != Ex->getType())
13916 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13917 MinBWs.find(E)->second.second);
13918 if (auto *I = dyn_cast<Instruction>(Ex))
13919 ScalarToEEs[Scalar].try_emplace(
13920 Builder.GetInsertBlock(),
13921 std::make_pair(I, cast<Instruction>(ExV)));
13922 }
13923 // The then branch of the previous if may produce constants, since 0
13924 // operand might be a constant.
13925 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13926 GatherShuffleExtractSeq.insert(ExI);
13927 CSEBlocks.insert(ExI->getParent());
13928 }
13929 return ExV;
13930 }
13931 assert(isa<FixedVectorType>(Scalar->getType()) &&
13932 isa<InsertElementInst>(Scalar) &&
13933 "In-tree scalar of vector type is not insertelement?");
13934 auto *IE = cast<InsertElementInst>(Scalar);
13935 VectorToInsertElement.try_emplace(Vec, IE);
13936 return Vec;
13937 };
13938 // If User == nullptr, the Scalar remains as scalar in vectorized
13939 // instructions or is used as extra arg. Generate ExtractElement instruction
13940 // and update the record for this scalar in ExternallyUsedValues.
13941 if (!User) {
13942 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13943 continue;
13944 assert((ExternallyUsedValues.count(Scalar) ||
13945 Scalar->hasNUsesOrMore(UsesLimit) ||
13946 any_of(Scalar->users(),
13947 [&](llvm::User *U) {
13948 if (ExternalUsesAsGEPs.contains(U))
13949 return true;
13950 TreeEntry *UseEntry = getTreeEntry(U);
13951 return UseEntry &&
13952 (UseEntry->State == TreeEntry::Vectorize ||
13953 UseEntry->State ==
13954 TreeEntry::StridedVectorize) &&
13955 (E->State == TreeEntry::Vectorize ||
13956 E->State == TreeEntry::StridedVectorize) &&
13957 doesInTreeUserNeedToExtract(
13958 Scalar,
13959 cast<Instruction>(UseEntry->Scalars.front()),
13960 TLI);
13961 })) &&
13962 "Scalar with nullptr User must be registered in "
13963 "ExternallyUsedValues map or remain as scalar in vectorized "
13964 "instructions");
13965 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13966 if (auto *PHI = dyn_cast<PHINode>(VecI))
13967 Builder.SetInsertPoint(PHI->getParent(),
13968 PHI->getParent()->getFirstNonPHIIt());
13969 else
13970 Builder.SetInsertPoint(VecI->getParent(),
13971 std::next(VecI->getIterator()));
13972 } else {
13973 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13974 }
13975 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13976 // Required to update internally referenced instructions.
13977 Scalar->replaceAllUsesWith(NewInst);
13978 ReplacedExternals.emplace_back(Scalar, NewInst);
13979 continue;
13980 }
13981
13982 if (auto *VU = dyn_cast<InsertElementInst>(User);
13983 VU && VU->getOperand(1) == Scalar) {
13984 // Skip if the scalar is another vector op or Vec is not an instruction.
13985 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13986 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13987 if (!UsedInserts.insert(VU).second)
13988 continue;
13989 // Need to use original vector, if the root is truncated.
13990 auto BWIt = MinBWs.find(E);
13991 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13992 auto *ScalarTy = FTy->getElementType();
13993 auto Key = std::make_pair(Vec, ScalarTy);
13994 auto VecIt = VectorCasts.find(Key);
13995 if (VecIt == VectorCasts.end()) {
13996 IRBuilderBase::InsertPointGuard Guard(Builder);
13997 if (auto *IVec = dyn_cast<PHINode>(Vec))
13998 Builder.SetInsertPoint(
13999 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14000 else if (auto *IVec = dyn_cast<Instruction>(Vec))
14001 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14002 Vec = Builder.CreateIntCast(
14003 Vec,
14004 getWidenedType(
14005 ScalarTy,
14006 cast<FixedVectorType>(Vec->getType())->getNumElements()),
14007 BWIt->second.second);
14008 VectorCasts.try_emplace(Key, Vec);
14009 } else {
14010 Vec = VecIt->second;
14011 }
14012 }
14013
14014 std::optional<unsigned> InsertIdx = getElementIndex(VU);
14015 if (InsertIdx) {
14016 auto *It =
14017 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
14018 // Checks if 2 insertelements are from the same buildvector.
14019 InsertElementInst *VecInsert = Data.InsertElements.front();
14020 return areTwoInsertFromSameBuildVector(
14021 VU, VecInsert,
14022 [](InsertElementInst *II) { return II->getOperand(0); });
14023 });
14024 unsigned Idx = *InsertIdx;
14025 if (It == ShuffledInserts.end()) {
14026 (void)ShuffledInserts.emplace_back();
14027 It = std::next(ShuffledInserts.begin(),
14028 ShuffledInserts.size() - 1);
14029 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14030 if (Mask.empty())
14031 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14032 // Find the insertvector, vectorized in tree, if any.
14033 Value *Base = VU;
14034 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
14035 if (IEBase != User &&
14036 (!IEBase->hasOneUse() ||
14037 getElementIndex(IEBase).value_or(Idx) == Idx))
14038 break;
14039 // Build the mask for the vectorized insertelement instructions.
14040 if (const TreeEntry *E = getTreeEntry(IEBase)) {
14041 do {
14042 IEBase = cast<InsertElementInst>(Base);
14043 int IEIdx = *getElementIndex(IEBase);
14044 assert(Mask[IEIdx] == PoisonMaskElem &&
14045 "InsertElementInstruction used already.");
14046 Mask[IEIdx] = IEIdx;
14047 Base = IEBase->getOperand(0);
14048 } while (E == getTreeEntry(Base));
14049 break;
14050 }
14051 Base = cast<InsertElementInst>(Base)->getOperand(0);
14052 // After the vectorization the def-use chain has changed, need
14053 // to look through original insertelement instructions, if they
14054 // get replaced by vector instructions.
14055 auto It = VectorToInsertElement.find(Base);
14056 if (It != VectorToInsertElement.end())
14057 Base = It->second;
14058 }
14059 }
14060 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14061 if (Mask.empty())
14062 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
14063 Mask[Idx] = ExternalUse.Lane;
14064 It->InsertElements.push_back(cast<InsertElementInst>(User));
14065 continue;
14066 }
14067 }
14068 }
14069 }
14070
14071 // Generate extracts for out-of-tree users.
14072 // Find the insertion point for the extractelement lane.
14073 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
14074 if (PHINode *PH = dyn_cast<PHINode>(User)) {
14075 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14076 if (PH->getIncomingValue(I) == Scalar) {
14077 Instruction *IncomingTerminator =
14078 PH->getIncomingBlock(I)->getTerminator();
14079 if (isa<CatchSwitchInst>(IncomingTerminator)) {
14080 Builder.SetInsertPoint(VecI->getParent(),
14081 std::next(VecI->getIterator()));
14082 } else {
14083 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
14084 }
14085 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14086 PH->setOperand(I, NewInst);
14087 }
14088 }
14089 } else {
14090 Builder.SetInsertPoint(cast<Instruction>(User));
14091 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14092 User->replaceUsesOfWith(Scalar, NewInst);
14093 }
14094 } else {
14095 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
14096 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14097 User->replaceUsesOfWith(Scalar, NewInst);
14098 }
14099
14100 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14101 }
14102
14103 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14104 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14105 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14106 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14107 for (int I = 0, E = Mask.size(); I < E; ++I) {
14108 if (Mask[I] < VF)
14109 CombinedMask1[I] = Mask[I];
14110 else
14111 CombinedMask2[I] = Mask[I] - VF;
14112 }
14113 ShuffleInstructionBuilder ShuffleBuilder(
14114 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
14115 ShuffleBuilder.add(V1, CombinedMask1);
14116 if (V2)
14117 ShuffleBuilder.add(V2, CombinedMask2);
14118 return ShuffleBuilder.finalize(std::nullopt);
14119 };
14120
14121 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14122 bool ForSingleMask) {
14123 unsigned VF = Mask.size();
14124 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14125 if (VF != VecVF) {
14126 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14127 Vec = CreateShuffle(Vec, nullptr, Mask);
14128 return std::make_pair(Vec, true);
14129 }
14130 if (!ForSingleMask) {
14131 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14132 for (unsigned I = 0; I < VF; ++I) {
14133 if (Mask[I] != PoisonMaskElem)
14134 ResizeMask[Mask[I]] = Mask[I];
14135 }
14136 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14137 }
14138 }
14139
14140 return std::make_pair(Vec, false);
14141 };
14142 // Perform shuffling of the vectorize tree entries for better handling of
14143 // external extracts.
14144 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14145 // Find the first and the last instruction in the list of insertelements.
14146 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
14147 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14148 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14149 Builder.SetInsertPoint(LastInsert);
14150 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14151 Value *NewInst = performExtractsShuffleAction<Value>(
14152 MutableArrayRef(Vector.data(), Vector.size()),
14153 FirstInsert->getOperand(0),
14154 [](Value *Vec) {
14155 return cast<VectorType>(Vec->getType())
14156 ->getElementCount()
14157 .getKnownMinValue();
14158 },
14159 ResizeToVF,
14160 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14161 ArrayRef<Value *> Vals) {
14162 assert((Vals.size() == 1 || Vals.size() == 2) &&
14163 "Expected exactly 1 or 2 input values.");
14164 if (Vals.size() == 1) {
14165 // Do not create shuffle if the mask is a simple identity
14166 // non-resizing mask.
14167 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
14168 ->getNumElements() ||
14169 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
14170 return CreateShuffle(Vals.front(), nullptr, Mask);
14171 return Vals.front();
14172 }
14173 return CreateShuffle(Vals.front() ? Vals.front()
14174 : FirstInsert->getOperand(0),
14175 Vals.back(), Mask);
14176 });
14177 auto It = ShuffledInserts[I].InsertElements.rbegin();
14178 // Rebuild buildvector chain.
14179 InsertElementInst *II = nullptr;
14180 if (It != ShuffledInserts[I].InsertElements.rend())
14181 II = *It;
14182 SmallVector<Instruction *> Inserts;
14183 while (It != ShuffledInserts[I].InsertElements.rend()) {
14184 assert(II && "Must be an insertelement instruction.");
14185 if (*It == II)
14186 ++It;
14187 else
14188 Inserts.push_back(cast<Instruction>(II));
14189 II = dyn_cast<InsertElementInst>(II->getOperand(0));
14190 }
14191 for (Instruction *II : reverse(Inserts)) {
14192 II->replaceUsesOfWith(II->getOperand(0), NewInst);
14193 if (auto *NewI = dyn_cast<Instruction>(NewInst))
14194 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
14195 II->moveAfter(NewI);
14196 NewInst = II;
14197 }
14198 LastInsert->replaceAllUsesWith(NewInst);
14199 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
14200 IE->replaceUsesOfWith(IE->getOperand(0),
14201 PoisonValue::get(IE->getOperand(0)->getType()));
14202 IE->replaceUsesOfWith(IE->getOperand(1),
14203 PoisonValue::get(IE->getOperand(1)->getType()));
14204 eraseInstruction(IE);
14205 }
14206 CSEBlocks.insert(LastInsert->getParent());
14207 }
14208
14209 SmallVector<Instruction *> RemovedInsts;
14210 // For each vectorized value:
14211 for (auto &TEPtr : VectorizableTree) {
14212 TreeEntry *Entry = TEPtr.get();
14213
14214 // No need to handle users of gathered values.
14215 if (Entry->isGather())
14216 continue;
14217
14218 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14219
14220 // For each lane:
14221 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14222 Value *Scalar = Entry->Scalars[Lane];
14223
14224 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14225 !isa<GetElementPtrInst>(Scalar))
14226 continue;
14227 #ifndef NDEBUG
14228 Type *Ty = Scalar->getType();
14229 if (!Ty->isVoidTy()) {
14230 for (User *U : Scalar->users()) {
14231 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14232
14233 // It is legal to delete users in the ignorelist.
14234 assert((getTreeEntry(U) ||
14235 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14236 (isa_and_nonnull<Instruction>(U) &&
14237 isDeleted(cast<Instruction>(U)))) &&
14238 "Deleting out-of-tree value");
14239 }
14240 }
14241 #endif
14242 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14243 auto *I = cast<Instruction>(Scalar);
14244 RemovedInsts.push_back(I);
14245 }
14246 }
14247
14248 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14249 // new vector instruction.
14250 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14251 V->mergeDIAssignID(RemovedInsts);
14252
14253 // Clear up reduction references, if any.
14254 if (UserIgnoreList) {
14255 for (Instruction *I : RemovedInsts) {
14256 if (getTreeEntry(I)->Idx != 0)
14257 continue;
14258 SmallVector<SelectInst *> LogicalOpSelects;
14259 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14260 // Do not replace condition of the logical op in form select <cond>.
14261 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
14262 (match(U.getUser(), m_LogicalAnd()) ||
14263 match(U.getUser(), m_LogicalOr())) &&
14264 U.getOperandNo() == 0;
14265 if (IsPoisoningLogicalOp) {
14266 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
14267 return false;
14268 }
14269 return UserIgnoreList->contains(U.getUser());
14270 });
14271 // Replace conditions of the poisoning logical ops with the non-poison
14272 // constant value.
14273 for (SelectInst *SI : LogicalOpSelects)
14274 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
14275 }
14276 }
14277 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14278 // cache correctness.
14279 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
14280 // - instructions are not deleted until later.
14281 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14282
14283 Builder.ClearInsertionPoint();
14284 InstrElementSize.clear();
14285
14286 const TreeEntry &RootTE = *VectorizableTree.front();
14287 Value *Vec = RootTE.VectorizedValue;
14288 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14289 It != MinBWs.end() &&
14290 ReductionBitWidth != It->second.first) {
14291 IRBuilder<>::InsertPointGuard Guard(Builder);
14292 Builder.SetInsertPoint(ReductionRoot->getParent(),
14293 ReductionRoot->getIterator());
14294 Vec = Builder.CreateIntCast(
14295 Vec,
14296 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14297 cast<VectorType>(Vec->getType())->getElementCount()),
14298 It->second.second);
14299 }
14300 return Vec;
14301 }
14302
// Hoists loop-invariant gather/shuffle/extract sequences out of loops, then
// CSEs identical (or "less defined") sequences across the recorded blocks.
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector operands
  // and its mask indices are the same as in the first one or undefs. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
  // On success, NewMask is filled with the merged (more defined) mask.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      // The masks conflict if both define the same lane with different
      // indices.
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        // Case 1: In is identical to (or less defined than) an already
        // visited instruction V whose block dominates In's - drop In.
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        // Case 2: the visited shuffle V is less defined than In and In's
        // block dominates V's - keep In (moved right after V) and drop V.
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
14451
14452 BoUpSLP::ScheduleData *
buildBundle(ArrayRef<Value * > VL)14453 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14454 ScheduleData *Bundle = nullptr;
14455 ScheduleData *PrevInBundle = nullptr;
14456 for (Value *V : VL) {
14457 if (doesNotNeedToBeScheduled(V))
14458 continue;
14459 ScheduleData *BundleMember = getScheduleData(V);
14460 assert(BundleMember &&
14461 "no ScheduleData for bundle member "
14462 "(maybe not in same basic block)");
14463 assert(BundleMember->isSchedulingEntity() &&
14464 "bundle member already part of other bundle");
14465 if (PrevInBundle) {
14466 PrevInBundle->NextInBundle = BundleMember;
14467 } else {
14468 Bundle = BundleMember;
14469 }
14470
14471 // Group the instructions to a bundle.
14472 BundleMember->FirstInBundle = Bundle;
14473 PrevInBundle = BundleMember;
14474 }
14475 assert(Bundle && "Failed to find schedule bundle");
14476 return Bundle;
14477 }
14478
// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
// Returns the bundle on success, std::nullopt if the bundle cannot be
// scheduled (region-size limit hit or cyclic dependencies), and nullptr for
// values that do not need to be scheduled at all.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
      doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");

  // Calculates dependencies for Bundle (if non-null) and schedules ready
  // instructions until the bundle becomes ready or the ready list drains.
  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are no
    // cyclic dependencies and we can schedule it. Note that's important that we
    // don't "schedule" the bundle yet (see cancelScheduling).
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle). This makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instruction in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    // Bundle never became ready: undo the bundling and report failure.
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  return Bundle;
}
14577
cancelScheduling(ArrayRef<Value * > VL,Value * OpValue)14578 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14579 Value *OpValue) {
14580 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14581 doesNotNeedToSchedule(VL))
14582 return;
14583
14584 if (doesNotNeedToBeScheduled(OpValue))
14585 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14586 ScheduleData *Bundle = getScheduleData(OpValue);
14587 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14588 assert(!Bundle->IsScheduled &&
14589 "Can't cancel bundle which is already scheduled");
14590 assert(Bundle->isSchedulingEntity() &&
14591 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14592 "tried to unbundle something which is not a bundle");
14593
14594 // Remove the bundle from the ready list.
14595 if (Bundle->isReady())
14596 ReadyInsts.remove(Bundle);
14597
14598 // Un-bundle: make single instructions out of the bundle.
14599 ScheduleData *BundleMember = Bundle;
14600 while (BundleMember) {
14601 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14602 BundleMember->FirstInBundle = BundleMember;
14603 ScheduleData *Next = BundleMember->NextInBundle;
14604 BundleMember->NextInBundle = nullptr;
14605 BundleMember->TE = nullptr;
14606 if (BundleMember->unscheduledDepsInBundle() == 0) {
14607 ReadyInsts.insert(BundleMember);
14608 }
14609 BundleMember = Next;
14610 }
14611 }
14612
allocateScheduleDataChunks()14613 BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14614 // Allocate a new ScheduleData for the instruction.
14615 if (ChunkPos >= ChunkSize) {
14616 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14617 ChunkPos = 0;
14618 }
14619 return &(ScheduleDataChunks.back()[ChunkPos++]);
14620 }
14621
// Makes sure instruction V lies inside the current scheduling region
// [ScheduleStart, ScheduleEnd), growing the region upwards or downwards in
// the basic block as needed. Returns false if growing the region would
// exceed ScheduleRegionSizeLimit.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
                                                      const InstructionsState &S) {
  // Already covered: ScheduleData for V (or its opcode-specific variant)
  // exists in the current region.
  if (getScheduleData(V, isOneOf(S, V)))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  // If I is already in the region, register extra ScheduleData for it keyed
  // by the bundle's main value (used below when isOneOf(S, I) != I).
  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
    ScheduleData *ISD = getScheduleData(I);
    if (!ISD)
      return false;
    assert(isInSchedulingRegion(ISD) &&
           "ScheduleData not in scheduling region");
    ScheduleData *SD = allocateScheduleDataChunks();
    SD->Inst = I;
    SD->init(SchedulingRegionID, S.OpValue);
    ExtraScheduleDataMap[I][S.OpValue] = SD;
    return true;
  };
  if (CheckScheduleForI(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
  // against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  // Walk one step in each direction per iteration until I is found (or a
  // block boundary is reached), charging one unit of budget per step.
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    // I is above the current region: extend the region start up to I.
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  // I is below the current region: extend the region end down past I.
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  if (isOneOf(S, I) != I)
    CheckScheduleForI(I);
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
14711
initScheduleData(Instruction * FromI,Instruction * ToI,ScheduleData * PrevLoadStore,ScheduleData * NextLoadStore)14712 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14713 Instruction *ToI,
14714 ScheduleData *PrevLoadStore,
14715 ScheduleData *NextLoadStore) {
14716 ScheduleData *CurrentLoadStore = PrevLoadStore;
14717 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14718 // No need to allocate data for non-schedulable instructions.
14719 if (doesNotNeedToBeScheduled(I))
14720 continue;
14721 ScheduleData *SD = ScheduleDataMap.lookup(I);
14722 if (!SD) {
14723 SD = allocateScheduleDataChunks();
14724 ScheduleDataMap[I] = SD;
14725 SD->Inst = I;
14726 }
14727 assert(!isInSchedulingRegion(SD) &&
14728 "new ScheduleData already in scheduling region");
14729 SD->init(SchedulingRegionID, I);
14730
14731 if (I->mayReadOrWriteMemory() &&
14732 (!isa<IntrinsicInst>(I) ||
14733 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14734 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14735 Intrinsic::pseudoprobe))) {
14736 // Update the linked list of memory accessing instructions.
14737 if (CurrentLoadStore) {
14738 CurrentLoadStore->NextLoadStore = SD;
14739 } else {
14740 FirstLoadStoreInRegion = SD;
14741 }
14742 CurrentLoadStore = SD;
14743 }
14744
14745 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14746 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14747 RegionHasStackSave = true;
14748 }
14749 if (NextLoadStore) {
14750 if (CurrentLoadStore)
14751 CurrentLoadStore->NextLoadStore = NextLoadStore;
14752 } else {
14753 LastLoadStoreInRegion = CurrentLoadStore;
14754 }
14755 }
14756
/// Computes (or re-computes) all dependencies - def-use, control and memory -
/// for the bundle \p SD and, transitively, for every bundle discovered as a
/// new dependency destination along the way. If \p InsertInReadyList is true,
/// \p SD is inserted into the ready list once it has valid dependencies and
/// none of them is unscheduled.
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      // Dependencies for this member are already known - nothing to do.
      if (BundleMember->hasValidDependencies())
        continue;

      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
                        << "\n");
      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      if (BundleMember->OpValue != BundleMember->Inst) {
        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      } else {
        for (User *U : BundleMember->Inst->users()) {
          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        }
      }

      // Records a control-dependence edge between BundleMember and I, and
      // queues I's bundle for dependency computation if it has none yet.
      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
            continue;

          // Add the dependency
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloc alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below stackrestore is currently
        // thought to be conservatism. Moving loads/stores below a stackrestore
        // can lead to incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      // Walk forward through the region's memory-access chain, adding a
      // dependency edge for each possibly-aliasing access.
      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));

        // We have two limits to reduce the complexity:
        // 1) AliasedCheckLimit: It's a small limit to reduce calls to
        //    SLP->isAliased (which is the expensive part in this loop).
        // 2) MaxMemDepDistance: It's for very large blocks and it aborts
        //    the whole loop (even if the loop is fast, it's quadratic).
        //    It's important for the loop break condition (see below) to
        //    check this limit even between two read-only instructions.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

          // We increment the counter only if the locations are aliased
          // (instead of counting all alias checks). This gives a better
          // balance between reduced runtime and accurate dependencies.
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }

        // Example, explaining the loop break condition: Let's assume our
        // starting instruction is i0 and MaxMemDepDistance = 3.
        //
        //                      +--------v--v--v
        //             i0,i1,i2,i3,i4,i5,i6,i7,i8
        //             +--------^--^--^
        //
        // MaxMemDepDistance let us stop alias-checking at i3 and we add
        // dependencies from i0 to i3,i4,.. (even if they are not aliased).
        // Previously we already added dependencies from i3 to i6,i7,i8
        // (because of MaxMemDepDistance). As we added a dependency from
        // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
        // and we can abort this loop at i6.
        if (DistToSrc >= 2 * MaxMemDepDistance)
          break;
        DistToSrc++;
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
                        << "\n");
    }
  }
}
14941
resetSchedule()14942 void BoUpSLP::BlockScheduling::resetSchedule() {
14943 assert(ScheduleStart &&
14944 "tried to reset schedule on block which has not been scheduled");
14945 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14946 doForAllOpcodes(I, [&](ScheduleData *SD) {
14947 assert(isInSchedulingRegion(SD) &&
14948 "ScheduleData not in scheduling region");
14949 SD->IsScheduled = false;
14950 SD->resetUnscheduledDeps();
14951 });
14952 }
14953 ReadyInsts.clear();
14954 }
14955
/// Performs the final scheduling of the block: resets any previous schedule,
/// recomputes dependencies for all bundled nodes, then repeatedly picks the
/// ready bundle with the smallest original-order priority and physically
/// moves its instructions into place.
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  // Nothing to do if this block was never (or already) scheduled.
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      (void)SDTE;
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      // Priority is the original program-order position of the bundle head.
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      // Only move the instruction if it is not already immediately before
      // the last scheduled one.
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
        assert(SD->IsScheduled && "must be scheduled at this point");
      }
    });
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
15040
/// Returns the preferred vector element size (in bits) for \p V, preferring
/// the width of memory operations (loads/extracts) feeding the expression
/// over the width of \p V's own type. Results are memoized in
/// InstrElementSize for every instruction visited.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  // Return a previously memoized result if we have one.
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent(), 0);
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    // Remember the first non-i1 value seen as a fallback width source.
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // Bound the traversal depth to keep compile time under control.
    if (Level > RecursionMaxDepth)
      continue;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent(), Level + 1);
            continue;
          }
        // Operand was not enqueued: still consider it for the fallback width.
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      // Unhandled instruction kind - abandon the traversal.
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  // Cache the computed width for every instruction we visited.
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
15123
/// Checks whether the scalars of tree entry \p E (and, recursively, its
/// operand entries) can be computed in the smaller integer bit-width
/// \p BitWidth, widening \p BitWidth as needed. Indices of demotable entries
/// are appended to \p ToDemote. \p MaxDepthLevel reports the depth of the
/// successfully analyzed chain; \p IsProfitableToDemote is sticky and becomes
/// true once a trunc/zext/sext that demotion would eliminate is seen.
/// Returns true if \p E can be demoted.
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // If the value is not a vectorized instruction in the expression and not used
  // by the insertelement instruction and not used in multiple vector nodes, it
  // cannot be demoted.
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  // Returns true if V can be computed in a narrower width; may widen
  // BitWidth (by reference) to the minimum width V actually needs.
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (MultiNodeScalars.contains(V))
      return false;
    // For lat shuffle of sext/zext with many uses need to check the extra bit
    // for unsigned values, otherwise may have incorrect casting for reused
    // scalars.
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      // Refine with demanded-bits information when V is an instruction.
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  // Last-chance analysis: only succeeds if a profitable trunc/ext was seen
  // and every scalar of E is potentially truncatable.
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  // Recurses into the given operand entries, accumulating the maximum depth.
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
                                 IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  // Doubles BitWidth until Checker accepts it (or OrigBitWidth is reached);
  // falls back to the best width for which FinalAnalysis succeeded.
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidth < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BitWidth >= OrigBitWidth) {
          if (BestFailBitwidth == 0) {
            BitWidth = OrigBitWidth;
            return false;
          }
          MaxDepthLevel = 1;
          BitWidth = BestFailBitwidth;
          NeedToExit = true;
          return true;
        }
        return false;
      };
  // Common driver used by the opcode cases below: verifies the bitwidth via
  // Checker, recurses into Operands, then records E as demotable.
  auto TryProcessInstruction =
      [&](unsigned &BitWidth,
          ArrayRef<const TreeEntry *> Operands = std::nullopt,
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands. Note that
  // we don't need to worry about cycles since we ensure single use above.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));

    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    // Only a small set of intrinsics (abs and signed/unsigned min/max) is
    // supported for demotion.
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                  nullptr, DT);
        unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
                                                  nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
15494
/// Returns the recurrence (reduction) kind matched for \p V; forward
/// declaration, definition appears later in this file.
static RecurKind getRdxKind(Value *V);
15496
// Walk the vectorizable tree and compute, for each demotable subtree, the
// minimum integer bit width the scalars can be narrowed to without changing
// results. Successful candidates are recorded in MinBWs (entry -> {bitwidth,
// is-signed}); ReductionBitWidth is updated for reduction-seeded graphs.
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  // Cheap early exit: for store/insertelement-seeded (or reduction) graphs
  // with at most one extra-bitwidth node and no cast pair whose size ratio
  // exceeds 2, narrowing cannot pay off.
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  // For a vectorized store/insertelement root, the interesting value node is
  // its operand (node 1), not the root itself.
  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx >
                                       static_cast<int>(NodeIdx);
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  // Returns the maximum bit width the scalars of node E can be demoted to
  // (0 means "do not demote"), filling ToDemote with the indices of all tree
  // entries that can be narrowed along with it.
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    // Reject users in other tree entries that would still need
                    // the wide value (wider type or wider MinBWs entry).
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      // Demote the gather node to the width already chosen (or implied) for
      // its single user.
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      // Clamp sub-byte (but non-boolean) widths up to a full byte.
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    // Previously analyzed and found unprofitable - bail out.
    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision. It would be safe to truncate the roots
    // of the expression to this width.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to False.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're not,
    // we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to the
      // original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      //
      // FIXME: This is somewhat suboptimal, as there will be cases where adding
      //        one to the maximum bit width will yield a larger-than-necessary
      //        type. In general, we need to add an extra bit only if we can't
      //        prove that the upper bit of the original type is equal to the
      //        upper bit of the proposed smaller type. If these two bits are
      //        the same (either zero or one) we know that sign-extending from
      //        the smaller type will result in the same value. Here, since we
      //        can't yet prove this, we are just making the proposed smaller
      //        type larger to ensure correctness.
      if (!IsKnownPositive)
        ++BitWidth1;

      // Also bound the width by the highest demanded bit of each root.
      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, but reduced type does not improve the reg
    // use - ignore it.
    if (NumParts > 1 &&
        NumParts ==
            TTI->getNumberOfParts(getWidenedType(
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote and
    // additional roots that require investigating in Roots.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
                               IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
                                               ->getOperand(0)
                                               ->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify.
  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    for (Value *V : *UserIgnoreList) {
      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
        ++BitWidth1;
      unsigned BitWidth2 = BitWidth1;
      // Min/max recurrences need all bits; for other kinds, use demanded bits.
      if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
        auto Mask = DB->getDemandedBits(cast<Instruction>(V));
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      }
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;

    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  // Skip over a run of vectorized trunc roots; they are demoted implicitly
  // once a narrower width is found for their operands.
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  // Main loop: analyze the current root node, then continue with the next
  // extra-bitwidth node (ExtraBitWidthNodes is consumed in increasing order).
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    // Keep ReductionBitWidth in sync with the width chosen for the root.
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    // Trunc roots skipped earlier can be demoted only if all high bits beyond
    // MaxBitWidth are provably zero.
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      // The next node is a trunc root if it feeds operand 0 of a non-alt
      // trunc user.
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      // Signed compares (or compares of possibly-negative operands) forbid the
      // zero-extension assumption in ComputeMaxBitWidth.
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the roots'
    // type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
15806
run(Function & F,FunctionAnalysisManager & AM)15807 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15808 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15809 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15810 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15811 auto *AA = &AM.getResult<AAManager>(F);
15812 auto *LI = &AM.getResult<LoopAnalysis>(F);
15813 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15814 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15815 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15816 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15817
15818 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15819 if (!Changed)
15820 return PreservedAnalyses::all();
15821
15822 PreservedAnalyses PA;
15823 PA.preserveSet<CFGAnalyses>();
15824 return PA;
15825 }
15826
// Shared implementation for the new-PM run() entry point. Caches the analysis
// pointers in pass members, then scans every block of F for vectorization
// seeds (stores, reduction chains, GEP indices). Returns true iff the IR was
// modified.
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  // Stash the analyses in members so the collectSeedInstructions /
  // vectorize* helpers below can reach them.
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  // Seed maps may hold state from a previous function; reset them.
  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
15905
// Attempts to vectorize a single chain of consecutive stores starting at
// offset Idx. Returns true if the chain was vectorized, false if it was
// analyzed and rejected (Size is then set to the tree size to guide later
// retries), and std::nullopt if the chain could not even be scheduled - the
// caller records that to avoid re-analyzing the same stores.
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
    // all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  // Collect the unique stored values; they form the operands of the tree root.
  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsPowerOf2 =
        isPowerOf2_32(ValOps.size()) ||
        (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
    // Reject chains whose operands have uses outside the chain (the scalars
    // would survive anyway) or too many distinct values with mixed opcodes.
    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
         (!S.MainOp->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      // Size 1/2 is a hint for the caller's retry logic, not a real tree size.
      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
      return false;
    }
  }
  // Load combining will handle this pattern better - leave it scalar.
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getTreeSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();
  R.transformNodes();

  Size = R.getTreeSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
15993
15994 /// Checks if the quadratic mean deviation is less than 90% of the mean size.
checkTreeSizes(ArrayRef<std::pair<unsigned,unsigned>> Sizes,bool First)15995 static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15996 bool First) {
15997 unsigned Num = 0;
15998 uint64_t Sum = std::accumulate(
15999 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16000 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16001 unsigned Size = First ? Val.first : Val.second;
16002 if (Size == 1)
16003 return V;
16004 ++Num;
16005 return V + Size;
16006 });
16007 if (Num == 0)
16008 return true;
16009 uint64_t Mean = Sum / Num;
16010 if (Mean == 0)
16011 return true;
16012 uint64_t Dev = std::accumulate(
16013 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
16014 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
16015 unsigned P = First ? Val.first : Val.second;
16016 if (P == 1)
16017 return V;
16018 return V + (P - Mean) * (P - Mean);
16019 }) /
16020 Num;
16021 return Dev * 81 / (Mean * Mean) == 0;
16022 }
16023
vectorizeStores(ArrayRef<StoreInst * > Stores,BoUpSLP & R,DenseSet<std::tuple<Value *,Value *,Value *,Value *,unsigned>> & Visited)16024 bool SLPVectorizerPass::vectorizeStores(
16025 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16026 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16027 &Visited) {
16028 // We may run into multiple chains that merge into a single chain. We mark the
16029 // stores that we vectorized so that we don't visit the same store twice.
16030 BoUpSLP::ValueSet VectorizedStores;
16031 bool Changed = false;
16032
16033 struct StoreDistCompare {
16034 bool operator()(const std::pair<unsigned, int> &Op1,
16035 const std::pair<unsigned, int> &Op2) const {
16036 return Op1.second < Op2.second;
16037 }
16038 };
16039 // A set of pairs (index of store in Stores array ref, Distance of the store
16040 // address relative to base store address in units).
16041 using StoreIndexToDistSet =
16042 std::set<std::pair<unsigned, int>, StoreDistCompare>;
16043 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16044 int PrevDist = -1;
16045 BoUpSLP::ValueList Operands;
16046 // Collect the chain into a list.
16047 for (auto [Idx, Data] : enumerate(Set)) {
16048 if (Operands.empty() || Data.second - PrevDist == 1) {
16049 Operands.push_back(Stores[Data.first]);
16050 PrevDist = Data.second;
16051 if (Idx != Set.size() - 1)
16052 continue;
16053 }
16054 auto E = make_scope_exit([&, &DataVar = Data]() {
16055 Operands.clear();
16056 Operands.push_back(Stores[DataVar.first]);
16057 PrevDist = DataVar.second;
16058 });
16059
16060 if (Operands.size() <= 1 ||
16061 !Visited
16062 .insert({Operands.front(),
16063 cast<StoreInst>(Operands.front())->getValueOperand(),
16064 Operands.back(),
16065 cast<StoreInst>(Operands.back())->getValueOperand(),
16066 Operands.size()})
16067 .second)
16068 continue;
16069
16070 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16071 unsigned EltSize = R.getVectorElementSize(Operands[0]);
16072 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
16073
16074 unsigned MaxVF =
16075 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
16076 unsigned MaxRegVF = MaxVF;
16077 auto *Store = cast<StoreInst>(Operands[0]);
16078 Type *StoreTy = Store->getValueOperand()->getType();
16079 Type *ValueTy = StoreTy;
16080 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
16081 ValueTy = Trunc->getSrcTy();
16082 if (ValueTy == StoreTy &&
16083 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
16084 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
16085 unsigned MinVF = std::max<unsigned>(
16086 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
16087 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
16088 ValueTy)));
16089
16090 if (MaxVF < MinVF) {
16091 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16092 << ") < "
16093 << "MinVF (" << MinVF << ")\n");
16094 continue;
16095 }
16096
16097 unsigned NonPowerOf2VF = 0;
16098 if (VectorizeNonPowerOf2) {
16099 // First try vectorizing with a non-power-of-2 VF. At the moment, only
16100 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16101 // lanes are used.
16102 unsigned CandVF = Operands.size();
16103 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
16104 NonPowerOf2VF = CandVF;
16105 }
16106
16107 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
16108 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16109 unsigned Size = MinVF;
16110 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
16111 VF = Size > MaxVF ? NonPowerOf2VF : Size;
16112 Size *= 2;
16113 });
16114 unsigned End = Operands.size();
16115 unsigned Repeat = 0;
16116 constexpr unsigned MaxAttempts = 4;
16117 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
16118 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
16119 P.first = P.second = 1;
16120 });
16121 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
16122 auto IsNotVectorized = [](bool First,
16123 const std::pair<unsigned, unsigned> &P) {
16124 return First ? P.first > 0 : P.second > 0;
16125 };
16126 auto IsVectorized = [](bool First,
16127 const std::pair<unsigned, unsigned> &P) {
16128 return First ? P.first == 0 : P.second == 0;
16129 };
16130 auto VFIsProfitable = [](bool First, unsigned Size,
16131 const std::pair<unsigned, unsigned> &P) {
16132 return First ? Size >= P.first : Size >= P.second;
16133 };
16134 auto FirstSizeSame = [](unsigned Size,
16135 const std::pair<unsigned, unsigned> &P) {
16136 return Size == P.first;
16137 };
16138 while (true) {
16139 ++Repeat;
16140 bool RepeatChanged = false;
16141 bool AnyProfitableGraph = false;
16142 for (unsigned Size : CandidateVFs) {
16143 AnyProfitableGraph = false;
16144 unsigned StartIdx = std::distance(
16145 RangeSizes.begin(),
16146 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
16147 std::placeholders::_1)));
16148 while (StartIdx < End) {
16149 unsigned EndIdx =
16150 std::distance(RangeSizes.begin(),
16151 find_if(RangeSizes.drop_front(StartIdx),
16152 std::bind(IsVectorized, Size >= MaxRegVF,
16153 std::placeholders::_1)));
16154 unsigned Sz = EndIdx >= End ? End : EndIdx;
16155 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16156 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
16157 Size >= MaxRegVF)) {
16158 ++Cnt;
16159 continue;
16160 }
16161 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
16162 assert(all_of(Slice,
16163 [&](Value *V) {
16164 return cast<StoreInst>(V)
16165 ->getValueOperand()
16166 ->getType() ==
16167 cast<StoreInst>(Slice.front())
16168 ->getValueOperand()
16169 ->getType();
16170 }) &&
16171 "Expected all operands of same type.");
16172 if (!NonSchedulable.empty()) {
16173 auto [NonSchedSizeMax, NonSchedSizeMin] =
16174 NonSchedulable.lookup(Slice.front());
16175 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16176 Cnt += NonSchedSizeMax;
16177 continue;
16178 }
16179 }
16180 unsigned TreeSize;
16181 std::optional<bool> Res =
16182 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
16183 if (!Res) {
16184 NonSchedulable
16185 .try_emplace(Slice.front(), std::make_pair(Size, Size))
16186 .first->getSecond()
16187 .second = Size;
16188 } else if (*Res) {
16189 // Mark the vectorized stores so that we don't vectorize them
16190 // again.
16191 VectorizedStores.insert(Slice.begin(), Slice.end());
16192 // Mark the vectorized stores so that we don't vectorize them
16193 // again.
16194 AnyProfitableGraph = RepeatChanged = Changed = true;
16195 // If we vectorized initial block, no need to try to vectorize
16196 // it again.
16197 for_each(RangeSizes.slice(Cnt, Size),
16198 [](std::pair<unsigned, unsigned> &P) {
16199 P.first = P.second = 0;
16200 });
16201 if (Cnt < StartIdx + MinVF) {
16202 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
16203 [](std::pair<unsigned, unsigned> &P) {
16204 P.first = P.second = 0;
16205 });
16206 StartIdx = Cnt + Size;
16207 }
16208 if (Cnt > Sz - Size - MinVF) {
16209 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
16210 [](std::pair<unsigned, unsigned> &P) {
16211 P.first = P.second = 0;
16212 });
16213 if (Sz == End)
16214 End = Cnt;
16215 Sz = Cnt;
16216 }
16217 Cnt += Size;
16218 continue;
16219 }
16220 if (Size > 2 && Res &&
16221 !all_of(RangeSizes.slice(Cnt, Size),
16222 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
16223 std::placeholders::_1))) {
16224 Cnt += Size;
16225 continue;
16226 }
16227 // Check for the very big VFs that we're not rebuilding same
16228 // trees, just with larger number of elements.
16229 if (Size > MaxRegVF && TreeSize > 1 &&
16230 all_of(RangeSizes.slice(Cnt, Size),
16231 std::bind(FirstSizeSame, TreeSize,
16232 std::placeholders::_1))) {
16233 Cnt += Size;
16234 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16235 ++Cnt;
16236 continue;
16237 }
16238 if (TreeSize > 1)
16239 for_each(RangeSizes.slice(Cnt, Size),
16240 [&](std::pair<unsigned, unsigned> &P) {
16241 if (Size >= MaxRegVF)
16242 P.second = std::max(P.second, TreeSize);
16243 else
16244 P.first = std::max(P.first, TreeSize);
16245 });
16246 ++Cnt;
16247 AnyProfitableGraph = true;
16248 }
16249 if (StartIdx >= End)
16250 break;
16251 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16252 AnyProfitableGraph = true;
16253 StartIdx = std::distance(
16254 RangeSizes.begin(),
16255 find_if(RangeSizes.drop_front(Sz),
16256 std::bind(IsNotVectorized, Size >= MaxRegVF,
16257 std::placeholders::_1)));
16258 }
16259 if (!AnyProfitableGraph && Size >= MaxRegVF)
16260 break;
16261 }
16262 // All values vectorized - exit.
16263 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
16264 return P.first == 0 && P.second == 0;
16265 }))
16266 break;
16267 // Check if tried all attempts or no need for the last attempts at all.
16268 if (Repeat >= MaxAttempts ||
16269 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16270 break;
16271 constexpr unsigned StoresLimit = 64;
16272 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
16273 Operands.size(),
16274 static_cast<unsigned>(
16275 End -
16276 std::distance(
16277 RangeSizes.begin(),
16278 find_if(RangeSizes, std::bind(IsNotVectorized, true,
16279 std::placeholders::_1))) +
16280 1)));
16281 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
16282 if (VF > MaxTotalNum || VF >= StoresLimit)
16283 break;
16284 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
16285 if (P.first != 0)
16286 P.first = std::max(P.second, P.first);
16287 });
16288 // Last attempt to vectorize max number of elements, if all previous
16289 // attempts were unsuccessful because of the cost issues.
16290 CandidateVFs.clear();
16291 CandidateVFs.push_back(VF);
16292 }
16293 }
16294 };
16295
16296 // Stores pair (first: index of the store into Stores array ref, address of
16297 // which taken as base, second: sorted set of pairs {index, dist}, which are
16298 // indices of stores in the set and their store location distances relative to
16299 // the base address).
16300
16301 // Need to store the index of the very first store separately, since the set
16302 // may be reordered after the insertion and the first store may be moved. This
16303 // container allows to reduce number of calls of getPointersDiff() function.
16304 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16305 // Inserts the specified store SI with the given index Idx to the set of the
16306 // stores. If the store with the same distance is found already - stop
16307 // insertion, try to vectorize already found stores. If some stores from this
16308 // sequence were not vectorized - try to vectorize them with the new store
16309 // later. But this logic is applied only to the stores, that come before the
16310 // previous store with the same distance.
16311 // Example:
16312 // 1. store x, %p
16313 // 2. store y, %p+1
16314 // 3. store z, %p+2
16315 // 4. store a, %p
16316 // 5. store b, %p+3
16317 // - Scan this from the last to first store. The very first bunch of stores is
16318 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16319 // vector).
16320 // - The next store in the list - #1 - has the same distance from store #5 as
16321 // the store #4.
16322 // - Try to vectorize sequence of stores 4,2,3,5.
16323 // - If all these stores are vectorized - just drop them.
16324 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16325 // - Start new stores sequence.
16326 // The new bunch of stores is {1, {1, 0}}.
16327 // - Add the stores from previous sequence, that were not vectorized.
16328 // Here we consider the stores in the reversed order, rather they are used in
16329 // the IR (Stores are reversed already, see vectorizeStoreChains() function).
16330 // Store #3 can be added -> comes after store #4 with the same distance as
16331 // store #1.
16332 // Store #5 cannot be added - comes before store #4.
16333 // This logic allows to improve the compile time, we assume that the stores
16334 // after previous store with the same distance most likely have memory
16335 // dependencies and no need to waste compile time to try to vectorize them.
16336 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16337 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16338 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16339 std::optional<int> Diff = getPointersDiff(
16340 Stores[Set.first]->getValueOperand()->getType(),
16341 Stores[Set.first]->getPointerOperand(),
16342 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16343 /*StrictCheck=*/true);
16344 if (!Diff)
16345 continue;
16346 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16347 if (It == Set.second.end()) {
16348 Set.second.emplace(Idx, *Diff);
16349 return;
16350 }
16351 // Try to vectorize the first found set to avoid duplicate analysis.
16352 TryToVectorize(Set.second);
16353 StoreIndexToDistSet PrevSet;
16354 PrevSet.swap(Set.second);
16355 Set.first = Idx;
16356 Set.second.emplace(Idx, 0);
16357 // Insert stores that followed previous match to try to vectorize them
16358 // with this store.
16359 unsigned StartIdx = It->first + 1;
16360 SmallBitVector UsedStores(Idx - StartIdx);
16361 // Distances to previously found dup store (or this store, since they
16362 // store to the same addresses).
16363 SmallVector<int> Dists(Idx - StartIdx, 0);
16364 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16365 // Do not try to vectorize sequences, we already tried.
16366 if (Pair.first <= It->first ||
16367 VectorizedStores.contains(Stores[Pair.first]))
16368 break;
16369 unsigned BI = Pair.first - StartIdx;
16370 UsedStores.set(BI);
16371 Dists[BI] = Pair.second - It->second;
16372 }
16373 for (unsigned I = StartIdx; I < Idx; ++I) {
16374 unsigned BI = I - StartIdx;
16375 if (UsedStores.test(BI))
16376 Set.second.emplace(I, Dists[BI]);
16377 }
16378 return;
16379 }
16380 auto &Res = SortedStores.emplace_back();
16381 Res.first = Idx;
16382 Res.second.emplace(Idx, 0);
16383 };
16384 Type *PrevValTy = nullptr;
16385 for (auto [I, SI] : enumerate(Stores)) {
16386 if (R.isDeleted(SI))
16387 continue;
16388 if (!PrevValTy)
16389 PrevValTy = SI->getValueOperand()->getType();
16390 // Check that we do not try to vectorize stores of different types.
16391 if (PrevValTy != SI->getValueOperand()->getType()) {
16392 for (auto &Set : SortedStores)
16393 TryToVectorize(Set.second);
16394 SortedStores.clear();
16395 PrevValTy = SI->getValueOperand()->getType();
16396 }
16397 FillStoresSet(I, SI);
16398 }
16399
16400 // Final vectorization attempt.
16401 for (auto &Set : SortedStores)
16402 TryToVectorize(Set.second);
16403
16404 return Changed;
16405 }
16406
collectSeedInstructions(BasicBlock * BB)16407 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16408 // Initialize the collections. We will make a single pass over the block.
16409 Stores.clear();
16410 GEPs.clear();
16411
16412 // Visit the store and getelementptr instructions in BB and organize them in
16413 // Stores and GEPs according to the underlying objects of their pointer
16414 // operands.
16415 for (Instruction &I : *BB) {
16416 // Ignore store instructions that are volatile or have a pointer operand
16417 // that doesn't point to a scalar type.
16418 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16419 if (!SI->isSimple())
16420 continue;
16421 if (!isValidElementType(SI->getValueOperand()->getType()))
16422 continue;
16423 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16424 }
16425
16426 // Ignore getelementptr instructions that have more than one index, a
16427 // constant index, or a pointer operand that doesn't point to a scalar
16428 // type.
16429 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16430 if (GEP->getNumIndices() != 1)
16431 continue;
16432 Value *Idx = GEP->idx_begin()->get();
16433 if (isa<Constant>(Idx))
16434 continue;
16435 if (!isValidElementType(Idx->getType()))
16436 continue;
16437 if (GEP->getType()->isVectorTy())
16438 continue;
16439 GEPs[GEP->getPointerOperand()].push_back(GEP);
16440 }
16441 }
16442 }
16443
/// Try to vectorize the list of scalars \p VL as one or more consecutive
/// bundles, trying the widest power-of-2 VF first and halving on failure.
/// \returns true if at least one bundle was vectorized.
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  // Nothing to pack with fewer than two scalars.
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = cast<Instruction>(S.OpValue);
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  // Candidate VFs are powers of two between MinVF and the largest power of
  // two not exceeding the list length, clamped by the target's maximum VF
  // for this element size and opcode.
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  Type *ScalarTy = VL[0]->getType();
  // For insertelement seeds, the element type to widen is the type of the
  // inserted scalar (operand 1), not the insert's own vector type.
  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
    ScalarTy = IE->getOperand(1)->getType();

  // Outer loop: try the widest VF first, halving each iteration.
  // Inner loop: slide a window of up to VF scalars over the (remaining) list.
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen, if number of parts is the same as
    // provided vectorization factor (i.e. the scalar type is used for vector
    // code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!isPowerOf2_32(ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      // Stop once the remaining tail would have been handled by a smaller VF
      // iteration (or is too small to vectorize at all).
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
      // Check that a previous iteration of this loop did not delete the Value.
      if (llvm::any_of(Ops, [&R](Value *V) {
            auto *I = dyn_cast<Instruction>(V);
            return I && R.isDeleted(I);
          }))
        continue;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      R.transformNodes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    // NOTE(review): "Treshold" is a long-standing misspelled remark key;
    // renaming it would change remark output possibly consumed by tooling.
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
16578
tryToVectorize(Instruction * I,BoUpSLP & R)16579 bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16580 if (!I)
16581 return false;
16582
16583 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16584 return false;
16585
16586 Value *P = I->getParent();
16587
16588 // Vectorize in current basic block only.
16589 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16590 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16591 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16592 return false;
16593
16594 // First collect all possible candidates
16595 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16596 Candidates.emplace_back(Op0, Op1);
16597
16598 auto *A = dyn_cast<BinaryOperator>(Op0);
16599 auto *B = dyn_cast<BinaryOperator>(Op1);
16600 // Try to skip B.
16601 if (A && B && B->hasOneUse()) {
16602 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16603 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16604 if (B0 && B0->getParent() == P)
16605 Candidates.emplace_back(A, B0);
16606 if (B1 && B1->getParent() == P)
16607 Candidates.emplace_back(A, B1);
16608 }
16609 // Try to skip A.
16610 if (B && A && A->hasOneUse()) {
16611 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16612 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16613 if (A0 && A0->getParent() == P)
16614 Candidates.emplace_back(A0, B);
16615 if (A1 && A1->getParent() == P)
16616 Candidates.emplace_back(A1, B);
16617 }
16618
16619 if (Candidates.size() == 1)
16620 return tryToVectorizeList({Op0, Op1}, R);
16621
16622 // We have multiple options. Try to pick the single best.
16623 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16624 if (!BestCandidate)
16625 return false;
16626 return tryToVectorizeList(
16627 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16628 }
16629
16630 namespace {
16631
16632 /// Model horizontal reductions.
16633 ///
16634 /// A horizontal reduction is a tree of reduction instructions that has values
16635 /// that can be put into a vector as its leaves. For example:
16636 ///
16637 /// mul mul mul mul
16638 /// \ / \ /
16639 /// + +
16640 /// \ /
16641 /// +
16642 /// This tree has "mul" as its leaf values and "+" as its reduction
16643 /// instructions. A reduction can feed into a store or a binary operation
16644 /// feeding a phi.
16645 /// ...
16646 /// \ /
16647 /// +
16648 /// |
16649 /// phi +=
16650 ///
16651 /// Or:
16652 /// ...
16653 /// \ /
16654 /// +
16655 /// |
16656 /// *p =
16657 ///
16658 class HorizontalReduction {
16659 using ReductionOpsType = SmallVector<Value *, 16>;
16660 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16661 ReductionOpsListType ReductionOps;
16662 /// List of possibly reduced values.
16663 SmallVector<SmallVector<Value *>> ReducedVals;
16664 /// Maps reduced value to the corresponding reduction operation.
16665 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16666 // Use map vector to make stable output.
16667 MapVector<Instruction *, Value *> ExtraArgs;
16668 WeakTrackingVH ReductionRoot;
16669 /// The type of reduction operation.
16670 RecurKind RdxKind;
16671 /// Checks if the optimization of original scalar identity operations on
16672 /// matched horizontal reductions is enabled and allowed.
16673 bool IsSupportedHorRdxIdentityOp = false;
16674
isCmpSelMinMax(Instruction * I)16675 static bool isCmpSelMinMax(Instruction *I) {
16676 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16677 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16678 }
16679
16680 // And/or are potentially poison-safe logical patterns like:
16681 // select x, y, false
16682 // select x, true, y
isBoolLogicOp(Instruction * I)16683 static bool isBoolLogicOp(Instruction *I) {
16684 return isa<SelectInst>(I) &&
16685 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16686 }
16687
16688 /// Checks if instruction is associative and can be vectorized.
isVectorizable(RecurKind Kind,Instruction * I)16689 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16690 if (Kind == RecurKind::None)
16691 return false;
16692
16693 // Integer ops that map to select instructions or intrinsics are fine.
16694 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16695 isBoolLogicOp(I))
16696 return true;
16697
16698 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16699 // FP min/max are associative except for NaN and -0.0. We do not
16700 // have to rule out -0.0 here because the intrinsic semantics do not
16701 // specify a fixed result for it.
16702 return I->getFastMathFlags().noNaNs();
16703 }
16704
16705 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16706 return true;
16707
16708 return I->isAssociative();
16709 }
16710
getRdxOperand(Instruction * I,unsigned Index)16711 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16712 // Poison-safe 'or' takes the form: select X, true, Y
16713 // To make that work with the normal operand processing, we skip the
16714 // true value operand.
16715 // TODO: Change the code and data structures to handle this without a hack.
16716 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16717 return I->getOperand(2);
16718 return I->getOperand(Index);
16719 }
16720
16721 /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
    switch (Kind) {
    case RecurKind::Or:
      // Emit the poison-safe select form (select LHS, true, RHS) when
      // requested and the operands have an i1-compatible type.
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::And:
      // Poison-safe form: select LHS, RHS, false.
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      // Plain binary operators map directly to the recurrence opcode.
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    // Integer min/max: emit a cmp+select pair if the reduction was matched
    // in that form, otherwise use the corresponding intrinsic.
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
    case RecurKind::UMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
    case RecurKind::UMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }
16781
16782 /// Creates reduction operation with the current opcode with the IR flags
16783 /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    // Use the cmp+select form when the matched reduction consisted of
    // separate cmp and select instructions (two op lists), or when it was a
    // logical and/or expressed as a select.
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        // Propagate flags onto the new cmp and select separately, from the
        // matched cmp list and select list respectively (nuw/nsw dropped).
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
16807
16808 public:
  /// Infer the reduction kind represented by \p V by pattern-matching its
  /// instruction form (binary op, min/max intrinsic, or cmp+select idiom).
  /// \returns RecurKind::None when \p V does not look like a reduction op.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;

    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1 = extractelement <2 x i32> %a, i32 0
      // %2 = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3 = extractelement <2 x i32> %a, i32 0
      // %4 = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      // Accept the select arms either directly as the compare operands or as
      // duplicated-but-identical extractelements of the compare operands.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      // Map the matched compare predicate to the min/max kind it implements.
      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
16909
16910 /// Get the index of the first operand.
getFirstOperandIndex(Instruction * I)16911 static unsigned getFirstOperandIndex(Instruction *I) {
16912 return isCmpSelMinMax(I) ? 1 : 0;
16913 }
16914
16915 private:
16916 /// Total number of operands in the reduction operation.
getNumberOfOperands(Instruction * I)16917 static unsigned getNumberOfOperands(Instruction *I) {
16918 return isCmpSelMinMax(I) ? 3 : 2;
16919 }
16920
16921 /// Checks if the instruction is in basic block \p BB.
16922 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
hasSameParent(Instruction * I,BasicBlock * BB)16923 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16924 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16925 auto *Sel = cast<SelectInst>(I);
16926 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16927 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16928 }
16929 return I->getParent() == BB;
16930 }
16931
16932 /// Expected number of uses for reduction operations/reduced values.
hasRequiredNumberOfUses(bool IsCmpSelMinMax,Instruction * I)16933 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16934 if (IsCmpSelMinMax) {
16935 // SelectInst must be used twice while the condition op must have single
16936 // use only.
16937 if (auto *Sel = dyn_cast<SelectInst>(I))
16938 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16939 return I->hasNUses(2);
16940 }
16941
16942 // Arithmetic reduction operation must be used once only.
16943 return I->hasOneUse();
16944 }
16945
16946 /// Initializes the list of reduction operations.
initReductionOps(Instruction * I)16947 void initReductionOps(Instruction *I) {
16948 if (isCmpSelMinMax(I))
16949 ReductionOps.assign(2, ReductionOpsType());
16950 else
16951 ReductionOps.assign(1, ReductionOpsType());
16952 }
16953
16954 /// Add all reduction operations for the reduction instruction \p I.
addReductionOps(Instruction * I)16955 void addReductionOps(Instruction *I) {
16956 if (isCmpSelMinMax(I)) {
16957 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16958 ReductionOps[1].emplace_back(I);
16959 } else {
16960 ReductionOps[0].emplace_back(I);
16961 }
16962 }
16963
isGoodForReduction(ArrayRef<Value * > Data)16964 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16965 int Sz = Data.size();
16966 auto *I = dyn_cast<Instruction>(Data.front());
16967 return Sz > 1 || isConstant(Data.front()) ||
16968 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16969 }
16970
16971 public:
16972 HorizontalReduction() = default;
16973
16974 /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition must
    // have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<Instruction *> Worklist(1, Root);
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &ExtraArgs,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // Edge has wrong parent - mark as an extra argument.
        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
            !hasSameParent(EdgeInst, BB)) {
          ExtraArgs.push_back(EdgeVal);
          continue;
        }
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
    // instruction op id and/or alternate op id, plus do extra analysis for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    // Subkey generator for loads: loads whose pointers are provably related
    // (constant distance or compatible bases) hash to the same subkey so
    // they land in the same candidate group.
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (LoadKeyUsed.contains(Key)) {
        auto LIt = LoadsMap.find(Ptr);
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(RLI->getPointerOperand());
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    // Walk the reduction tree from the root, classifying each operand edge
    // as a further reduction op, a reduced value, or an extra argument.
    while (!Worklist.empty()) {
      Instruction *TreeN = Worklist.pop_back_val();
      SmallVector<Value *> Args;
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If too many extra args - mark the instruction itself as a reduction
      // value, not a reduction operation.
      if (Args.size() < 2) {
        addReductionOps(TreeN);
        // Add extra args.
        if (!Args.empty()) {
          assert(Args.size() == 1 && "Expected only single argument.");
          ExtraArgs[TreeN] = Args.front();
        }
        // Add reduction values. The values are sorted for better vectorization
        // results.
        for (Value *V : PossibleRedVals) {
          size_t Key, Idx;
          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                 /*AllowAlternate=*/false);
          ++PossibleReducedVals[Key][Idx]
                .insert(std::make_pair(V, 0))
                .first->second;
        }
        Worklist.append(PossibleReductionOps.rbegin(),
                        PossibleReductionOps.rend());
      } else {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(TreeN, 0))
              .first->second;
      }
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      // Start a new ReducedVals entry for each group unless the group can be
      // appended to the previous entry (e.g. single loads from the same
      // underlying object).
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by number of same/alternate opcode and/or pointer
    // operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
17153
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  ///
  /// Walks the groups of compatible reduced values collected earlier, tries to
  /// vectorize each group (restarting with smaller vector factors on failure),
  /// and finally stitches the vectorized sub-reductions, the not-vectorized
  /// scalars and the extra arguments back into a single scalar result that
  /// replaces the original reduction root.
  ///
  /// \param V   SLP tree builder used for each vectorization attempt.
  /// \param DL  Module data layout (used by the constant-folding builder).
  /// \param TTI Cost model used to decide profitability.
  /// \param TLI Library info forwarded to opcode-compatibility analysis.
  /// \returns the value of the final reduction, or nullptr if no profitable
  /// vectorization was found.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI) {
    constexpr int ReductionLimit = 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    unsigned NumReducedVals =
        std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
                        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
                          if (!isGoodForReduction(Vals))
                            return Num;
                          return Num + Vals.size();
                        });
    if (NumReducedVals < ReductionLimit &&
        (!AllowHorRdxIdenityOptimization ||
         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
           return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
         }))) {
      // Not enough values and no splat-of-constant shortcut: remember the
      // reduction ops so we do not re-analyze this root again.
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    // TargetFolder-backed builder: trivially foldable ops are constant-folded
    // as they are created.
    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case if they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(
        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
    SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
    // The same extra argument may be used several times, so log each attempt
    // to use it.
    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
      assert(Pair.first && "DebugLoc must be set.");
      ExternallyUsedValues[Pair.second].push_back(Pair.first);
      TrackedVals.try_emplace(Pair.second, Pair.second);
    }

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    // Return new VectorizedTree, based on previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        // For bool logic ops, prefer the operand that is known non-poison on
        // the left so the freeze-based poison containment stays valid.
        if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
            (isGuaranteedNotToBePoison(Res) &&
             !isGuaranteedNotToBePoison(VectorizedTree))) {
          auto It = ReducedValsToOps.find(Res);
          if (It != ReducedValsToOps.end() &&
              any_of(It->getSecond(),
                     [](Instruction *I) { return isBoolLogicOp(I); }))
            std::swap(VectorizedTree, Res);
        }

        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    bool AnyBoolLogicOp =
        any_of(ReductionOps.back(), [](Value *V) {
          return isBoolLogicOp(cast<Instruction>(V));
        });
    // The reduction root is used as the insertion point for new instructions,
    // so set it as externally used to prevent it from being deleted.
    // (operator[] default-constructs the entry, which is all that is needed.)
    ExternallyUsedValues[ReductionRoot];
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract if fully vectorized in other trees.
    SmallPtrSet<Value *, 4> RequiredExtract;
    Value *VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
        // Check if the reduction value was not overridden by the extractelement
        // instruction because of the vectorization and exclude it, if it is not
        // compatible with other values.
        // Also check if the instruction was folded to constant/other value.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
            (S.getOpcode() && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements: if this group and the next one
      // both consist of extracts forming a fixed-vector shuffle, merge them.
      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
          I + 1 < E) {
        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
        if (NextS.getOpcode() == Instruction::ExtractElement &&
            !NextS.isAltShuffle()) {
          SmallVector<Value *> CommonCandidates(Candidates);
          for (Value *RV : ReducedVals[I + 1]) {
            Value *RdxVal = TrackedVals.find(RV)->second;
            // Check if the reduction value was not overridden by the
            // extractelement instruction because of the vectorization and
            // exclude it, if it is not compatible with other values.
            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
                continue;
            CommonCandidates.push_back(RdxVal);
            TrackedToOrig.try_emplace(RdxVal, RV);
          }
          SmallVector<int> Mask;
          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
            // Consume the next group as part of this one.
            ++I;
            Candidates.swap(CommonCandidates);
            ShuffledExtracts = true;
          }
        }
      }

      // Emit code for constant values.
      if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
          allConstant(Candidates)) {
        Value *Res = Candidates.front();
        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      // NOTE: shadows the outer NumReducedVals; from here on it is the size of
      // the current candidate group only.
      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
           !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization of
      // original scalar identity operations on matched horizontal reductions).
      IsSupportedHorRdxIdentityOp =
          AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      MapVector<Value *, unsigned> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates)
          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
      // Used to check if the reduced values used same number of times. In this
      // case the compiler may produce better code. E.g. if reduced values are
      // aabbccdd (8 x values), then the first node of the tree will have a node
      // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
      // Plus, the final reduction will be performed on <8 x aabbccdd>.
      // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
      // x abcd) * 2.
      // Currently it only handles add/fadd/xor. and/or/min/max do not require
      // this analysis, other operations may require an extra estimation of
      // the profitability.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        // Deduplicate: keep one candidate per unique value.
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [](const auto &P) { return P.first; });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
          unsigned Cnt = SameValuesCounter.lookup(OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      unsigned MaxElts =
          RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned ReduxWidth = std::min<unsigned>(
          llvm::bit_floor(NumReducedVals),
          std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
                               RegMaxNumber * RedValsMaxNumber));
      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      // Advance the sliding window; when the window runs off the end, halve
      // the vector factor and restart. Returns whether any of the reduction
      // ops were gathered in the failed attempt.
      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
                                  &CheckForReusedReductionOpsLocal,
                                  &PrevReduxWidth, &V,
                                  &IgnoreList](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        ReduxWidth /= 2;
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Being analyzed already - skip.
        if (V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues[TrackedVals[V]];
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.find(V)->second;
            ++SameValuesCounter[OrigV];
          }
        }
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues[RdxVal];
            continue;
          }
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          unsigned NumOps =
              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
          if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
            LocalExternallyUsedValues[RdxVal];
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for the scalar, replaced by
        // extractelement instructions.
        DenseMap<Value *, Value *> ReplacementToExternal;
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
          ReplacementToExternal.try_emplace(Pair.second, Pair.first);
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
          Value *Ext = Pair.first;
          // Follow replacement chains back to the original external value.
          auto RIt = ReplacementToExternal.find(Ext);
          while (RIt != ReplacementToExternal.end()) {
            Ext = RIt->second;
            RIt = ReplacementToExternal.find(Ext);
          }
          auto *It = ExternallyUsedValues.find(Ext);
          if (It == ExternallyUsedValues.end())
            continue;
          LocalExternallyUsedValues[Pair.second].append(It->second);
        }
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();
        V.transformNodes();

        // Estimate cost.
        InstructionCost TreeCost = V.getTreeCost(VL);
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
        InstructionCost Cost = TreeCost + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(
                       SV_NAME, "HorSLPNotBeneficial",
                       ReducedValsToOps.find(VL[0])->second.front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(
                     SV_NAME, "VectorizedHorizontalReduction",
                     ReducedValsToOps.find(VL[0])->second.front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });

        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a select (min/max idiom), the insert
        // point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
                                                ReplacedExternals, InsertPt);

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must be
        // frozen.
        if ((isBoolLogicOp(RdxRootInst) ||
             (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
            !isGuaranteedNotToBePoison(VectorizedRoot))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
        // Types differ when minimum-bitwidth analysis shrank the tree; cast
        // the reduced value back to the scalar type of the reduction.
        if (ReducedSubTree->getType() != VL.front()->getType()) {
          assert(ReducedSubTree->getType() != VL.front()->getType() &&
                 "Expected different reduction type.");
          ReducedSubTree =
              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                    V.isSignedMinBitwidthRootNode());
        }

        // Improved analysis for add/fadd/xor reductions with same scale factor
        // for all operands of reductions. We can emit scalar ops for them
        // instead.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from final reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        // Nothing was vectorized but we still deduplicated: emit the scaled
        // scalar ops for every unique value instead.
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          Value *OrigV = TrackedToOrig.find(P.first)->second;
          VectorizedVals.try_emplace(OrigV, P.second);
        }
        continue;
      }
    }
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      // I.e., if we have original code like this:
      // RedOp1 = select i1 ?, i1 LHS, i1 false
      // RedOp2 = select i1 RHS, i1 ?, i1 false

      // Then, we swap LHS/RHS to create a new op that matches the poison
      // semantics of the original code.

      // If we have original code like this and both values could be poison:
      // RedOp1 = select i1 ?, i1 LHS, i1 false
      // RedOp2 = select i1 ?, i1 RHS, i1 false

      // Then, we must freeze LHS in the new op.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) &&
            ((!InitStep && LHS == VectorizedTree) ||
             getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction.
      // Need to add extra arguments and not vectorized possible reduction
      // values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
      // Pairwise combine step: halves the list each invocation.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                                     Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                RedOp, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.find(RdxVal)->second)
                   .drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      for (auto &Pair : ExternallyUsedValues) {
        // Add each externally used value to the final reduction.
        for (auto *I : Pair.second)
          ExtraReductions.emplace_back(I, Pair.first);
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself.  Assert that we got this
      // correct, replace internal uses with undef, and mark for eventual
      // deletion.
#ifndef NDEBUG
      SmallSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be either in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *P = PoisonValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(P);
          }
        }
        V.removeInstructionsAndOperands(RdxOps);
      }
    } else if (!CheckForReusedReductionOps) {
      // No tree was built and no retry is requested: cache the analysis so
      // this root is not visited again.
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
17777
17778 private:
  /// Calculate the cost of a reduction.
  ///
  /// \param TTI cost model.
  /// \param ReducedVals the scalar values being reduced (one per lane of the
  /// would-be vector).
  /// \param IsCmpSelMinMax true if the reduction root is a cmp+select min/max
  /// idiom; such scalars need one extra use (cmp + select) to be considered
  /// fully consumed by the reduction.
  /// \param ReduxWidth number of lanes in the vectorized reduction.
  /// \param FMF intersected fast-math flags of all reduction operations.
  /// \returns VectorCost - ScalarCost; a negative value means the vector form
  /// is cheaper.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
                                   FastMathFlags FMF) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at the compile time.
    bool AllConsts = allConstant(ReducedVals);
    // Sums the cost of the scalar reduction ops that would disappear.
    // \p GenCostFn produces the generic per-op cost used when the real ops
    // cannot be costed individually.
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements (N values need N-1 combining
      // operations), hence the countdown that stops at the last value.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        // Too many uses: the scalar stays alive anyway, charge the generic op
        // cost only.
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        // NOTE: intentionally shadows the outer ScalarCost; this accumulator
        // is per reduced value.
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          // A user outside the reduction: individual costing is meaningless,
          // fall back to the generic cost below.
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts)
        VectorCost =
            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts)
        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
17863
17864 /// Emit a horizontal reduction of the vectorized value.
emitReduction(Value * VectorizedValue,IRBuilderBase & Builder,unsigned ReduxWidth,const TargetTransformInfo * TTI)17865 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17866 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17867 assert(VectorizedValue && "Need to have a vectorized tree node");
17868 assert(isPowerOf2_32(ReduxWidth) &&
17869 "We only handle power-of-two reductions for now");
17870 assert(RdxKind != RecurKind::FMulAdd &&
17871 "A call to the llvm.fmuladd intrinsic is not handled yet");
17872
17873 ++NumVectorInstructions;
17874 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17875 }
17876
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  ///
  /// Replaces a reduction of \p Cnt identical copies of \p VectorizedValue by
  /// a single (often cheaper) scalar operation, exploiting the identity of the
  /// reduction kind.
  /// \param VectorizedValue the value that is reduced with itself.
  /// \param Builder IR builder positioned at the emission point.
  /// \param Cnt number of times the value participates in the reduction.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    switch (RdxKind) {
    case RecurKind::Add: {
      // vv + vv + ... (Cnt times)  ==>  res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // xor of a value with itself cancels pairwise:
      // res = n % 2 == 0 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n (valid only under the reassoc FMF this path requires)
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // Idempotent reductions: combining a value with itself yields the value.
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::None:
      // These kinds are filtered out before IsSupportedHorRdxIdentityOp is set.
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
17928
/// Emits actual operation for the scalar identity values, found during
/// horizontal reduction analysis.
/// Applies a per-lane correction to \p VectorizedValue so that scalars that
/// occur multiple times in the reduction contribute the right amount.
/// \param VectorizedValue the vectorized reduction input.
/// \param Builder IR builder positioned at the emission point.
/// \param R the SLP tree, used to fetch the root scalars.
/// \param SameValuesCounter per-original-scalar repeat counts.
/// \param TrackedToOrig maps tracked (possibly replaced) values back to the
///        original scalars used as keys in \p SameValuesCounter.
Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                     BoUpSLP &R,
                     const MapVector<Value *, unsigned> &SameValuesCounter,
                     const DenseMap<Value *, Value *> &TrackedToOrig) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  ArrayRef<Value *> VL = R.getRootNodeScalars();
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  // If the element type was narrowed during vectorization, widen back to the
  // scalar type before applying the per-lane scaling below.
  if (VTy->getElementType() != VL.front()->getType()) {
    VectorizedValue = Builder.CreateIntCast(
        VectorizedValue,
        getWidenedType(VL.front()->getType(), VTy->getNumElements()),
        R.isSignedMinBitwidthRootNode());
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // Multiply each lane by its scalar's repeat count:
    // root = mul prev_root, <1, 1, n, 1>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    auto *Scale = ConstantVector::get(Vals);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No need for multiple or/and(s).
    LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // No need for multiple min/max(s) of the same value.
    LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Replace values with even number of repeats with 0, since
    // x xor x = 0.
    // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
    // 7>, when the 4th and 6th elements have an even number of repeats.
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
        PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      Value *V = VL[I];
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      if (Cnt % 2 == 0) {
        // Index VF selects a zero lane from the second shuffle operand.
        Mask[I] = VF;
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: Xor <";
               for (int I : Mask) dbgs() << I << " ";
               dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
    if (NeedShuffle)
      VectorizedValue = Builder.CreateShuffleVector(
          VectorizedValue,
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // Multiply each lane by its scalar's repeat count:
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    auto *Scale = ConstantVector::get(Vals);
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for reused scalars.");
  }
  return nullptr;
}
18025 };
18026 } // end anonymous namespace
18027
/// Gets recurrence kind from the specified value.
/// Thin file-local wrapper that delegates to the HorizontalReduction helper.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
getAggregateSize(Instruction * InsertInst)18032 static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18033 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
18034 return cast<FixedVectorType>(IE->getType())->getNumElements();
18035
18036 unsigned AggregateSize = 1;
18037 auto *IV = cast<InsertValueInst>(InsertInst);
18038 Type *CurrentType = IV->getType();
18039 do {
18040 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
18041 for (auto *Elt : ST->elements())
18042 if (Elt != ST->getElementType(0)) // check homogeneity
18043 return std::nullopt;
18044 AggregateSize *= ST->getNumElements();
18045 CurrentType = ST->getElementType(0);
18046 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
18047 AggregateSize *= AT->getNumElements();
18048 CurrentType = AT->getElementType();
18049 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
18050 AggregateSize *= VT->getNumElements();
18051 return AggregateSize;
18052 } else if (CurrentType->isSingleValueType()) {
18053 return AggregateSize;
18054 } else {
18055 return std::nullopt;
18056 }
18057 } while (true);
18058 }
18059
findBuildAggregate_rec(Instruction * LastInsertInst,TargetTransformInfo * TTI,SmallVectorImpl<Value * > & BuildVectorOpds,SmallVectorImpl<Value * > & InsertElts,unsigned OperandOffset)18060 static void findBuildAggregate_rec(Instruction *LastInsertInst,
18061 TargetTransformInfo *TTI,
18062 SmallVectorImpl<Value *> &BuildVectorOpds,
18063 SmallVectorImpl<Value *> &InsertElts,
18064 unsigned OperandOffset) {
18065 do {
18066 Value *InsertedOperand = LastInsertInst->getOperand(1);
18067 std::optional<unsigned> OperandIndex =
18068 getElementIndex(LastInsertInst, OperandOffset);
18069 if (!OperandIndex)
18070 return;
18071 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
18072 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
18073 BuildVectorOpds, InsertElts, *OperandIndex);
18074
18075 } else {
18076 BuildVectorOpds[*OperandIndex] = InsertedOperand;
18077 InsertElts[*OperandIndex] = LastInsertInst;
18078 }
18079 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
18080 } while (LastInsertInst != nullptr &&
18081 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
18082 LastInsertInst->hasOneUse());
18083 }
18084
18085 /// Recognize construction of vectors like
18086 /// %ra = insertelement <4 x float> poison, float %s0, i32 0
18087 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
18088 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
18089 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
18090 /// starting from the last insertelement or insertvalue instruction.
18091 ///
18092 /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18093 /// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18094 /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18095 ///
18096 /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18097 ///
18098 /// \return true if it matches.
findBuildAggregate(Instruction * LastInsertInst,TargetTransformInfo * TTI,SmallVectorImpl<Value * > & BuildVectorOpds,SmallVectorImpl<Value * > & InsertElts)18099 static bool findBuildAggregate(Instruction *LastInsertInst,
18100 TargetTransformInfo *TTI,
18101 SmallVectorImpl<Value *> &BuildVectorOpds,
18102 SmallVectorImpl<Value *> &InsertElts) {
18103
18104 assert((isa<InsertElementInst>(LastInsertInst) ||
18105 isa<InsertValueInst>(LastInsertInst)) &&
18106 "Expected insertelement or insertvalue instruction!");
18107
18108 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18109 "Expected empty result vectors!");
18110
18111 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
18112 if (!AggregateSize)
18113 return false;
18114 BuildVectorOpds.resize(*AggregateSize);
18115 InsertElts.resize(*AggregateSize);
18116
18117 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
18118 llvm::erase(BuildVectorOpds, nullptr);
18119 llvm::erase(InsertElts, nullptr);
18120 if (BuildVectorOpds.size() >= 2)
18121 return true;
18122
18123 return false;
18124 }
18125
18126 /// Try and get a reduction instruction from a phi node.
18127 ///
18128 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18129 /// if they come from either \p ParentBB or a containing loop latch.
18130 ///
18131 /// \returns A candidate reduction value if possible, or \code nullptr \endcode
18132 /// if not possible.
getReductionInstr(const DominatorTree * DT,PHINode * P,BasicBlock * ParentBB,LoopInfo * LI)18133 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18134 BasicBlock *ParentBB, LoopInfo *LI) {
18135 // There are situations where the reduction value is not dominated by the
18136 // reduction phi. Vectorizing such cases has been reported to cause
18137 // miscompiles. See PR25787.
18138 auto DominatedReduxValue = [&](Value *R) {
18139 return isa<Instruction>(R) &&
18140 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
18141 };
18142
18143 Instruction *Rdx = nullptr;
18144
18145 // Return the incoming value if it comes from the same BB as the phi node.
18146 if (P->getIncomingBlock(0) == ParentBB) {
18147 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18148 } else if (P->getIncomingBlock(1) == ParentBB) {
18149 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18150 }
18151
18152 if (Rdx && DominatedReduxValue(Rdx))
18153 return Rdx;
18154
18155 // Otherwise, check whether we have a loop latch to look at.
18156 Loop *BBL = LI->getLoopFor(ParentBB);
18157 if (!BBL)
18158 return nullptr;
18159 BasicBlock *BBLatch = BBL->getLoopLatch();
18160 if (!BBLatch)
18161 return nullptr;
18162
18163 // There is a loop latch, return the incoming value if it comes from
18164 // that. This reduction pattern occasionally turns up.
18165 if (P->getIncomingBlock(0) == BBLatch) {
18166 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
18167 } else if (P->getIncomingBlock(1) == BBLatch) {
18168 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
18169 }
18170
18171 if (Rdx && DominatedReduxValue(Rdx))
18172 return Rdx;
18173
18174 return nullptr;
18175 }
18176
matchRdxBop(Instruction * I,Value * & V0,Value * & V1)18177 static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18178 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
18179 return true;
18180 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
18181 return true;
18182 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
18183 return true;
18184 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
18185 return true;
18186 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
18187 return true;
18188 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
18189 return true;
18190 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
18191 return true;
18192 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
18193 return true;
18194 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
18195 return true;
18196 return false;
18197 }
18198
18199 /// We could have an initial reduction that is not an add.
18200 /// r *= v1 + v2 + v3 + v4
18201 /// In such a case start looking for a tree rooted in the first '+'.
18202 /// \Returns the new root if found, which may be nullptr if not an instruction.
tryGetSecondaryReductionRoot(PHINode * Phi,Instruction * Root)18203 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18204 Instruction *Root) {
18205 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18206 isa<IntrinsicInst>(Root)) &&
18207 "Expected binop, select, or intrinsic for reduction matching");
18208 Value *LHS =
18209 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
18210 Value *RHS =
18211 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
18212 if (LHS == Phi)
18213 return dyn_cast<Instruction>(RHS);
18214 if (RHS == Phi)
18215 return dyn_cast<Instruction>(LHS);
18216 return nullptr;
18217 }
18218
18219 /// \p Returns the first operand of \p I that does not match \p Phi. If
18220 /// operand is not an instruction it returns nullptr.
getNonPhiOperand(Instruction * I,PHINode * Phi)18221 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18222 Value *Op0 = nullptr;
18223 Value *Op1 = nullptr;
18224 if (!matchRdxBop(I, Op0, Op1))
18225 return nullptr;
18226 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
18227 }
18228
18229 /// \Returns true if \p I is a candidate instruction for reduction vectorization.
isReductionCandidate(Instruction * I)18230 static bool isReductionCandidate(Instruction *I) {
18231 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
18232 Value *B0 = nullptr, *B1 = nullptr;
18233 bool IsBinop = matchRdxBop(I, B0, B1);
18234 return IsBinop || IsSelect;
18235 }
18236
vectorizeHorReduction(PHINode * P,Instruction * Root,BasicBlock * BB,BoUpSLP & R,TargetTransformInfo * TTI,SmallVectorImpl<WeakTrackingVH> & PostponedInsts)18237 bool SLPVectorizerPass::vectorizeHorReduction(
18238 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
18239 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18240 if (!ShouldVectorizeHor)
18241 return false;
18242 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
18243
18244 if (Root->getParent() != BB || isa<PHINode>(Root))
18245 return false;
18246
18247 // If we can find a secondary reduction root, use that instead.
18248 auto SelectRoot = [&]() {
18249 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
18250 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
18251 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
18252 return NewRoot;
18253 return Root;
18254 };
18255
18256 // Start analysis starting from Root instruction. If horizontal reduction is
18257 // found, try to vectorize it. If it is not a horizontal reduction or
18258 // vectorization is not possible or not effective, and currently analyzed
18259 // instruction is a binary operation, try to vectorize the operands, using
18260 // pre-order DFS traversal order. If the operands were not vectorized, repeat
18261 // the same procedure considering each operand as a possible root of the
18262 // horizontal reduction.
18263 // Interrupt the process if the Root instruction itself was vectorized or all
18264 // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
18265 // If a horizintal reduction was not matched or vectorized we collect
18266 // instructions for possible later attempts for vectorization.
18267 std::queue<std::pair<Instruction *, unsigned>> Stack;
18268 Stack.emplace(SelectRoot(), 0);
18269 SmallPtrSet<Value *, 8> VisitedInstrs;
18270 bool Res = false;
18271 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18272 if (R.isAnalyzedReductionRoot(Inst))
18273 return nullptr;
18274 if (!isReductionCandidate(Inst))
18275 return nullptr;
18276 HorizontalReduction HorRdx;
18277 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
18278 return nullptr;
18279 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
18280 };
18281 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18282 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18283 FutureSeed = getNonPhiOperand(Root, P);
18284 if (!FutureSeed)
18285 return false;
18286 }
18287 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18288 // analysis is done separately.
18289 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18290 PostponedInsts.push_back(FutureSeed);
18291 return true;
18292 };
18293
18294 while (!Stack.empty()) {
18295 Instruction *Inst;
18296 unsigned Level;
18297 std::tie(Inst, Level) = Stack.front();
18298 Stack.pop();
18299 // Do not try to analyze instruction that has already been vectorized.
18300 // This may happen when we vectorize instruction operands on a previous
18301 // iteration while stack was populated before that happened.
18302 if (R.isDeleted(Inst))
18303 continue;
18304 if (Value *VectorizedV = TryToReduce(Inst)) {
18305 Res = true;
18306 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18307 // Try to find another reduction.
18308 Stack.emplace(I, Level);
18309 continue;
18310 }
18311 if (R.isDeleted(Inst))
18312 continue;
18313 } else {
18314 // We could not vectorize `Inst` so try to use it as a future seed.
18315 if (!TryAppendToPostponedInsts(Inst)) {
18316 assert(Stack.empty() && "Expected empty stack");
18317 break;
18318 }
18319 }
18320
18321 // Try to vectorize operands.
18322 // Continue analysis for the instruction from the same basic block only to
18323 // save compile time.
18324 if (++Level < RecursionMaxDepth)
18325 for (auto *Op : Inst->operand_values())
18326 if (VisitedInstrs.insert(Op).second)
18327 if (auto *I = dyn_cast<Instruction>(Op))
18328 // Do not try to vectorize CmpInst operands, this is done
18329 // separately.
18330 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18331 !R.isDeleted(I) && I->getParent() == BB)
18332 Stack.emplace(I, Level);
18333 }
18334 return Res;
18335 }
18336
vectorizeRootInstruction(PHINode * P,Instruction * Root,BasicBlock * BB,BoUpSLP & R,TargetTransformInfo * TTI)18337 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18338 BasicBlock *BB, BoUpSLP &R,
18339 TargetTransformInfo *TTI) {
18340 SmallVector<WeakTrackingVH> PostponedInsts;
18341 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18342 Res |= tryToVectorize(PostponedInsts, R);
18343 return Res;
18344 }
18345
tryToVectorize(ArrayRef<WeakTrackingVH> Insts,BoUpSLP & R)18346 bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18347 BoUpSLP &R) {
18348 bool Res = false;
18349 for (Value *V : Insts)
18350 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18351 Res |= tryToVectorize(Inst, R);
18352 return Res;
18353 }
18354
vectorizeInsertValueInst(InsertValueInst * IVI,BasicBlock * BB,BoUpSLP & R,bool MaxVFOnly)18355 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18356 BasicBlock *BB, BoUpSLP &R,
18357 bool MaxVFOnly) {
18358 if (!R.canMapToVector(IVI->getType()))
18359 return false;
18360
18361 SmallVector<Value *, 16> BuildVectorOpds;
18362 SmallVector<Value *, 16> BuildVectorInsts;
18363 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18364 return false;
18365
18366 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18367 R.getORE()->emit([&]() {
18368 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18369 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18370 "trying reduction first.";
18371 });
18372 return false;
18373 }
18374 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18375 // Aggregate value is unlikely to be processed in vector register.
18376 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
18377 }
18378
vectorizeInsertElementInst(InsertElementInst * IEI,BasicBlock * BB,BoUpSLP & R,bool MaxVFOnly)18379 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18380 BasicBlock *BB, BoUpSLP &R,
18381 bool MaxVFOnly) {
18382 SmallVector<Value *, 16> BuildVectorInsts;
18383 SmallVector<Value *, 16> BuildVectorOpds;
18384 SmallVector<int> Mask;
18385 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18386 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18387 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18388 return false;
18389
18390 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18391 R.getORE()->emit([&]() {
18392 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18393 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18394 "trying reduction first.";
18395 });
18396 return false;
18397 }
18398 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18399 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
18400 }
18401
/// Groups the values in \p Incoming by compatibility (same type/parent/operand
/// kinds, per \p AreCompatible) after sorting with \p Comparator, and feeds
/// each group to \p TryToVectorizeHelper. Groups too small for the maximal VF
/// are accumulated into a candidate list and retried with smaller VFs.
/// \returns true if anything was vectorized.
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    // Advance SameTypeIt past the run of values compatible with *IncIt,
    // collecting the live instructions into VL.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      // Keep only the survivors as candidates for the same-type retry.
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      // Too few elements for a full register: defer the group into the
      // shared candidate list, provided the types still match.
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        // Re-group the remaining candidates by compatibility and attempt
        // each sub-group without the maximal-VF restriction.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
18513
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than the
/// operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  // Order first by operand type.
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  // Normalize each predicate to the smaller of itself and its swapped form so
  // that swapped-predicate cmps compare as equivalent.
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  // If a cmp uses the swapped predicate, visit its operands in reverse so
  // that corresponding operands line up.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
18589
/// Attempts vectorization for the compare instructions in \p CmpInsts:
/// first reductions rooted at their operands, then each cmp individually,
/// and finally sorted groups of compatible cmps as lists.
/// \returns true if anything was vectorized.
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
        // Stop if the reduction erased the cmp itself.
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  // Collect the still-live cmps with vectorizable types.
  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(V->getType()))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
18649
/// Tries to vectorize the postponed buildvector-like sequences rooted at the
/// insertelement/insertvalue instructions collected in \p Instructions for
/// basic block \p BB. Each instruction goes through three passes: buildvector
/// matching restricted to the maximum VF, horizontal-reduction matching, and
/// finally unrestricted buildvector matching. Returns true if any change was
/// made. \p Instructions is cleared on exit.
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  // Walk bottom-up so the last insert of each buildvector chain is tried
  // first.
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |=
          vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    // Re-check deletion: pass1 may have vectorized (and thus erased) I.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
18688
/// Scans basic block \p BB and tries to vectorize chains rooted at PHI nodes,
/// horizontal reductions, and buildvector-like insert/compare sequences.
/// Returns true if any change was made to the IR.
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Allows better to identify the chains that can be vectorized in the
  // better way.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  // Strict-weak-ordering comparator that sorts PHIs so likely-compatible ones
  // become adjacent: by type ID, then by number of collected non-phi operands,
  // then element-wise (instructions first, then non-undef constants, then
  // other values by value ID, undefs last).
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers, we don't care about other type.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          // Order instructions in different blocks by dominator-tree DFS-in
          // number; within the same block fall back to opcode comparison when
          // they cannot form a single (non-alternate) vectorizable group.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode() && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
      continue;
    }
    return false;
  };
  // Equivalence-like predicate: two PHIs may be vectorized as one group iff
  // they have the same type and pairwise-compatible collected operands (same
  // opcode in the same block, both constants, same value ID, or undef, which
  // matches anything).
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  // Repeat while progress is made: vectorizing one PHI group may expose more.
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      // PHIs are grouped at the block start, so stop at the first non-PHI.
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      // Already computed on a previous iteration.
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      // Walk transitive PHI operands; collect only the non-phi leaves.
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    // Drop the cached operand lists if any cached PHI was erased by
    // vectorization, so stale entries are not reused on the next iteration.
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction without users, like terminator, or
  // function call with ignored return value, store. Ignore unused instructions
  // (basing on instruction type, except for CallInst and InvokeInst).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The num of elements is unknown at
    // compile-time for scalable type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res = vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
          Changed |= Res;
          // Restart only if this PHI itself got deleted; the iterator would
          // otherwise remain valid.
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        // TODO: This is just a temporarily solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    // Defer inserts and compares; they are handled in bulk later by
    // VectorizeInsertsAndCmps.
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
19008
/// Tries to vectorize the index computations of the getelementptr lists
/// collected in GEPs. Only the single, non-constant index operand of each GEP
/// is bundled and vectorized; the GEPs themselves are left scalar. Returns
/// true if any change was made.
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    // Pick the first non-deleted GEP as a representative for sizing.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set a candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them or their index is optimized to constant value.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            // Pointers differ by a constant: each is trivially derivable
            // from the other, so drop both.
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            // Duplicate index value: keep only the first occurrence.
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
19108
/// Attempts to sort and vectorize each group of related stores collected in
/// the Stores map (grouped by underlying pointer object). Returns true if any
/// store chain was vectorized.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  // Strict-weak-ordering comparator: type IDs first, then value-operand kind
  // (instructions ordered by dominance/opcode, then constants, then others by
  // value ID). Undef value operands compare equal to everything.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // Same opcode => equivalent for sorting purposes; otherwise order by
        // opcode.
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Equivalence-like predicate paired with StoreSorter: two stores may be
  // grouped iff types match and value operands are compatible (same opcode,
  // both constants, same value ID, or undef which matches anything).
  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  // Keys describing store groups already tried, shared across all groups so
  // vectorizeStores() can skip repeated attempts (the exact key contents are
  // defined by vectorizeStores — presumably pointer/value identities plus a
  // size; confirm there if it matters).
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in this case need
    // to follow the stores order (reversed to meet the memory dependecies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
19207