1ac9a064cSDimitry Andric //===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===//
2ac9a064cSDimitry Andric //
3ac9a064cSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4ac9a064cSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5ac9a064cSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6ac9a064cSDimitry Andric //
7ac9a064cSDimitry Andric //===----------------------------------------------------------------------===//
8ac9a064cSDimitry Andric //
9ac9a064cSDimitry Andric // This pass implements a pass that recognizes certain loop idioms and
10ac9a064cSDimitry Andric // transforms them into more optimized versions of the same loop. In cases
11ac9a064cSDimitry Andric // where this happens, it can be a significant performance win.
12ac9a064cSDimitry Andric //
13ac9a064cSDimitry Andric // We currently only recognize one loop that finds the first mismatched byte
14ac9a064cSDimitry Andric // in an array and returns the index, i.e. something like:
15ac9a064cSDimitry Andric //
16ac9a064cSDimitry Andric // while (++i != n) {
17ac9a064cSDimitry Andric // if (a[i] != b[i])
18ac9a064cSDimitry Andric // break;
19ac9a064cSDimitry Andric // }
20ac9a064cSDimitry Andric //
21ac9a064cSDimitry Andric // In this example we can actually vectorize the loop despite the early exit,
22ac9a064cSDimitry Andric // although the loop vectorizer does not support it. It requires some extra
23ac9a064cSDimitry Andric // checks to deal with the possibility of faulting loads when crossing page
24ac9a064cSDimitry Andric // boundaries. However, even with these checks it is still profitable to do the
25ac9a064cSDimitry Andric // transformation.
26ac9a064cSDimitry Andric //
27ac9a064cSDimitry Andric //===----------------------------------------------------------------------===//
28ac9a064cSDimitry Andric //
29ac9a064cSDimitry Andric // NOTE: This Pass matches a really specific loop pattern because it's only
30ac9a064cSDimitry Andric // supposed to be a temporary solution until our LoopVectorizer is powerful
31ac9a064cSDimitry Andric // enought to vectorize it automatically.
32ac9a064cSDimitry Andric //
33ac9a064cSDimitry Andric // TODO List:
34ac9a064cSDimitry Andric //
35ac9a064cSDimitry Andric // * Add support for the inverse case where we scan for a matching element.
36ac9a064cSDimitry Andric // * Permit 64-bit induction variable types.
37ac9a064cSDimitry Andric // * Recognize loops that increment the IV *after* comparing bytes.
38ac9a064cSDimitry Andric // * Allow 32-bit sign-extends of the IV used by the GEP.
39ac9a064cSDimitry Andric //
40ac9a064cSDimitry Andric //===----------------------------------------------------------------------===//
41ac9a064cSDimitry Andric
42ac9a064cSDimitry Andric #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
43ac9a064cSDimitry Andric #include "llvm/Analysis/DomTreeUpdater.h"
44ac9a064cSDimitry Andric #include "llvm/Analysis/LoopPass.h"
45ac9a064cSDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
46ac9a064cSDimitry Andric #include "llvm/IR/Dominators.h"
47ac9a064cSDimitry Andric #include "llvm/IR/IRBuilder.h"
48ac9a064cSDimitry Andric #include "llvm/IR/Intrinsics.h"
49ac9a064cSDimitry Andric #include "llvm/IR/MDBuilder.h"
50ac9a064cSDimitry Andric #include "llvm/IR/PatternMatch.h"
51ac9a064cSDimitry Andric #include "llvm/Transforms/Utils/BasicBlockUtils.h"
52ac9a064cSDimitry Andric
53ac9a064cSDimitry Andric using namespace llvm;
54ac9a064cSDimitry Andric using namespace PatternMatch;
55ac9a064cSDimitry Andric
56ac9a064cSDimitry Andric #define DEBUG_TYPE "loop-idiom-vectorize"
57ac9a064cSDimitry Andric
58ac9a064cSDimitry Andric static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
59ac9a064cSDimitry Andric cl::init(false),
60ac9a064cSDimitry Andric cl::desc("Disable Loop Idiom Vectorize Pass."));
61ac9a064cSDimitry Andric
62ac9a064cSDimitry Andric static cl::opt<LoopIdiomVectorizeStyle>
63ac9a064cSDimitry Andric LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
64ac9a064cSDimitry Andric cl::desc("The vectorization style for loop idiom transform."),
65ac9a064cSDimitry Andric cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked",
66ac9a064cSDimitry Andric "Use masked vector intrinsics"),
67ac9a064cSDimitry Andric clEnumValN(LoopIdiomVectorizeStyle::Predicated,
68ac9a064cSDimitry Andric "predicated", "Use VP intrinsics")),
69ac9a064cSDimitry Andric cl::init(LoopIdiomVectorizeStyle::Masked));
70ac9a064cSDimitry Andric
71ac9a064cSDimitry Andric static cl::opt<bool>
72ac9a064cSDimitry Andric DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
73ac9a064cSDimitry Andric cl::init(false),
74ac9a064cSDimitry Andric cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
75ac9a064cSDimitry Andric "not convert byte-compare loop(s)."));
76ac9a064cSDimitry Andric
77ac9a064cSDimitry Andric static cl::opt<unsigned>
78ac9a064cSDimitry Andric ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
79ac9a064cSDimitry Andric cl::desc("The vectorization factor for byte-compare patterns."),
80ac9a064cSDimitry Andric cl::init(16));
81ac9a064cSDimitry Andric
82ac9a064cSDimitry Andric static cl::opt<bool>
83ac9a064cSDimitry Andric VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
84ac9a064cSDimitry Andric cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
85ac9a064cSDimitry Andric
86ac9a064cSDimitry Andric namespace {
87ac9a064cSDimitry Andric class LoopIdiomVectorize {
88ac9a064cSDimitry Andric LoopIdiomVectorizeStyle VectorizeStyle;
89ac9a064cSDimitry Andric unsigned ByteCompareVF;
90ac9a064cSDimitry Andric Loop *CurLoop = nullptr;
91ac9a064cSDimitry Andric DominatorTree *DT;
92ac9a064cSDimitry Andric LoopInfo *LI;
93ac9a064cSDimitry Andric const TargetTransformInfo *TTI;
94ac9a064cSDimitry Andric const DataLayout *DL;
95ac9a064cSDimitry Andric
96ac9a064cSDimitry Andric // Blocks that will be used for inserting vectorized code.
97ac9a064cSDimitry Andric BasicBlock *EndBlock = nullptr;
98ac9a064cSDimitry Andric BasicBlock *VectorLoopPreheaderBlock = nullptr;
99ac9a064cSDimitry Andric BasicBlock *VectorLoopStartBlock = nullptr;
100ac9a064cSDimitry Andric BasicBlock *VectorLoopMismatchBlock = nullptr;
101ac9a064cSDimitry Andric BasicBlock *VectorLoopIncBlock = nullptr;
102ac9a064cSDimitry Andric
103ac9a064cSDimitry Andric public:
LoopIdiomVectorize(LoopIdiomVectorizeStyle S,unsigned VF,DominatorTree * DT,LoopInfo * LI,const TargetTransformInfo * TTI,const DataLayout * DL)104ac9a064cSDimitry Andric LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
105ac9a064cSDimitry Andric LoopInfo *LI, const TargetTransformInfo *TTI,
106ac9a064cSDimitry Andric const DataLayout *DL)
107ac9a064cSDimitry Andric : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
108ac9a064cSDimitry Andric }
109ac9a064cSDimitry Andric
110ac9a064cSDimitry Andric bool run(Loop *L);
111ac9a064cSDimitry Andric
112ac9a064cSDimitry Andric private:
113ac9a064cSDimitry Andric /// \name Countable Loop Idiom Handling
114ac9a064cSDimitry Andric /// @{
115ac9a064cSDimitry Andric
116ac9a064cSDimitry Andric bool runOnCountableLoop();
117ac9a064cSDimitry Andric bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
118ac9a064cSDimitry Andric SmallVectorImpl<BasicBlock *> &ExitBlocks);
119ac9a064cSDimitry Andric
120ac9a064cSDimitry Andric bool recognizeByteCompare();
121ac9a064cSDimitry Andric
122ac9a064cSDimitry Andric Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
123ac9a064cSDimitry Andric GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
124ac9a064cSDimitry Andric Instruction *Index, Value *Start, Value *MaxLen);
125ac9a064cSDimitry Andric
126ac9a064cSDimitry Andric Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
127ac9a064cSDimitry Andric GetElementPtrInst *GEPA,
128ac9a064cSDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart,
129ac9a064cSDimitry Andric Value *ExtEnd);
130ac9a064cSDimitry Andric Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
131ac9a064cSDimitry Andric GetElementPtrInst *GEPA,
132ac9a064cSDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart,
133ac9a064cSDimitry Andric Value *ExtEnd);
134ac9a064cSDimitry Andric
135ac9a064cSDimitry Andric void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
136ac9a064cSDimitry Andric PHINode *IndPhi, Value *MaxLen, Instruction *Index,
137ac9a064cSDimitry Andric Value *Start, bool IncIdx, BasicBlock *FoundBB,
138ac9a064cSDimitry Andric BasicBlock *EndBB);
139ac9a064cSDimitry Andric /// @}
140ac9a064cSDimitry Andric };
141ac9a064cSDimitry Andric } // anonymous namespace
142ac9a064cSDimitry Andric
run(Loop & L,LoopAnalysisManager & AM,LoopStandardAnalysisResults & AR,LPMUpdater &)143ac9a064cSDimitry Andric PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM,
144ac9a064cSDimitry Andric LoopStandardAnalysisResults &AR,
145ac9a064cSDimitry Andric LPMUpdater &) {
146ac9a064cSDimitry Andric if (DisableAll)
147ac9a064cSDimitry Andric return PreservedAnalyses::all();
148ac9a064cSDimitry Andric
149ac9a064cSDimitry Andric const auto *DL = &L.getHeader()->getDataLayout();
150ac9a064cSDimitry Andric
151ac9a064cSDimitry Andric LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
152ac9a064cSDimitry Andric if (LITVecStyle.getNumOccurrences())
153ac9a064cSDimitry Andric VecStyle = LITVecStyle;
154ac9a064cSDimitry Andric
155ac9a064cSDimitry Andric unsigned BCVF = ByteCompareVF;
156ac9a064cSDimitry Andric if (ByteCmpVF.getNumOccurrences())
157ac9a064cSDimitry Andric BCVF = ByteCmpVF;
158ac9a064cSDimitry Andric
159ac9a064cSDimitry Andric LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
160ac9a064cSDimitry Andric if (!LIV.run(&L))
161ac9a064cSDimitry Andric return PreservedAnalyses::all();
162ac9a064cSDimitry Andric
163ac9a064cSDimitry Andric return PreservedAnalyses::none();
164ac9a064cSDimitry Andric }
165ac9a064cSDimitry Andric
166ac9a064cSDimitry Andric //===----------------------------------------------------------------------===//
167ac9a064cSDimitry Andric //
168ac9a064cSDimitry Andric // Implementation of LoopIdiomVectorize
169ac9a064cSDimitry Andric //
170ac9a064cSDimitry Andric //===----------------------------------------------------------------------===//
171ac9a064cSDimitry Andric
run(Loop * L)172ac9a064cSDimitry Andric bool LoopIdiomVectorize::run(Loop *L) {
173ac9a064cSDimitry Andric CurLoop = L;
174ac9a064cSDimitry Andric
175ac9a064cSDimitry Andric Function &F = *L->getHeader()->getParent();
176ac9a064cSDimitry Andric if (DisableAll || F.hasOptSize())
177ac9a064cSDimitry Andric return false;
178ac9a064cSDimitry Andric
179ac9a064cSDimitry Andric if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
180ac9a064cSDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName()
181ac9a064cSDimitry Andric << " due to its NoImplicitFloat attribute");
182ac9a064cSDimitry Andric return false;
183ac9a064cSDimitry Andric }
184ac9a064cSDimitry Andric
185ac9a064cSDimitry Andric // If the loop could not be converted to canonical form, it must have an
186ac9a064cSDimitry Andric // indirectbr in it, just give up.
187ac9a064cSDimitry Andric if (!L->getLoopPreheader())
188ac9a064cSDimitry Andric return false;
189ac9a064cSDimitry Andric
190ac9a064cSDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
191ac9a064cSDimitry Andric << CurLoop->getHeader()->getName() << "\n");
192ac9a064cSDimitry Andric
193ac9a064cSDimitry Andric return recognizeByteCompare();
194ac9a064cSDimitry Andric }
195ac9a064cSDimitry Andric
recognizeByteCompare()196ac9a064cSDimitry Andric bool LoopIdiomVectorize::recognizeByteCompare() {
197ac9a064cSDimitry Andric // Currently the transformation only works on scalable vector types, although
198ac9a064cSDimitry Andric // there is no fundamental reason why it cannot be made to work for fixed
199ac9a064cSDimitry Andric // width too.
200ac9a064cSDimitry Andric
201ac9a064cSDimitry Andric // We also need to know the minimum page size for the target in order to
202ac9a064cSDimitry Andric // generate runtime memory checks to ensure the vector version won't fault.
203ac9a064cSDimitry Andric if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
204ac9a064cSDimitry Andric DisableByteCmp)
205ac9a064cSDimitry Andric return false;
206ac9a064cSDimitry Andric
207ac9a064cSDimitry Andric BasicBlock *Header = CurLoop->getHeader();
208ac9a064cSDimitry Andric
209ac9a064cSDimitry Andric // In LoopIdiomVectorize::run we have already checked that the loop
210ac9a064cSDimitry Andric // has a preheader so we can assume it's in a canonical form.
211ac9a064cSDimitry Andric if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
212ac9a064cSDimitry Andric return false;
213ac9a064cSDimitry Andric
214ac9a064cSDimitry Andric PHINode *PN = dyn_cast<PHINode>(&Header->front());
215ac9a064cSDimitry Andric if (!PN || PN->getNumIncomingValues() != 2)
216ac9a064cSDimitry Andric return false;
217ac9a064cSDimitry Andric
218ac9a064cSDimitry Andric auto LoopBlocks = CurLoop->getBlocks();
219ac9a064cSDimitry Andric // The first block in the loop should contain only 4 instructions, e.g.
220ac9a064cSDimitry Andric //
221ac9a064cSDimitry Andric // while.cond:
222ac9a064cSDimitry Andric // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ]
223ac9a064cSDimitry Andric // %inc = add i32 %res.phi, 1
224ac9a064cSDimitry Andric // %cmp.not = icmp eq i32 %inc, %n
225ac9a064cSDimitry Andric // br i1 %cmp.not, label %while.end, label %while.body
226ac9a064cSDimitry Andric //
227ac9a064cSDimitry Andric if (LoopBlocks[0]->sizeWithoutDebug() > 4)
228ac9a064cSDimitry Andric return false;
229ac9a064cSDimitry Andric
230ac9a064cSDimitry Andric // The second block should contain 7 instructions, e.g.
231ac9a064cSDimitry Andric //
232ac9a064cSDimitry Andric // while.body:
233ac9a064cSDimitry Andric // %idx = zext i32 %inc to i64
234ac9a064cSDimitry Andric // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx
235ac9a064cSDimitry Andric // %load.a = load i8, ptr %idx.a
236ac9a064cSDimitry Andric // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx
237ac9a064cSDimitry Andric // %load.b = load i8, ptr %idx.b
238ac9a064cSDimitry Andric // %cmp.not.ld = icmp eq i8 %load.a, %load.b
239ac9a064cSDimitry Andric // br i1 %cmp.not.ld, label %while.cond, label %while.end
240ac9a064cSDimitry Andric //
241ac9a064cSDimitry Andric if (LoopBlocks[1]->sizeWithoutDebug() > 7)
242ac9a064cSDimitry Andric return false;
243ac9a064cSDimitry Andric
244ac9a064cSDimitry Andric // The incoming value to the PHI node from the loop should be an add of 1.
245ac9a064cSDimitry Andric Value *StartIdx = nullptr;
246ac9a064cSDimitry Andric Instruction *Index = nullptr;
247ac9a064cSDimitry Andric if (!CurLoop->contains(PN->getIncomingBlock(0))) {
248ac9a064cSDimitry Andric StartIdx = PN->getIncomingValue(0);
249ac9a064cSDimitry Andric Index = dyn_cast<Instruction>(PN->getIncomingValue(1));
250ac9a064cSDimitry Andric } else {
251ac9a064cSDimitry Andric StartIdx = PN->getIncomingValue(1);
252ac9a064cSDimitry Andric Index = dyn_cast<Instruction>(PN->getIncomingValue(0));
253ac9a064cSDimitry Andric }
254ac9a064cSDimitry Andric
255ac9a064cSDimitry Andric // Limit to 32-bit types for now
256ac9a064cSDimitry Andric if (!Index || !Index->getType()->isIntegerTy(32) ||
257ac9a064cSDimitry Andric !match(Index, m_c_Add(m_Specific(PN), m_One())))
258ac9a064cSDimitry Andric return false;
259ac9a064cSDimitry Andric
260ac9a064cSDimitry Andric // If we match the pattern, PN and Index will be replaced with the result of
261ac9a064cSDimitry Andric // the cttz.elts intrinsic. If any other instructions are used outside of
262ac9a064cSDimitry Andric // the loop, we cannot replace it.
263ac9a064cSDimitry Andric for (BasicBlock *BB : LoopBlocks)
264ac9a064cSDimitry Andric for (Instruction &I : *BB)
265ac9a064cSDimitry Andric if (&I != PN && &I != Index)
266ac9a064cSDimitry Andric for (User *U : I.users())
267ac9a064cSDimitry Andric if (!CurLoop->contains(cast<Instruction>(U)))
268ac9a064cSDimitry Andric return false;
269ac9a064cSDimitry Andric
270ac9a064cSDimitry Andric // Match the branch instruction for the header
271ac9a064cSDimitry Andric ICmpInst::Predicate Pred;
272ac9a064cSDimitry Andric Value *MaxLen;
273ac9a064cSDimitry Andric BasicBlock *EndBB, *WhileBB;
274ac9a064cSDimitry Andric if (!match(Header->getTerminator(),
275ac9a064cSDimitry Andric m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)),
276ac9a064cSDimitry Andric m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
277ac9a064cSDimitry Andric Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB))
278ac9a064cSDimitry Andric return false;
279ac9a064cSDimitry Andric
280ac9a064cSDimitry Andric // WhileBB should contain the pattern of load & compare instructions. Match
281ac9a064cSDimitry Andric // the pattern and find the GEP instructions used by the loads.
282ac9a064cSDimitry Andric ICmpInst::Predicate WhilePred;
283ac9a064cSDimitry Andric BasicBlock *FoundBB;
284ac9a064cSDimitry Andric BasicBlock *TrueBB;
285ac9a064cSDimitry Andric Value *LoadA, *LoadB;
286ac9a064cSDimitry Andric if (!match(WhileBB->getTerminator(),
287ac9a064cSDimitry Andric m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)),
288ac9a064cSDimitry Andric m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
289ac9a064cSDimitry Andric WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB))
290ac9a064cSDimitry Andric return false;
291ac9a064cSDimitry Andric
292ac9a064cSDimitry Andric Value *A, *B;
293ac9a064cSDimitry Andric if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
294ac9a064cSDimitry Andric return false;
295ac9a064cSDimitry Andric
296ac9a064cSDimitry Andric LoadInst *LoadAI = cast<LoadInst>(LoadA);
297ac9a064cSDimitry Andric LoadInst *LoadBI = cast<LoadInst>(LoadB);
298ac9a064cSDimitry Andric if (!LoadAI->isSimple() || !LoadBI->isSimple())
299ac9a064cSDimitry Andric return false;
300ac9a064cSDimitry Andric
301ac9a064cSDimitry Andric GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A);
302ac9a064cSDimitry Andric GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B);
303ac9a064cSDimitry Andric
304ac9a064cSDimitry Andric if (!GEPA || !GEPB)
305ac9a064cSDimitry Andric return false;
306ac9a064cSDimitry Andric
307ac9a064cSDimitry Andric Value *PtrA = GEPA->getPointerOperand();
308ac9a064cSDimitry Andric Value *PtrB = GEPB->getPointerOperand();
309ac9a064cSDimitry Andric
310ac9a064cSDimitry Andric // Check we are loading i8 values from two loop invariant pointers
311ac9a064cSDimitry Andric if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) ||
312ac9a064cSDimitry Andric !GEPA->getResultElementType()->isIntegerTy(8) ||
313ac9a064cSDimitry Andric !GEPB->getResultElementType()->isIntegerTy(8) ||
314ac9a064cSDimitry Andric !LoadAI->getType()->isIntegerTy(8) ||
315ac9a064cSDimitry Andric !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB)
316ac9a064cSDimitry Andric return false;
317ac9a064cSDimitry Andric
318ac9a064cSDimitry Andric // Check that the index to the GEPs is the index we found earlier
319ac9a064cSDimitry Andric if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1)
320ac9a064cSDimitry Andric return false;
321ac9a064cSDimitry Andric
322ac9a064cSDimitry Andric Value *IdxA = GEPA->getOperand(GEPA->getNumIndices());
323ac9a064cSDimitry Andric Value *IdxB = GEPB->getOperand(GEPB->getNumIndices());
324ac9a064cSDimitry Andric if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index))))
325ac9a064cSDimitry Andric return false;
326ac9a064cSDimitry Andric
327ac9a064cSDimitry Andric // We only ever expect the pre-incremented index value to be used inside the
328ac9a064cSDimitry Andric // loop.
329ac9a064cSDimitry Andric if (!PN->hasOneUse())
330ac9a064cSDimitry Andric return false;
331ac9a064cSDimitry Andric
332ac9a064cSDimitry Andric // Ensure that when the Found and End blocks are identical the PHIs have the
333ac9a064cSDimitry Andric // supported format. We don't currently allow cases like this:
334ac9a064cSDimitry Andric // while.cond:
335ac9a064cSDimitry Andric // ...
336ac9a064cSDimitry Andric // br i1 %cmp.not, label %while.end, label %while.body
337ac9a064cSDimitry Andric //
338ac9a064cSDimitry Andric // while.body:
339ac9a064cSDimitry Andric // ...
340ac9a064cSDimitry Andric // br i1 %cmp.not2, label %while.cond, label %while.end
341ac9a064cSDimitry Andric //
342ac9a064cSDimitry Andric // while.end:
343ac9a064cSDimitry Andric // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
344ac9a064cSDimitry Andric //
345ac9a064cSDimitry Andric // Where the incoming values for %final_ptr are unique and from each of the
346ac9a064cSDimitry Andric // loop blocks, but not actually defined in the loop. This requires extra
347ac9a064cSDimitry Andric // work setting up the byte.compare block, i.e. by introducing a select to
348ac9a064cSDimitry Andric // choose the correct value.
349ac9a064cSDimitry Andric // TODO: We could add support for this in future.
350ac9a064cSDimitry Andric if (FoundBB == EndBB) {
351ac9a064cSDimitry Andric for (PHINode &EndPN : EndBB->phis()) {
352ac9a064cSDimitry Andric Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header);
353ac9a064cSDimitry Andric Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB);
354ac9a064cSDimitry Andric
355ac9a064cSDimitry Andric // The value of the index when leaving the while.cond block is always the
356ac9a064cSDimitry Andric // same as the end value (MaxLen) so we permit either. The value when
357ac9a064cSDimitry Andric // leaving the while.body block should only be the index. Otherwise for
358ac9a064cSDimitry Andric // any other values we only allow ones that are same for both blocks.
359ac9a064cSDimitry Andric if (WhileCondVal != WhileBodyVal &&
360ac9a064cSDimitry Andric ((WhileCondVal != Index && WhileCondVal != MaxLen) ||
361ac9a064cSDimitry Andric (WhileBodyVal != Index)))
362ac9a064cSDimitry Andric return false;
363ac9a064cSDimitry Andric }
364ac9a064cSDimitry Andric }
365ac9a064cSDimitry Andric
366ac9a064cSDimitry Andric LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
367ac9a064cSDimitry Andric << *(EndBB->getParent()) << "\n\n");
368ac9a064cSDimitry Andric
369ac9a064cSDimitry Andric // The index is incremented before the GEP/Load pair so we need to
370ac9a064cSDimitry Andric // add 1 to the start value.
371ac9a064cSDimitry Andric transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true,
372ac9a064cSDimitry Andric FoundBB, EndBB);
373ac9a064cSDimitry Andric return true;
374ac9a064cSDimitry Andric }
375ac9a064cSDimitry Andric
createMaskedFindMismatch(IRBuilder<> & Builder,DomTreeUpdater & DTU,GetElementPtrInst * GEPA,GetElementPtrInst * GEPB,Value * ExtStart,Value * ExtEnd)376ac9a064cSDimitry Andric Value *LoopIdiomVectorize::createMaskedFindMismatch(
377ac9a064cSDimitry Andric IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
378ac9a064cSDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
379ac9a064cSDimitry Andric Type *I64Type = Builder.getInt64Ty();
380ac9a064cSDimitry Andric Type *ResType = Builder.getInt32Ty();
381ac9a064cSDimitry Andric Type *LoadType = Builder.getInt8Ty();
382ac9a064cSDimitry Andric Value *PtrA = GEPA->getPointerOperand();
383ac9a064cSDimitry Andric Value *PtrB = GEPB->getPointerOperand();
384ac9a064cSDimitry Andric
385ac9a064cSDimitry Andric ScalableVectorType *PredVTy =
386ac9a064cSDimitry Andric ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
387ac9a064cSDimitry Andric
388ac9a064cSDimitry Andric Value *InitialPred = Builder.CreateIntrinsic(
389ac9a064cSDimitry Andric Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
390ac9a064cSDimitry Andric
391ac9a064cSDimitry Andric Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
392ac9a064cSDimitry Andric VecLen =
393ac9a064cSDimitry Andric Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
394ac9a064cSDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true);
395ac9a064cSDimitry Andric
396ac9a064cSDimitry Andric Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
397ac9a064cSDimitry Andric Builder.getInt1(false));
398ac9a064cSDimitry Andric
399ac9a064cSDimitry Andric BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
400ac9a064cSDimitry Andric Builder.Insert(JumpToVectorLoop);
401ac9a064cSDimitry Andric
402ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
403ac9a064cSDimitry Andric VectorLoopStartBlock}});
404ac9a064cSDimitry Andric
405ac9a064cSDimitry Andric // Set up the first vector loop block by creating the PHIs, doing the vector
406ac9a064cSDimitry Andric // loads and comparing the vectors.
407ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopStartBlock);
408ac9a064cSDimitry Andric PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
409ac9a064cSDimitry Andric LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
410ac9a064cSDimitry Andric PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
411ac9a064cSDimitry Andric VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
412ac9a064cSDimitry Andric Type *VectorLoadType =
413ac9a064cSDimitry Andric ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
414ac9a064cSDimitry Andric Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
415ac9a064cSDimitry Andric
416ac9a064cSDimitry Andric Value *VectorLhsGep =
417ac9a064cSDimitry Andric Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
418ac9a064cSDimitry Andric Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
419ac9a064cSDimitry Andric Align(1), LoopPred, Passthru);
420ac9a064cSDimitry Andric
421ac9a064cSDimitry Andric Value *VectorRhsGep =
422ac9a064cSDimitry Andric Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
423ac9a064cSDimitry Andric Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
424ac9a064cSDimitry Andric Align(1), LoopPred, Passthru);
425ac9a064cSDimitry Andric
426ac9a064cSDimitry Andric Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
427ac9a064cSDimitry Andric VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
428ac9a064cSDimitry Andric Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
429ac9a064cSDimitry Andric BranchInst *VectorEarlyExit = BranchInst::Create(
430ac9a064cSDimitry Andric VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
431ac9a064cSDimitry Andric Builder.Insert(VectorEarlyExit);
432ac9a064cSDimitry Andric
433ac9a064cSDimitry Andric DTU.applyUpdates(
434ac9a064cSDimitry Andric {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
435ac9a064cSDimitry Andric {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
436ac9a064cSDimitry Andric
437ac9a064cSDimitry Andric // Increment the index counter and calculate the predicate for the next
438ac9a064cSDimitry Andric // iteration of the loop. We branch back to the start of the loop if there
439ac9a064cSDimitry Andric // is at least one active lane.
440ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopIncBlock);
441ac9a064cSDimitry Andric Value *NewVectorIndexPhi =
442ac9a064cSDimitry Andric Builder.CreateAdd(VectorIndexPhi, VecLen, "",
443ac9a064cSDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true);
444ac9a064cSDimitry Andric VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
445ac9a064cSDimitry Andric Value *NewPred =
446ac9a064cSDimitry Andric Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
447ac9a064cSDimitry Andric {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
448ac9a064cSDimitry Andric LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
449ac9a064cSDimitry Andric
450ac9a064cSDimitry Andric Value *PredHasActiveLanes =
451ac9a064cSDimitry Andric Builder.CreateExtractElement(NewPred, uint64_t(0));
452ac9a064cSDimitry Andric BranchInst *VectorLoopBranchBack =
453ac9a064cSDimitry Andric BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
454ac9a064cSDimitry Andric Builder.Insert(VectorLoopBranchBack);
455ac9a064cSDimitry Andric
456ac9a064cSDimitry Andric DTU.applyUpdates(
457ac9a064cSDimitry Andric {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
458ac9a064cSDimitry Andric {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
459ac9a064cSDimitry Andric
460ac9a064cSDimitry Andric // If we found a mismatch then we need to calculate which lane in the vector
461ac9a064cSDimitry Andric // had a mismatch and add that on to the current loop index.
462ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopMismatchBlock);
463ac9a064cSDimitry Andric PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
464ac9a064cSDimitry Andric FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
465ac9a064cSDimitry Andric PHINode *LastLoopPred =
466ac9a064cSDimitry Andric Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
467ac9a064cSDimitry Andric LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
468ac9a064cSDimitry Andric PHINode *VectorFoundIndex =
469ac9a064cSDimitry Andric Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
470ac9a064cSDimitry Andric VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
471ac9a064cSDimitry Andric
472ac9a064cSDimitry Andric Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
473ac9a064cSDimitry Andric Value *Ctz = Builder.CreateIntrinsic(
474ac9a064cSDimitry Andric Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
475ac9a064cSDimitry Andric {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
476ac9a064cSDimitry Andric Ctz = Builder.CreateZExt(Ctz, I64Type);
477ac9a064cSDimitry Andric Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
478ac9a064cSDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true);
479ac9a064cSDimitry Andric return Builder.CreateTrunc(VectorLoopRes64, ResType);
480ac9a064cSDimitry Andric }
481ac9a064cSDimitry Andric
createPredicatedFindMismatch(IRBuilder<> & Builder,DomTreeUpdater & DTU,GetElementPtrInst * GEPA,GetElementPtrInst * GEPB,Value * ExtStart,Value * ExtEnd)482ac9a064cSDimitry Andric Value *LoopIdiomVectorize::createPredicatedFindMismatch(
483ac9a064cSDimitry Andric IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
484ac9a064cSDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
485ac9a064cSDimitry Andric Type *I64Type = Builder.getInt64Ty();
486ac9a064cSDimitry Andric Type *I32Type = Builder.getInt32Ty();
487ac9a064cSDimitry Andric Type *ResType = I32Type;
488ac9a064cSDimitry Andric Type *LoadType = Builder.getInt8Ty();
489ac9a064cSDimitry Andric Value *PtrA = GEPA->getPointerOperand();
490ac9a064cSDimitry Andric Value *PtrB = GEPB->getPointerOperand();
491ac9a064cSDimitry Andric
492ac9a064cSDimitry Andric auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
493ac9a064cSDimitry Andric Builder.Insert(JumpToVectorLoop);
494ac9a064cSDimitry Andric
495ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
496ac9a064cSDimitry Andric VectorLoopStartBlock}});
497ac9a064cSDimitry Andric
498ac9a064cSDimitry Andric // Set up the first Vector loop block by creating the PHIs, doing the vector
499ac9a064cSDimitry Andric // loads and comparing the vectors.
500ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopStartBlock);
501ac9a064cSDimitry Andric auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
502ac9a064cSDimitry Andric VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
503ac9a064cSDimitry Andric
504ac9a064cSDimitry Andric // Calculate AVL by subtracting the vector loop index from the trip count
505ac9a064cSDimitry Andric Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
506ac9a064cSDimitry Andric /*HasNSW=*/true);
507ac9a064cSDimitry Andric
508ac9a064cSDimitry Andric auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
509ac9a064cSDimitry Andric auto *VF = ConstantInt::get(I32Type, ByteCompareVF);
510ac9a064cSDimitry Andric
511ac9a064cSDimitry Andric Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
512ac9a064cSDimitry Andric {I64Type}, {AVL, VF, Builder.getTrue()});
513ac9a064cSDimitry Andric Value *GepOffset = VectorIndexPhi;
514ac9a064cSDimitry Andric
515ac9a064cSDimitry Andric Value *VectorLhsGep =
516ac9a064cSDimitry Andric Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
517ac9a064cSDimitry Andric VectorType *TrueMaskTy =
518ac9a064cSDimitry Andric VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
519ac9a064cSDimitry Andric Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
520ac9a064cSDimitry Andric Value *VectorLhsLoad = Builder.CreateIntrinsic(
521ac9a064cSDimitry Andric Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
522ac9a064cSDimitry Andric {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
523ac9a064cSDimitry Andric
524ac9a064cSDimitry Andric Value *VectorRhsGep =
525ac9a064cSDimitry Andric Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
526ac9a064cSDimitry Andric Value *VectorRhsLoad = Builder.CreateIntrinsic(
527ac9a064cSDimitry Andric Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
528ac9a064cSDimitry Andric {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
529ac9a064cSDimitry Andric
530ac9a064cSDimitry Andric StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
531ac9a064cSDimitry Andric auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
532ac9a064cSDimitry Andric Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
533ac9a064cSDimitry Andric Value *VectorMatchCmp = Builder.CreateIntrinsic(
534ac9a064cSDimitry Andric Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
535ac9a064cSDimitry Andric {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
536ac9a064cSDimitry Andric "mismatch.cmp");
537ac9a064cSDimitry Andric Value *CTZ = Builder.CreateIntrinsic(
538ac9a064cSDimitry Andric Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
539ac9a064cSDimitry Andric {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
540ac9a064cSDimitry Andric VL});
541ac9a064cSDimitry Andric Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL);
542ac9a064cSDimitry Andric auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
543ac9a064cSDimitry Andric VectorLoopIncBlock, MismatchFound);
544ac9a064cSDimitry Andric Builder.Insert(VectorEarlyExit);
545ac9a064cSDimitry Andric
546ac9a064cSDimitry Andric DTU.applyUpdates(
547ac9a064cSDimitry Andric {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
548ac9a064cSDimitry Andric {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
549ac9a064cSDimitry Andric
550ac9a064cSDimitry Andric // Increment the index counter and calculate the predicate for the next
551ac9a064cSDimitry Andric // iteration of the loop. We branch back to the start of the loop if there
552ac9a064cSDimitry Andric // is at least one active lane.
553ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopIncBlock);
554ac9a064cSDimitry Andric Value *VL64 = Builder.CreateZExt(VL, I64Type);
555ac9a064cSDimitry Andric Value *NewVectorIndexPhi =
556ac9a064cSDimitry Andric Builder.CreateAdd(VectorIndexPhi, VL64, "",
557ac9a064cSDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true);
558ac9a064cSDimitry Andric VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
559ac9a064cSDimitry Andric Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
560ac9a064cSDimitry Andric auto *VectorLoopBranchBack =
561ac9a064cSDimitry Andric BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
562ac9a064cSDimitry Andric Builder.Insert(VectorLoopBranchBack);
563ac9a064cSDimitry Andric
564ac9a064cSDimitry Andric DTU.applyUpdates(
565ac9a064cSDimitry Andric {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
566ac9a064cSDimitry Andric {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
567ac9a064cSDimitry Andric
568ac9a064cSDimitry Andric // If we found a mismatch then we need to calculate which lane in the vector
569ac9a064cSDimitry Andric // had a mismatch and add that on to the current loop index.
570ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopMismatchBlock);
571ac9a064cSDimitry Andric
572ac9a064cSDimitry Andric // Add LCSSA phis for CTZ and VectorIndexPhi.
573ac9a064cSDimitry Andric auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
574ac9a064cSDimitry Andric CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
575ac9a064cSDimitry Andric auto *VectorIndexLCSSAPhi =
576ac9a064cSDimitry Andric Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
577ac9a064cSDimitry Andric VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
578ac9a064cSDimitry Andric
579ac9a064cSDimitry Andric Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
580ac9a064cSDimitry Andric Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
581ac9a064cSDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true);
582ac9a064cSDimitry Andric return Builder.CreateTrunc(VectorLoopRes64, ResType);
583ac9a064cSDimitry Andric }
584ac9a064cSDimitry Andric
expandFindMismatch(IRBuilder<> & Builder,DomTreeUpdater & DTU,GetElementPtrInst * GEPA,GetElementPtrInst * GEPB,Instruction * Index,Value * Start,Value * MaxLen)585ac9a064cSDimitry Andric Value *LoopIdiomVectorize::expandFindMismatch(
586ac9a064cSDimitry Andric IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
587ac9a064cSDimitry Andric GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
588ac9a064cSDimitry Andric Value *PtrA = GEPA->getPointerOperand();
589ac9a064cSDimitry Andric Value *PtrB = GEPB->getPointerOperand();
590ac9a064cSDimitry Andric
591ac9a064cSDimitry Andric // Get the arguments and types for the intrinsic.
592ac9a064cSDimitry Andric BasicBlock *Preheader = CurLoop->getLoopPreheader();
593ac9a064cSDimitry Andric BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
594ac9a064cSDimitry Andric LLVMContext &Ctx = PHBranch->getContext();
595ac9a064cSDimitry Andric Type *LoadType = Type::getInt8Ty(Ctx);
596ac9a064cSDimitry Andric Type *ResType = Builder.getInt32Ty();
597ac9a064cSDimitry Andric
598ac9a064cSDimitry Andric // Split block in the original loop preheader.
599ac9a064cSDimitry Andric EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
600ac9a064cSDimitry Andric
601ac9a064cSDimitry Andric // Create the blocks that we're going to need:
602ac9a064cSDimitry Andric // 1. A block for checking the zero-extended length exceeds 0
603ac9a064cSDimitry Andric // 2. A block to check that the start and end addresses of a given array
604ac9a064cSDimitry Andric // lie on the same page.
605ac9a064cSDimitry Andric // 3. The vector loop preheader.
606ac9a064cSDimitry Andric // 4. The first vector loop block.
607ac9a064cSDimitry Andric // 5. The vector loop increment block.
608ac9a064cSDimitry Andric // 6. A block we can jump to from the vector loop when a mismatch is found.
609ac9a064cSDimitry Andric // 7. The first block of the scalar loop itself, containing PHIs , loads
610ac9a064cSDimitry Andric // and cmp.
611ac9a064cSDimitry Andric // 8. A scalar loop increment block to increment the PHIs and go back
612ac9a064cSDimitry Andric // around the loop.
613ac9a064cSDimitry Andric
614ac9a064cSDimitry Andric BasicBlock *MinItCheckBlock = BasicBlock::Create(
615ac9a064cSDimitry Andric Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock);
616ac9a064cSDimitry Andric
617ac9a064cSDimitry Andric // Update the terminator added by SplitBlock to branch to the first block
618ac9a064cSDimitry Andric Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock);
619ac9a064cSDimitry Andric
620ac9a064cSDimitry Andric BasicBlock *MemCheckBlock = BasicBlock::Create(
621ac9a064cSDimitry Andric Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
622ac9a064cSDimitry Andric
623ac9a064cSDimitry Andric VectorLoopPreheaderBlock = BasicBlock::Create(
624ac9a064cSDimitry Andric Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
625ac9a064cSDimitry Andric
626ac9a064cSDimitry Andric VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
627ac9a064cSDimitry Andric EndBlock->getParent(), EndBlock);
628ac9a064cSDimitry Andric
629ac9a064cSDimitry Andric VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
630ac9a064cSDimitry Andric EndBlock->getParent(), EndBlock);
631ac9a064cSDimitry Andric
632ac9a064cSDimitry Andric VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
633ac9a064cSDimitry Andric EndBlock->getParent(), EndBlock);
634ac9a064cSDimitry Andric
635ac9a064cSDimitry Andric BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
636ac9a064cSDimitry Andric Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
637ac9a064cSDimitry Andric
638ac9a064cSDimitry Andric BasicBlock *LoopStartBlock =
639ac9a064cSDimitry Andric BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock);
640ac9a064cSDimitry Andric
641ac9a064cSDimitry Andric BasicBlock *LoopIncBlock = BasicBlock::Create(
642ac9a064cSDimitry Andric Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
643ac9a064cSDimitry Andric
644ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
645ac9a064cSDimitry Andric {DominatorTree::Delete, Preheader, EndBlock}});
646ac9a064cSDimitry Andric
647ac9a064cSDimitry Andric // Update LoopInfo with the new vector & scalar loops.
648ac9a064cSDimitry Andric auto VectorLoop = LI->AllocateLoop();
649ac9a064cSDimitry Andric auto ScalarLoop = LI->AllocateLoop();
650ac9a064cSDimitry Andric
651ac9a064cSDimitry Andric if (CurLoop->getParentLoop()) {
652ac9a064cSDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI);
653ac9a064cSDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI);
654ac9a064cSDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock,
655ac9a064cSDimitry Andric *LI);
656ac9a064cSDimitry Andric CurLoop->getParentLoop()->addChildLoop(VectorLoop);
657ac9a064cSDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI);
658ac9a064cSDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI);
659ac9a064cSDimitry Andric CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
660ac9a064cSDimitry Andric } else {
661ac9a064cSDimitry Andric LI->addTopLevelLoop(VectorLoop);
662ac9a064cSDimitry Andric LI->addTopLevelLoop(ScalarLoop);
663ac9a064cSDimitry Andric }
664ac9a064cSDimitry Andric
665ac9a064cSDimitry Andric // Add the new basic blocks to their associated loops.
666ac9a064cSDimitry Andric VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI);
667ac9a064cSDimitry Andric VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI);
668ac9a064cSDimitry Andric
669ac9a064cSDimitry Andric ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI);
670ac9a064cSDimitry Andric ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI);
671ac9a064cSDimitry Andric
672ac9a064cSDimitry Andric // Set up some types and constants that we intend to reuse.
673ac9a064cSDimitry Andric Type *I64Type = Builder.getInt64Ty();
674ac9a064cSDimitry Andric
675ac9a064cSDimitry Andric // Check the zero-extended iteration count > 0
676ac9a064cSDimitry Andric Builder.SetInsertPoint(MinItCheckBlock);
677ac9a064cSDimitry Andric Value *ExtStart = Builder.CreateZExt(Start, I64Type);
678ac9a064cSDimitry Andric Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type);
679ac9a064cSDimitry Andric // This check doesn't really cost us very much.
680ac9a064cSDimitry Andric
681ac9a064cSDimitry Andric Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
682ac9a064cSDimitry Andric BranchInst *MinItCheckBr =
683ac9a064cSDimitry Andric BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
684ac9a064cSDimitry Andric MinItCheckBr->setMetadata(
685ac9a064cSDimitry Andric LLVMContext::MD_prof,
686ac9a064cSDimitry Andric MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
687ac9a064cSDimitry Andric Builder.Insert(MinItCheckBr);
688ac9a064cSDimitry Andric
689ac9a064cSDimitry Andric DTU.applyUpdates(
690ac9a064cSDimitry Andric {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
691ac9a064cSDimitry Andric {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
692ac9a064cSDimitry Andric
693ac9a064cSDimitry Andric // For each of the arrays, check the start/end addresses are on the same
694ac9a064cSDimitry Andric // page.
695ac9a064cSDimitry Andric Builder.SetInsertPoint(MemCheckBlock);
696ac9a064cSDimitry Andric
697ac9a064cSDimitry Andric // The early exit in the original loop means that when performing vector
698ac9a064cSDimitry Andric // loads we are potentially reading ahead of the early exit. So we could
699ac9a064cSDimitry Andric // fault if crossing a page boundary. Therefore, we create runtime memory
700ac9a064cSDimitry Andric // checks based on the minimum page size as follows:
701ac9a064cSDimitry Andric // 1. Calculate the addresses of the first memory accesses in the loop,
702ac9a064cSDimitry Andric // i.e. LhsStart and RhsStart.
703ac9a064cSDimitry Andric // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd.
704ac9a064cSDimitry Andric // 3. Determine which pages correspond to all the memory accesses, i.e
705ac9a064cSDimitry Andric // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage.
706ac9a064cSDimitry Andric // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then
707ac9a064cSDimitry Andric // we know we won't cross any page boundaries in the loop so we can
708ac9a064cSDimitry Andric // enter the vector loop! Otherwise we fall back on the scalar loop.
709ac9a064cSDimitry Andric Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart);
710ac9a064cSDimitry Andric Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart);
711ac9a064cSDimitry Andric Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type);
712ac9a064cSDimitry Andric Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type);
713ac9a064cSDimitry Andric Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd);
714ac9a064cSDimitry Andric Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd);
715ac9a064cSDimitry Andric Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type);
716ac9a064cSDimitry Andric Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type);
717ac9a064cSDimitry Andric
718ac9a064cSDimitry Andric const uint64_t MinPageSize = TTI->getMinPageSize().value();
719ac9a064cSDimitry Andric const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
720ac9a064cSDimitry Andric Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt);
721ac9a064cSDimitry Andric Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt);
722ac9a064cSDimitry Andric Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt);
723ac9a064cSDimitry Andric Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt);
724ac9a064cSDimitry Andric Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage);
725ac9a064cSDimitry Andric Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
726ac9a064cSDimitry Andric
727ac9a064cSDimitry Andric Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
728ac9a064cSDimitry Andric BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
729ac9a064cSDimitry Andric LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp);
730ac9a064cSDimitry Andric CombinedPageCmpCmpBr->setMetadata(
731ac9a064cSDimitry Andric LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
732ac9a064cSDimitry Andric .createBranchWeights(10, 90));
733ac9a064cSDimitry Andric Builder.Insert(CombinedPageCmpCmpBr);
734ac9a064cSDimitry Andric
735ac9a064cSDimitry Andric DTU.applyUpdates(
736ac9a064cSDimitry Andric {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
737ac9a064cSDimitry Andric {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}});
738ac9a064cSDimitry Andric
739ac9a064cSDimitry Andric // Set up the vector loop preheader, i.e. calculate initial loop predicate,
740ac9a064cSDimitry Andric // zero-extend MaxLen to 64-bits, determine the number of vector elements
741ac9a064cSDimitry Andric // processed in each iteration, etc.
742ac9a064cSDimitry Andric Builder.SetInsertPoint(VectorLoopPreheaderBlock);
743ac9a064cSDimitry Andric
744ac9a064cSDimitry Andric // At this point we know two things must be true:
745ac9a064cSDimitry Andric // 1. Start <= End
746ac9a064cSDimitry Andric // 2. ExtMaxLen <= MinPageSize due to the page checks.
747ac9a064cSDimitry Andric // Therefore, we know that we can use a 64-bit induction variable that
748ac9a064cSDimitry Andric // starts from 0 -> ExtMaxLen and it will not overflow.
749ac9a064cSDimitry Andric Value *VectorLoopRes = nullptr;
750ac9a064cSDimitry Andric switch (VectorizeStyle) {
751ac9a064cSDimitry Andric case LoopIdiomVectorizeStyle::Masked:
752ac9a064cSDimitry Andric VectorLoopRes =
753ac9a064cSDimitry Andric createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
754ac9a064cSDimitry Andric break;
755ac9a064cSDimitry Andric case LoopIdiomVectorizeStyle::Predicated:
756ac9a064cSDimitry Andric VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB,
757ac9a064cSDimitry Andric ExtStart, ExtEnd);
758ac9a064cSDimitry Andric break;
759ac9a064cSDimitry Andric }
760ac9a064cSDimitry Andric
761ac9a064cSDimitry Andric Builder.Insert(BranchInst::Create(EndBlock));
762ac9a064cSDimitry Andric
763ac9a064cSDimitry Andric DTU.applyUpdates(
764ac9a064cSDimitry Andric {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
765ac9a064cSDimitry Andric
766ac9a064cSDimitry Andric // Generate code for scalar loop.
767ac9a064cSDimitry Andric Builder.SetInsertPoint(LoopPreHeaderBlock);
768ac9a064cSDimitry Andric Builder.Insert(BranchInst::Create(LoopStartBlock));
769ac9a064cSDimitry Andric
770ac9a064cSDimitry Andric DTU.applyUpdates(
771ac9a064cSDimitry Andric {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
772ac9a064cSDimitry Andric
773ac9a064cSDimitry Andric Builder.SetInsertPoint(LoopStartBlock);
774ac9a064cSDimitry Andric PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
775ac9a064cSDimitry Andric IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
776ac9a064cSDimitry Andric
777ac9a064cSDimitry Andric // Otherwise compare the values
778ac9a064cSDimitry Andric // Load bytes from each array and compare them.
779ac9a064cSDimitry Andric Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type);
780ac9a064cSDimitry Andric
781ac9a064cSDimitry Andric Value *LhsGep =
782ac9a064cSDimitry Andric Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
783ac9a064cSDimitry Andric Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep);
784ac9a064cSDimitry Andric
785ac9a064cSDimitry Andric Value *RhsGep =
786ac9a064cSDimitry Andric Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
787ac9a064cSDimitry Andric Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep);
788ac9a064cSDimitry Andric
789ac9a064cSDimitry Andric Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
790ac9a064cSDimitry Andric // If we have a mismatch then exit the loop ...
791ac9a064cSDimitry Andric BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
792ac9a064cSDimitry Andric Builder.Insert(MatchCmpBr);
793ac9a064cSDimitry Andric
794ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
795ac9a064cSDimitry Andric {DominatorTree::Insert, LoopStartBlock, EndBlock}});
796ac9a064cSDimitry Andric
797ac9a064cSDimitry Andric // Have we reached the maximum permitted length for the loop?
798ac9a064cSDimitry Andric Builder.SetInsertPoint(LoopIncBlock);
799ac9a064cSDimitry Andric Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
800ac9a064cSDimitry Andric /*HasNUW=*/Index->hasNoUnsignedWrap(),
801ac9a064cSDimitry Andric /*HasNSW=*/Index->hasNoSignedWrap());
802ac9a064cSDimitry Andric IndexPhi->addIncoming(PhiInc, LoopIncBlock);
803ac9a064cSDimitry Andric Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
804ac9a064cSDimitry Andric BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
805ac9a064cSDimitry Andric Builder.Insert(IVCmpBr);
806ac9a064cSDimitry Andric
807ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
808ac9a064cSDimitry Andric {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
809ac9a064cSDimitry Andric
810ac9a064cSDimitry Andric // In the end block we need to insert a PHI node to deal with three cases:
811ac9a064cSDimitry Andric // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
812ac9a064cSDimitry Andric // 2. We exitted the scalar loop early due to a mismatch and need to return
813ac9a064cSDimitry Andric // the index that we found.
814ac9a064cSDimitry Andric // 3. We didn't find a mismatch in the vector loop, so we return MaxLen.
815ac9a064cSDimitry Andric // 4. We exitted the vector loop early due to a mismatch and need to return
816ac9a064cSDimitry Andric // the index that we found.
817ac9a064cSDimitry Andric Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
818ac9a064cSDimitry Andric PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
819ac9a064cSDimitry Andric ResPhi->addIncoming(MaxLen, LoopIncBlock);
820ac9a064cSDimitry Andric ResPhi->addIncoming(IndexPhi, LoopStartBlock);
821ac9a064cSDimitry Andric ResPhi->addIncoming(MaxLen, VectorLoopIncBlock);
822ac9a064cSDimitry Andric ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock);
823ac9a064cSDimitry Andric
824ac9a064cSDimitry Andric Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType);
825ac9a064cSDimitry Andric
826ac9a064cSDimitry Andric if (VerifyLoops) {
827ac9a064cSDimitry Andric ScalarLoop->verifyLoop();
828ac9a064cSDimitry Andric VectorLoop->verifyLoop();
829ac9a064cSDimitry Andric if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI))
830ac9a064cSDimitry Andric report_fatal_error("Loops must remain in LCSSA form!");
831ac9a064cSDimitry Andric if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI))
832ac9a064cSDimitry Andric report_fatal_error("Loops must remain in LCSSA form!");
833ac9a064cSDimitry Andric }
834ac9a064cSDimitry Andric
835ac9a064cSDimitry Andric return FinalRes;
836ac9a064cSDimitry Andric }
837ac9a064cSDimitry Andric
transformByteCompare(GetElementPtrInst * GEPA,GetElementPtrInst * GEPB,PHINode * IndPhi,Value * MaxLen,Instruction * Index,Value * Start,bool IncIdx,BasicBlock * FoundBB,BasicBlock * EndBB)838ac9a064cSDimitry Andric void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
839ac9a064cSDimitry Andric GetElementPtrInst *GEPB,
840ac9a064cSDimitry Andric PHINode *IndPhi, Value *MaxLen,
841ac9a064cSDimitry Andric Instruction *Index, Value *Start,
842ac9a064cSDimitry Andric bool IncIdx, BasicBlock *FoundBB,
843ac9a064cSDimitry Andric BasicBlock *EndBB) {
844ac9a064cSDimitry Andric
845ac9a064cSDimitry Andric // Insert the byte compare code at the end of the preheader block
846ac9a064cSDimitry Andric BasicBlock *Preheader = CurLoop->getLoopPreheader();
847ac9a064cSDimitry Andric BasicBlock *Header = CurLoop->getHeader();
848ac9a064cSDimitry Andric BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
849ac9a064cSDimitry Andric IRBuilder<> Builder(PHBranch);
850ac9a064cSDimitry Andric DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
851ac9a064cSDimitry Andric Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
852ac9a064cSDimitry Andric
853ac9a064cSDimitry Andric // Increment the pointer if this was done before the loads in the loop.
854ac9a064cSDimitry Andric if (IncIdx)
855ac9a064cSDimitry Andric Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
856ac9a064cSDimitry Andric
857ac9a064cSDimitry Andric Value *ByteCmpRes =
858ac9a064cSDimitry Andric expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen);
859ac9a064cSDimitry Andric
860ac9a064cSDimitry Andric // Replaces uses of index & induction Phi with intrinsic (we already
861ac9a064cSDimitry Andric // checked that the the first instruction of Header is the Phi above).
862ac9a064cSDimitry Andric assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
863ac9a064cSDimitry Andric Index->replaceAllUsesWith(ByteCmpRes);
864ac9a064cSDimitry Andric
865ac9a064cSDimitry Andric assert(PHBranch->isUnconditional() &&
866ac9a064cSDimitry Andric "Expected preheader to terminate with an unconditional branch.");
867ac9a064cSDimitry Andric
868ac9a064cSDimitry Andric // If no mismatch was found, we can jump to the end block. Create a
869ac9a064cSDimitry Andric // new basic block for the compare instruction.
870ac9a064cSDimitry Andric auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
871ac9a064cSDimitry Andric Preheader->getParent());
872ac9a064cSDimitry Andric CmpBB->moveBefore(EndBB);
873ac9a064cSDimitry Andric
874ac9a064cSDimitry Andric // Replace the branch in the preheader with an always-true conditional branch.
875ac9a064cSDimitry Andric // This ensures there is still a reference to the original loop.
876ac9a064cSDimitry Andric Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
877ac9a064cSDimitry Andric PHBranch->eraseFromParent();
878ac9a064cSDimitry Andric
879ac9a064cSDimitry Andric BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
880ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
881ac9a064cSDimitry Andric
882ac9a064cSDimitry Andric // Create the branch to either the end or found block depending on the value
883ac9a064cSDimitry Andric // returned by the intrinsic.
884ac9a064cSDimitry Andric Builder.SetInsertPoint(CmpBB);
885ac9a064cSDimitry Andric if (FoundBB != EndBB) {
886ac9a064cSDimitry Andric Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
887ac9a064cSDimitry Andric Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
888ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
889ac9a064cSDimitry Andric {DominatorTree::Insert, CmpBB, EndBB}});
890ac9a064cSDimitry Andric
891ac9a064cSDimitry Andric } else {
892ac9a064cSDimitry Andric Builder.CreateBr(FoundBB);
893ac9a064cSDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
894ac9a064cSDimitry Andric }
895ac9a064cSDimitry Andric
896ac9a064cSDimitry Andric auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
897ac9a064cSDimitry Andric for (PHINode &PN : SuccBB->phis()) {
898ac9a064cSDimitry Andric // At this point we've already replaced all uses of the result from the
899ac9a064cSDimitry Andric // loop with ByteCmp. Look through the incoming values to find ByteCmp,
900ac9a064cSDimitry Andric // meaning this is a Phi collecting the results of the byte compare.
901ac9a064cSDimitry Andric bool ResPhi = false;
902ac9a064cSDimitry Andric for (Value *Op : PN.incoming_values())
903ac9a064cSDimitry Andric if (Op == ByteCmpRes) {
904ac9a064cSDimitry Andric ResPhi = true;
905ac9a064cSDimitry Andric break;
906ac9a064cSDimitry Andric }
907ac9a064cSDimitry Andric
908ac9a064cSDimitry Andric // Any PHI that depended upon the result of the byte compare needs a new
909ac9a064cSDimitry Andric // incoming value from CmpBB. This is because the original loop will get
910ac9a064cSDimitry Andric // deleted.
911ac9a064cSDimitry Andric if (ResPhi)
912ac9a064cSDimitry Andric PN.addIncoming(ByteCmpRes, CmpBB);
913ac9a064cSDimitry Andric else {
914ac9a064cSDimitry Andric // There should be no other outside uses of other values in the
915ac9a064cSDimitry Andric // original loop. Any incoming values should either:
916ac9a064cSDimitry Andric // 1. Be for blocks outside the loop, which aren't interesting. Or ..
917ac9a064cSDimitry Andric // 2. These are from blocks in the loop with values defined outside
918ac9a064cSDimitry Andric // the loop. We should a similar incoming value from CmpBB.
919ac9a064cSDimitry Andric for (BasicBlock *BB : PN.blocks())
920ac9a064cSDimitry Andric if (CurLoop->contains(BB)) {
921ac9a064cSDimitry Andric PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
922ac9a064cSDimitry Andric break;
923ac9a064cSDimitry Andric }
924ac9a064cSDimitry Andric }
925ac9a064cSDimitry Andric }
926ac9a064cSDimitry Andric };
927ac9a064cSDimitry Andric
928ac9a064cSDimitry Andric // Ensure all Phis in the successors of CmpBB have an incoming value from it.
929ac9a064cSDimitry Andric fixSuccessorPhis(EndBB);
930ac9a064cSDimitry Andric if (EndBB != FoundBB)
931ac9a064cSDimitry Andric fixSuccessorPhis(FoundBB);
932ac9a064cSDimitry Andric
933ac9a064cSDimitry Andric // The new CmpBB block isn't part of the loop, but will need to be added to
934ac9a064cSDimitry Andric // the outer loop if there is one.
935ac9a064cSDimitry Andric if (!CurLoop->isOutermost())
936ac9a064cSDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI);
937ac9a064cSDimitry Andric
938ac9a064cSDimitry Andric if (VerifyLoops && CurLoop->getParentLoop()) {
939ac9a064cSDimitry Andric CurLoop->getParentLoop()->verifyLoop();
940ac9a064cSDimitry Andric if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
941ac9a064cSDimitry Andric report_fatal_error("Loops must remain in LCSSA form!");
942ac9a064cSDimitry Andric }
943ac9a064cSDimitry Andric }
944