summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJun Bum Lim <junbuml@codeaurora.org>2017-12-15 16:09:54 +0000
committerJun Bum Lim <junbuml@codeaurora.org>2017-12-15 16:09:54 +0000
commit20ea5a516182ba80d098f722ff822fda9702d583 (patch)
tree93172453fb744a870cfd52ab0c7fe05d01431ec5
parent213645abbdf5e9f079496abf8798b94221026db9 (diff)
[LICM] Allow sinking when foldable in loop
Summary: Continue trying to sink an instruction if its users in the loop is foldable. This will allow the instruction to be folded in the loop by decoupling it from the user outside of the loop. Reviewers: hfinkel, majnemer, davidxl, efriedma, danielcdh, bmakam, mcrosier Reviewed By: hfinkel Subscribers: javed.absar, bmakam, mcrosier, llvm-commits Differential Revision: https://reviews.llvm.org/D37076 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320823 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/Transforms/Utils/LoopUtils.h5
-rw-r--r--lib/Transforms/Scalar/LICM.cpp107
-rw-r--r--test/Transforms/LICM/sink-foldable.ll147
3 files changed, 228 insertions, 31 deletions
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 90299c5e6a2..75066613650 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -436,8 +436,9 @@ bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
/// instructions of the loop and loop safety information as
/// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
- TargetLibraryInfo *, Loop *, AliasSetTracker *,
- LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
+ TargetLibraryInfo *, TargetTransformInfo *, Loop *,
+ AliasSetTracker *, LoopSafetyInfo *,
+ OptimizationRemarkEmitter *ORE);
/// \brief Walk the specified region of the CFG (defined by all blocks
/// dominated by the specified block, and that are in the current loop) in depth
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index f610aae2403..ccb0af2d85a 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -90,14 +90,15 @@ static cl::opt<uint32_t> MaxNumUsesTraversed(
"invariance in loop using invariant start (default = 8)"));
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop);
static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE);
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE);
+ OptimizationRemarkEmitter *ORE, bool FreeInLoop);
static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
@@ -115,7 +116,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
namespace {
struct LoopInvariantCodeMotion {
bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI, ScalarEvolution *SE, MemorySSA *MSSA,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+ ScalarEvolution *SE, MemorySSA *MSSA,
OptimizationRemarkEmitter *ORE, bool DeleteAST);
DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
@@ -159,6 +161,8 @@ struct LegacyLICMPass : public LoopPass {
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
&getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent()),
SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
}
@@ -170,6 +174,7 @@ struct LegacyLICMPass : public LoopPass {
AU.addRequired<TargetLibraryInfoWrapperPass>();
if (EnableMSSALoopDependency)
AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
@@ -210,8 +215,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
"cached at a higher level");
LoopInvariantCodeMotion LICM;
- if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, AR.MSSA, ORE,
- true))
+ if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
+ AR.MSSA, ORE, true))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
@@ -224,6 +229,7 @@ INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
false)
@@ -236,12 +242,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
/// We should delete AST for inner loops in the new pass manager to avoid
/// memory leak.
///
-bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
- LoopInfo *LI, DominatorTree *DT,
- TargetLibraryInfo *TLI,
- ScalarEvolution *SE, MemorySSA *MSSA,
- OptimizationRemarkEmitter *ORE,
- bool DeleteAST) {
+bool LoopInvariantCodeMotion::runOnLoop(
+ Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
+ MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) {
bool Changed = false;
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -266,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
// instructions, we perform another pass to hoist them out of the loop.
//
if (L->hasDedicatedExits())
- Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
CurAST, &SafetyInfo, ORE);
if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
@@ -359,7 +363,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
- DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ DominatorTree *DT, TargetLibraryInfo *TLI,
+ TargetTransformInfo *TTI, Loop *CurLoop,
AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE) {
@@ -400,12 +405,15 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// outside of the loop. In this case, it doesn't even matter if the
// operands of the instruction are loop invariant.
//
- if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
+ bool FreeInLoop = false;
+ if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
- if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
- ++II;
- CurAST->deleteValue(&I);
- I.eraseFromParent();
+ if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
+ if (!FreeInLoop) {
+ ++II;
+ CurAST->deleteValue(&I);
+ I.eraseFromParent();
+ }
Changed = true;
}
}
@@ -708,13 +716,39 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
return true;
}
+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const TargetTransformInfo *TTI) {
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (TTI->getUserCost(&I) != TargetTransformInfo::TCC_Free)
+ return false;
+ // For a GEP, we cannot simply use getUserCost because currently it
+ // optimistically assume that a GEP will fold into addressing mode
+ // regardless of its users.
+ const BasicBlock *BB = I.getParent();
+ for (const User *U : I.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (CurLoop->contains(UI) &&
+ (BB != UI->getParent() ||
+ (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+ return false;
+ }
+ return true;
+ } else
+ return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
+
/// Return true if the only users of this instruction are outside of
/// the loop. If this is true, we can sink the instruction to the exit
/// blocks of the loop.
///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop) {
const auto &BlockColors = SafetyInfo->BlockColors;
+ bool IsFree = isFreeInLoop(I, CurLoop, TTI);
for (const User *U : I.users()) {
const Instruction *UI = cast<Instruction>(U);
if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -731,8 +765,13 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
return false;
}
- if (CurLoop->contains(UI))
+ if (CurLoop->contains(UI)) {
+ if (IsFree) {
+ FreeInLoop = true;
+ continue;
+ }
return false;
+ }
}
return true;
}
@@ -888,7 +927,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
///
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE) {
+ OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
ORE->emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
@@ -900,7 +939,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
else if (isa<CallInst>(I))
++NumMovedCalls;
++NumSunk;
- Changed = true;
// Iterate over users to be ready for actual sinking. Replace users via
// unrechable blocks with undef and make all user PHIs trivially replcable.
@@ -910,11 +948,12 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
Use &U = UI.getUse();
++UI;
- if (VisitedUsers.count(User))
+ if (VisitedUsers.count(User) || CurLoop->contains(User))
continue;
if (!DT->isReachableFromEntry(User->getParent())) {
U = UndefValue::get(I.getType());
+ Changed = true;
continue;
}
@@ -927,6 +966,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
BasicBlock *BB = PN->getIncomingBlock(U);
if (!DT->isReachableFromEntry(BB)) {
U = UndefValue::get(I.getType());
+ Changed = true;
continue;
}
@@ -935,7 +975,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
continue;
if (!canSplitPredecessors(PN))
- return false;
+ return Changed;
// Split predecessors of the PHI so that we can make users trivially
// replacable.
@@ -947,6 +987,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
UE = I.user_end();
}
+ if (VisitedUsers.empty())
+ return Changed;
+
#ifndef NDEBUG
SmallVector<BasicBlock *, 32> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
@@ -960,9 +1003,14 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
// If this instruction is only used outside of the loop, then all users are
// PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
// the instruction.
- while (!I.use_empty()) {
- Value::user_iterator UI = I.user_begin();
- PHINode *PN = cast<PHINode>(*UI);
+ SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+ for (auto *UI : Users) {
+ auto *User = cast<Instruction>(UI);
+
+ if (CurLoop->contains(User))
+ continue;
+
+ PHINode *PN = cast<PHINode>(User);
assert(ExitBlockSet.count(PN->getParent()) &&
"The LCSSA PHI is not in an exit block!");
// The PHI must be trivially replacable.
@@ -970,6 +1018,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
SafetyInfo, CurLoop);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
+ Changed = true;
}
return Changed;
}
diff --git a/test/Transforms/LICM/sink-foldable.ll b/test/Transforms/LICM/sink-foldable.ll
new file mode 100644
index 00000000000..cf67e35efed
--- /dev/null
+++ b/test/Transforms/LICM/sink-foldable.ll
@@ -0,0 +1,147 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+target triple = "aarch64--linux-gnueabi"
+
+; CHECK-LABEL:@test1
+; CHECK-LABEL:loopexit1:
+; CHECK: %[[PHI:.+]] = phi i8** [ %arrayidx0, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]], i64 1
+define i8** @test1(i32 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+ %cmp0 = icmp slt i32 0, %j
+ br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end ]
+ %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+ %i0.ext = sext i32 %i0 to i64
+ %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+ %l0 = load i8*, i8** %arrayidx0, align 8
+ %cmp1 = icmp ugt i8* %l0, %Q
+ br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end: ; preds = %for.body
+ %arrayidx1 = getelementptr inbounds i8*, i8** %arrayidx0, i64 1
+ %l1 = load i8*, i8** %arrayidx1, align 8
+ %cmp4 = icmp ugt i8* %l1, %Q
+ %i.add = add nsw i32 %i0, 2
+ br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+ %p1 = phi i8** [%arrayidx0, %for.body]
+ br label %return
+
+loopexit1:
+ %p2 = phi i8** [%arrayidx1, %if.end]
+ br label %return
+
+return:
+ %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+ ret i8** %retval.0
+}
+
+; CHECK-LABEL: @test2
+; CHECK-LABEL: loopexit2:
+; CHECK: %[[PHI:.*]] = phi i8** [ %add.ptr, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]]
+define i8** @test2(i32 %j, i8** readonly %P, i8* readnone %Q) {
+
+entry:
+ br label %for.body
+
+for.cond:
+ %i.addr.0 = phi i32 [ %add, %if.end ]
+ %P.addr.0 = phi i8** [ %add.ptr, %if.end ]
+ %cmp = icmp slt i32 %i.addr.0, %j
+ br i1 %cmp, label %for.body, label %loopexit0
+
+for.body:
+ %P.addr = phi i8** [ %P, %entry ], [ %P.addr.0, %for.cond ]
+ %i.addr = phi i32 [ 0, %entry ], [ %i.addr.0, %for.cond ]
+
+ %idx.ext = sext i32 %i.addr to i64
+ %add.ptr = getelementptr inbounds i8*, i8** %P.addr, i64 %idx.ext
+ %l0 = load i8*, i8** %add.ptr, align 8
+
+ %cmp1 = icmp ugt i8* %l0, %Q
+ br i1 %cmp1, label %loopexit1, label %if.end
+
+if.end:
+ %add.i = add i32 %i.addr, 1
+ %idx2.ext = sext i32 %add.i to i64
+ %arrayidx2 = getelementptr inbounds i8*, i8** %add.ptr, i64 %idx2.ext
+ %l1 = load i8*, i8** %arrayidx2, align 8
+ %cmp2 = icmp ugt i8* %l1, %Q
+ %add = add nsw i32 %add.i, 1
+ br i1 %cmp2, label %loopexit2, label %for.cond
+
+loopexit0:
+ %p0 = phi i8** [ null, %for.cond ]
+ br label %return
+
+loopexit1:
+ %p1 = phi i8** [ %add.ptr, %for.body ]
+ br label %return
+
+loopexit2:
+ %p2 = phi i8** [ %arrayidx2, %if.end ]
+ br label %return
+
+return:
+ %retval.0 = phi i8** [ %p1, %loopexit1 ], [ %p2, %loopexit2 ], [ %p0, %loopexit0 ]
+ ret i8** %retval.0
+}
+
+
+; CHECK-LABEL: @test3
+; CHECK-LABEL: loopexit1:
+; CHECK: %[[ADD:.*]] = phi i64 [ %add, %if.end ]
+; CHECK: %[[ADDR:.*]] = phi i8** [ %P.addr, %if.end ]
+; CHECK: %[[TRUNC:.*]] = trunc i64 %[[ADD]] to i32
+; CHECK: getelementptr inbounds i8*, i8** %[[ADDR]], i32 %[[TRUNC]]
+; CHECK: call void @dummy(i32 %[[TRUNC]])
+define i8** @test3(i64 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+ %cmp0 = icmp slt i64 0, %j
+ br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end ]
+ %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+ %i0.ext = sext i32 %i0 to i64
+ %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+ %l0 = load i8*, i8** %arrayidx0, align 8
+ %cmp1 = icmp ugt i8* %l0, %Q
+ br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end: ; preds = %for.body
+ %add = add i64 %i0.ext, 1
+ %trunc = trunc i64 %add to i32
+ %arrayidx1 = getelementptr inbounds i8*, i8** %P.addr, i32 %trunc
+ %l1 = load i8*, i8** %arrayidx1, align 8
+ %cmp4 = icmp ugt i8* %l1, %Q
+ %i.add = add nsw i32 %i0, 2
+ br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+ %p1 = phi i8** [%arrayidx0, %for.body]
+ br label %return
+
+loopexit1:
+ %p2 = phi i8** [%arrayidx1, %if.end]
+ call void @dummy(i32 %trunc)
+ br label %return
+
+return:
+ %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+ ret i8** %retval.0
+}
+
+declare void @dummy(i32)