[SimplifyCFG] Don't do if-conversion if there is a long dependence chain

If after if-conversion, most of the instructions in this new BB construct a long and slow dependence chain, it may be slower than cmp/branch, even if the branch has a high miss rate, because the control dependence is transformed into data dependence, and control dependence can be speculated, and thus, the second part can execute in parallel with the first part on modern OOO processor. This patch checks for the long dependence chain, and give up if-conversion if find one. Differential Revision: https://reviews.llvm.org/D39352 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@321377 91177308-0d34-0410-b5e6-96231b3b80d8
author: Guozhi Wei <carrot@google.com> 2017-12-22 18:54:04 +0000
committer: Guozhi Wei <carrot@google.com> 2017-12-22 18:54:04 +0000
commit: 7f53692f1dbc8c34463bac49aecc9b008dcb74c3 (patch)
tree: 259d063debc8e8256ddd5b3918c686589dbc7c9f
parent: 463ba76180cc06c5f2c0f1cfc97913ef48314456 (diff)
6 files changed, 427 insertions, 0 deletions
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index c20f20cfbe4..cecd8958e9d 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -646,6 +646,9 @@ public:
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -1018,6 +1021,7 @@ public:
                             Type *Ty) = 0;
   virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                             Type *Ty) = 0;
+  virtual bool isOutOfOrder() const = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
@@ -1295,6 +1299,9 @@ public:
                     Type *Ty) override {
     return Impl.getIntImmCost(IID, Idx, Imm, Ty);
   }
+  bool isOutOfOrder() const override {
+    return Impl.isOutOfOrder();
+  }
   unsigned getNumberOfRegisters(bool Vector) override {
     return Impl.getNumberOfRegisters(Vector);
   }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4c37402278e..3625675d53d 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -337,6 +337,8 @@ public:
     return TTI::TCC_Free;
   }
 
+  bool isOutOfOrder() const { return false; }
+
   unsigned getNumberOfRegisters(bool Vector) { return 8; }
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index 526ddb1b970..f1f9275b078 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -402,6 +402,10 @@ public:
     return BaseT::getInstructionLatency(I);
   }
 
+  bool isOutOfOrder() const {
+    return getST()->getSchedModel().isOutOfOrder();
+  }
+
   /// @}
 
   /// \name Vector TTI Implementations
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index b744cae51ed..c9e9c6d1a41 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -314,6 +314,10 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   return Cost;
 }
 
+bool TargetTransformInfo::isOutOfOrder() const {
+  return TTIImpl->isOutOfOrder();
+}
+
 unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
   return TTIImpl->getNumberOfRegisters(Vector);
 }
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index e7358dbcb62..b3c80424c8b 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -127,6 +127,16 @@ static cl::opt<unsigned> MaxSpeculationDepth(
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<unsigned> DependenceChainLatency(
+    "dependence-chain-latency", cl::Hidden, cl::init(8),
+    cl::desc("Limit the maximum latency of dependence chain containing cmp "
+             "for if conversion"));
+
+static cl::opt<unsigned> SmallBBSize(
+    "small-bb-size", cl::Hidden, cl::init(40),
+    cl::desc("Check dependence chain latency only in basic block smaller than "
+             "this number"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   return true;
 }
 
+/// Estimate the code size of the specified BB.
+static unsigned CountBBCodeSize(BasicBlock *BB,
+                                const TargetTransformInfo &TTI) {
+  unsigned Size = 0;
+  for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II)
+    Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize);
+  return Size;
+}
+
+/// Find out the latency of the longest dependence chain in the BB if
+/// LongestChain is true, or the dependence chain containing the compare
+/// instruction feeding the block's conditional branch.
+static unsigned FindDependenceChainLatency(BasicBlock *BB,
+                            DenseMap<Instruction *, unsigned> &Instructions,
+                            const TargetTransformInfo &TTI,
+                            bool LongestChain) {
+  unsigned MaxLatency = 0;
+
+  BasicBlock::iterator II;
+  for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) {
+    unsigned Latency = 0;
+    for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) {
+      Instruction *Op = dyn_cast<Instruction>(II->getOperand(O));
+      if (Op && Instructions.count(Op)) {
+        auto OpLatency = Instructions[Op];
+        if (OpLatency > Latency)
+          Latency = OpLatency;
+      }
+    }
+    Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency);
+    Instructions[&(*II)] = Latency;
+
+    if (Latency > MaxLatency)
+      MaxLatency = Latency;
+  }
+
+  if (LongestChain)
+    return MaxLatency;
+
+  // The length of the dependence chain containing the compare instruction is
+  // wanted, so the terminator must be a BranchInst.
+  assert(isa<BranchInst>(II));
+  BranchInst* Br = cast<BranchInst>(II);
+  Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition());
+  if (Cmp && Instructions.count(Cmp))
+    return Instructions[Cmp];
+  else
+    return 0;
+}
+
+/// Instructions in BB2 may depend on instructions in BB1, and instructions
+/// in BB1 may have users in BB2. If the last (in terms of latency) such kind
+/// of instruction in BB1 is I, then the instructions after I can be executed
+/// in parallel with instructions in BB2.
+/// This function returns the latency of I.
+static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2,
+                        BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                        DenseMap<Instruction *, unsigned> &BB1Instructions) {
+  unsigned LastLatency = 0;
+  SmallVector<Instruction *, 16> Worklist;
+  BasicBlock::iterator II;
+  for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) {
+    if (PHINode *PN = dyn_cast<PHINode>(II)) {
+      // Look for users in BB2.
+      bool InBBUser = false;
+      for (User *U : PN->users()) {
+        if (cast<Instruction>(U)->getParent() == BB2) {
+          InBBUser = true;
+          break;
+        }
+      }
+      // No such user, we don't care about this instruction and its operands.
+      if (!InBBUser)
+        break;
+    }
+    Worklist.push_back(&(*II));
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) {
+      if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) {
+        if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2)
+          Worklist.push_back(Op);
+        else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) {
+          if (BB1Instructions[Op] > LastLatency)
+            LastLatency = BB1Instructions[Op];
+        }
+      }
+    }
+  }
+
+  return LastLatency;
+}
+
+/// If after if conversion, most of the instructions in this new BB construct a
+/// long and slow dependence chain, it may be slower than cmp/branch, even
+/// if the branch has a high miss rate, because the control dependence is
+/// transformed into data dependence, and control dependence can be speculated,
+/// and thus, the second part can execute in parallel with the first part on
+/// modern OOO processor.
+///
+/// To check this condition, this function finds the length of the dependence
+/// chain in BB1 (only the part that can be executed in parallel with code after
+/// branch in BB2) containing cmp, and if the length is longer than a threshold,
+/// don't perform if conversion.
+///
+/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion.
+/// SpeculationSize contains the code size of IfBlock1 and IfBlock2.
+static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2,
+                             BasicBlock *IfBlock1, BasicBlock *IfBlock2,
+                             unsigned SpeculationSize,
+                             const TargetTransformInfo &TTI) {
+  // Accumulated latency of each instruction in their BBs.
+  DenseMap<Instruction *, unsigned> BB1Instructions;
+  DenseMap<Instruction *, unsigned> BB2Instructions;
+
+  if (!TTI.isOutOfOrder())
+    return false;
+
+  unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI)
+                         + SpeculationSize;
+
+  // We check small BB only since it is more difficult to find unrelated
+  // instructions to fill functional units in a small BB.
+  if (NewBBSize > SmallBBSize)
+    return false;
+
+  auto BB1Chain =
+         FindDependenceChainLatency(BB1, BB1Instructions, TTI, false);
+  auto BB2Chain =
+         FindDependenceChainLatency(BB2, BB2Instructions, TTI, true);
+
+  // If there are many unrelated instructions in the new BB, there will be
+  // other instructions for the processor to issue regardless of the length
+  // of this new dependence chain.
+  // Modern processors can issue 3 or more instructions in each cycle. But in
+  // real world applications, an IPC of 2 is already very good for non-loop
+  // code with small basic blocks. Higher IPC is usually found in programs with
+  // small kernel. So IPC of 2 is more reasonable for most applications.
+  if ((BB1Chain + BB2Chain) * 2 <= NewBBSize)
+    return false;
+
+  // We only care about part of the dependence chain in BB1 that can be
+  // executed in parallel with BB2, so adjust the latency.
+  BB1Chain -=
+      LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions);
+
+  // Correctly predicted branch instruction can skip the dependence chain in
+  // BB1, but misprediction has a penalty, so only when the dependence chain is
+  // longer than DependenceChainLatency, then branch is better than select.
+  // Besides misprediction penalty, the threshold value DependenceChainLatency
+  // also depends on branch misprediction rate, taken branch latency and cmov
+  // latency.
+  if (BB1Chain >= DependenceChainLatency)
+    return true;
+
+  return false;
+}
+
 /// Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
@@ -2044,6 +2214,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
   if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
+  // Don't do if conversion for long dependence chain.
+  if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr,
+                              CountBBCodeSize(ThenBB, TTI), TTI))
+    return false;
+
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
@@ -2351,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
       }
   }
 
+  if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2,
+                              AggressiveInsts.size(), TTI))
+    return false;
+
   DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: "
                << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n");
 
diff --git a/test/Transforms/SimplifyCFG/X86/if-conversion.ll b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
new file mode 100644
index 00000000000..28702572d48
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/if-conversion.ll
@@ -0,0 +1,231 @@
+; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s
+; Avoid if-conversion if there is a long dependence chain.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The first several cases test FindLongDependenceChain returns true, so
+; if-conversion is blocked.
+
+define i64 @test1(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-NOT: select
+}
+
+define i64 @test2(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = add i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: select
+}
+
+; The following cases test FindLongDependenceChain returns false, so
+; if-conversion will proceed.
+
+; Non trivial LatencyAdjustment.
+define i64 @test3(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %p4 = add i64 %p3, %1
+  %ptr = inttoptr i64 %p4 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test3
+; CHECK: select
+}
+
+; Short dependence chain.
+define i64 @test4(i64* %pp, i64* %p) {
+entry:
+  %0 = load i64, i64* %pp, align 8
+  %cmp = icmp slt i64 %0, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  ret i64 %val
+
+; CHECK-LABEL: @test4
+; CHECK: select
+}
+
+; High IPC.
+define i64 @test5(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  %2 = add i64 %pint, 2
+  %3 = add i64 %pint, 3
+  %4 = or i64 %pint, 16
+  %5 = and i64 %pint, 255
+
+  %6 = or i64 %2, 9
+  %7 = and i64 %3, 255
+  %8 = add i64 %4, 4
+  %9 = add i64 %5, 5
+
+  %10 = add i64 %6, 2
+  %11 = add i64 %7, 3
+  %12 = add i64 %8, 4
+  %13 = add i64 %9, 5
+
+  %14 = add i64 %10, 6
+  %15 = add i64 %11, 7
+  %16 = add i64 %12, 8
+  %17 = add i64 %13, 9
+
+  %18 = add i64 %14, 10
+  %19 = add i64 %15, 11
+  %20 = add i64 %16, 12
+  %21 = add i64 %17, 13
+
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+
+  ret i64 %val
+
+; CHECK-LABEL: @test5
+; CHECK: select
+}
+
+; Large BB size.
+define i64 @test6(i64** %pp, i64* %p) {
+entry:
+  %0 = load i64*, i64** %pp, align 8
+  %1 = load i64, i64* %0, align 8
+  %cmp = icmp slt i64 %1, 0
+  %pint = ptrtoint i64* %p to i64
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+  %p1 = add i64 %pint, 8
+  br label %cond.end
+
+cond.false:
+  %p2 = or i64 %pint, 16
+  br label %cond.end
+
+cond.end:
+  %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false]
+  %ptr = inttoptr i64 %p3 to i64*
+  %val = load i64, i64* %ptr, align 8
+  %2 = add i64 %pint, 2
+  %3 = add i64 %pint, 3
+  %4 = add i64 %2, 4
+  %5 = add i64 %3, 5
+  %6 = add i64 %4, 6
+  %7 = add i64 %5, 7
+  %8 = add i64 %6, 6
+  %9 = add i64 %7, 7
+  %10 = add i64 %8, 6
+  %11 = add i64 %9, 7
+  %12 = add i64 %10, 6
+  %13 = add i64 %11, 7
+  %14 = add i64 %12, 6
+  %15 = add i64 %13, 7
+  %16 = add i64 %14, 6
+  %17 = add i64 %15, 7
+  %18 = add i64 %16, 6
+  %19 = add i64 %17, 7
+  %20 = add i64 %18, 6
+  %21 = add i64 %19, 7
+  %22 = add i64 %20, 6
+  %23 = add i64 %21, 7
+  %24 = add i64 %22, 6
+  %25 = add i64 %23, 7
+  %26 = add i64 %24, 6
+  %27 = add i64 %25, 7
+  %28 = add i64 %26, 6
+  %29 = add i64 %27, 7
+  %30 = add i64 %28, 6
+  %31 = add i64 %29, 7
+  %32 = add i64 %30, 8
+  %33 = add i64 %31, 9
+  %34 = add i64 %32, %33
+  %35 = and i64 %34, 255
+  %res = add i64 %val, %35
+
+  ret i64 %res
+
+; CHECK-LABEL: @test6
+; CHECK: select
+}
author	Guozhi Wei <carrot@google.com>	2017-12-22 18:54:04 +0000
committer	Guozhi Wei <carrot@google.com>	2017-12-22 18:54:04 +0000
commit	7f53692f1dbc8c34463bac49aecc9b008dcb74c3 (patch)
tree	259d063debc8e8256ddd5b3918c686589dbc7c9f
parent	463ba76180cc06c5f2c0f1cfc97913ef48314456 (diff)