summaryrefslogtreecommitdiff
path: root/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
diff options
context:
space:
mode:
authorKrzysztof Parzyszek <kparzysz@codeaurora.org>2017-03-20 18:12:58 +0000
committerKrzysztof Parzyszek <kparzysz@codeaurora.org>2017-03-20 18:12:58 +0000
commitfea30225e19731bd4c7393e0ceecb33dc96d934c (patch)
tree11fa808a1a9076c6430e83a7590c6935489c2d9d /lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
parentf6fa4ce469b08f36f67ecc136aef9afca7ae9b24 (diff)
[Hexagon] Recognize polynomial-modulo loop idiom again
Regain the ability to recognize loops calculating polynomial modulo operation. This ability has been lost due to some changes in the preceding optimizations. Add code to preprocess the IR to a form that the pattern matching code can recognize. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298282 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp')
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp717
1 files changed, 700 insertions, 17 deletions
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index d775764b43b..30623021f86 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -129,6 +129,342 @@ INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
"Recognize Hexagon-specific loop idioms", false, false)
+namespace {
+ struct Simplifier {
+ typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
+
+ void addRule(const Rule &R) { Rules.push_back(R); }
+
+ private:
+ typedef std::deque<Value*> WorkListType;
+ typedef std::set<Value*> ValueSetType;
+ std::vector<Rule> Rules;
+
+ public:
+ struct Context {
+ typedef DenseMap<Value*,Value*> ValueMapType;
+
+ Value *Root;
+ ValueSetType Used;
+ ValueMapType Clones, Orig;
+ LLVMContext &Ctx;
+
+ Context(Instruction *Exp)
+ : Ctx(Exp->getParent()->getParent()->getContext()) {
+ initialize(Exp);
+ reset();
+ }
+ ~Context() { cleanup(); }
+ void print(raw_ostream &OS, const Value *V) const;
+
+ Value *materialize(BasicBlock *B, BasicBlock::iterator At);
+
+ private:
+ void initialize(Instruction *Exp);
+ void reset();
+ void cleanup();
+ void cleanup(Value *V);
+
+ bool equal(const Instruction *I, const Instruction *J) const;
+ Value *find(Value *Tree, Value *Sub) const;
+ Value *subst(Value *Tree, Value *OldV, Value *NewV);
+ void replace(Value *OldV, Value *NewV);
+ void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
+
+ friend struct Simplifier;
+ };
+
+ Value *simplify(Context &C);
+ };
+
+ struct PE {
+ PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
+ const Simplifier::Context &C;
+ const Value *V;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
+ raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+ P.C.print(OS, P.V ? P.V : P.C.Root);
+ return OS;
+ }
+}
+
+
+void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
+ const auto *U = dyn_cast<const Instruction>(V);
+ if (!U) {
+ OS << V << '(' << *V << ')';
+ return;
+ }
+
+ if (U->getParent()) {
+ OS << U << '(';
+ U->printAsOperand(OS, true);
+ OS << ')';
+ return;
+ }
+
+ unsigned N = U->getNumOperands();
+ if (N != 0)
+ OS << U << '(';
+ OS << U->getOpcodeName();
+ for (const Value *Op : U->operands()) {
+ OS << ' ';
+ print(OS, Op);
+ }
+ if (N != 0)
+ OS << ')';
+}
+
+
+void Simplifier::Context::initialize(Instruction *Exp) {
+ // Perform a deep clone of the expression, set Root to the root
+ // of the clone, and build a map from the cloned values to the
+ // original ones.
+ BasicBlock *Block = Exp->getParent();
+ WorkListType Q;
+ Q.push_back(Exp);
+
+ while (!Q.empty()) {
+ Value *V = Q.front();
+ Q.pop_front();
+ if (Clones.find(V) != Clones.end())
+ continue;
+ if (Instruction *U = dyn_cast<Instruction>(V)) {
+ if (isa<PHINode>(U) || U->getParent() != Block)
+ continue;
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ Clones.insert({U, U->clone()});
+ }
+ }
+
+ for (std::pair<Value*,Value*> P : Clones) {
+ Instruction *U = cast<Instruction>(P.second);
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+ auto F = Clones.find(U->getOperand(i));
+ if (F != Clones.end())
+ U->setOperand(i, F->second);
+ }
+ Orig.insert({P.second, P.first});
+ }
+
+ auto R = Clones.find(Exp);
+ assert(R != Clones.end());
+ Root = R->second;
+}
+
+
+void Simplifier::Context::reset() {
+ ValueSetType NewUsed;
+ WorkListType Q;
+ Q.push_back(Root);
+
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.front());
+ Q.pop_front();
+ if (!U || U->getParent())
+ continue;
+ NewUsed.insert(U);
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ for (Value *V : Used)
+ if (!NewUsed.count(V))
+ cast<Instruction>(V)->dropAllReferences();
+ Used = NewUsed;
+}
+
+
+Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
+ if (Tree == OldV) {
+ cleanup(OldV);
+ return NewV;
+ }
+
+ WorkListType Q;
+ Q.push_back(Tree);
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.front());
+ Q.pop_front();
+ // If U is not an instruction, or it's not a clone, skip it.
+ if (!U || U->getParent())
+ continue;
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+ Value *Op = U->getOperand(i);
+ if (Op == OldV) {
+ cleanup(OldV);
+ U->setOperand(i, NewV);
+ } else {
+ Q.push_back(Op);
+ }
+ }
+ }
+ return Tree;
+}
+
+
+void Simplifier::Context::replace(Value *OldV, Value *NewV) {
+ if (Root == OldV) {
+ Root = NewV;
+ reset();
+ return;
+ }
+
+ // NewV may be a complex tree that has just been created by one of the
+ // transformation rules. We need to make sure that it is commoned with
+ // the existing Root to the maximum extent possible.
+ // Identify all subtrees of NewV (including NewV itself) that have
+ // equivalent counterparts in Root, and replace those subtrees with
+ // these counterparts.
+ WorkListType Q;
+ Q.push_back(NewV);
+ while (!Q.empty()) {
+ Value *V = Q.front();
+ Q.pop_front();
+ Instruction *U = dyn_cast<Instruction>(V);
+ if (!U || U->getParent())
+ continue;
+ if (Value *DupV = find(Root, V)) {
+ if (DupV != V)
+ NewV = subst(NewV, V, DupV);
+ } else {
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ }
+
+ // Now, simply replace OldV with NewV in Root.
+ Root = subst(Root, OldV, NewV);
+ reset();
+}
+
+
+void Simplifier::Context::cleanup() {
+ for (Value *V : Used) {
+ Instruction *U = cast<Instruction>(V);
+ if (!U->getParent())
+ U->dropAllReferences();
+ }
+}
+
+
+void Simplifier::Context::cleanup(Value *V) {
+ if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
+ return;
+ WorkListType Q;
+ Q.push_back(V);
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.front());
+ Q.pop_front();
+ if (!U || U->getParent() || Used.count(U))
+ continue;
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ U->dropAllReferences();
+ }
+}
+
+
+bool Simplifier::Context::equal(const Instruction *I,
+ const Instruction *J) const {
+ if (I == J)
+ return true;
+ if (!I->isSameOperationAs(J))
+ return false;
+ if (isa<PHINode>(I))
+ return I->isIdenticalTo(J);
+
+ for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+ Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
+ if (OpI == OpJ)
+ continue;
+ auto *InI = dyn_cast<const Instruction>(OpI);
+ auto *InJ = dyn_cast<const Instruction>(OpJ);
+ if (InI && InJ) {
+ if (!equal(InI, InJ))
+ return false;
+ } else if (InI != InJ || !InI)
+ return false;
+ }
+ return true;
+}
+
+
+Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
+ Instruction *SubI = dyn_cast<Instruction>(Sub);
+ WorkListType Q;
+ Q.push_back(Tree);
+
+ while (!Q.empty()) {
+ Value *V = Q.front();
+ Q.pop_front();
+ if (V == Sub)
+ return V;
+ Instruction *U = dyn_cast<Instruction>(V);
+ if (!U || U->getParent())
+ continue;
+ if (SubI && equal(SubI, U))
+ return U;
+ assert(!isa<PHINode>(U));
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ return nullptr;
+}
+
+
+void Simplifier::Context::link(Instruction *I, BasicBlock *B,
+ BasicBlock::iterator At) {
+ if (I->getParent())
+ return;
+
+ for (Value *Op : I->operands()) {
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ link(OpI, B, At);
+ }
+
+ B->getInstList().insert(At, I);
+}
+
+
+Value *Simplifier::Context::materialize(BasicBlock *B,
+ BasicBlock::iterator At) {
+ if (Instruction *RootI = dyn_cast<Instruction>(Root))
+ link(RootI, B, At);
+ return Root;
+}
+
+
+Value *Simplifier::simplify(Context &C) {
+ WorkListType Q;
+ Q.push_back(C.Root);
+
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.front());
+ Q.pop_front();
+ if (!U || U->getParent() || !C.Used.count(U))
+ continue;
+ bool Changed = false;
+ for (Rule &R : Rules) {
+ Value *W = R(U, C.Ctx);
+ if (!W)
+ continue;
+ Changed = true;
+ C.replace(U, W);
+ Q.push_back(C.Root);
+ break;
+ }
+ if (!Changed) {
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ }
+ return C.Root;
+}
+
+
//===----------------------------------------------------------------------===//
//
// Implementation of PolynomialMultiplyRecognize
@@ -147,6 +483,14 @@ namespace {
private:
typedef SetVector<Value*> ValueSeq;
+ IntegerType *getPmpyType() const {
+ LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
+ return IntegerType::get(Ctx, 32);
+ }
+ bool isPromotableTo(Value *V, IntegerType *Ty);
+ void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
+ bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
+
Value *getCountIV(BasicBlock *BB);
bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
@@ -176,6 +520,9 @@ namespace {
unsigned getInverseMxN(unsigned QP);
Value *generate(BasicBlock::iterator At, ParsedValues &PV);
+ void setupSimplifier();
+
+ Simplifier Simp;
Loop *CurLoop;
const DataLayout &DL;
const DominatorTree &DT;
@@ -425,7 +772,6 @@ bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
bool PreScan) {
using namespace PatternMatch;
-
// The basic pattern for R = P.Q is:
// for i = 0..31
// R = phi (0, R')
@@ -529,6 +875,150 @@ bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
}
+bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
+ IntegerType *DestTy) {
+ IntegerType *T = dyn_cast<IntegerType>(Val->getType());
+ if (!T || T->getBitWidth() > DestTy->getBitWidth())
+ return false;
+ if (T->getBitWidth() == DestTy->getBitWidth())
+ return true;
+ // Non-instructions are promotable. The reason why an instruction may not
+ // be promotable is that it may produce a different result if its operands
+ // and the result are promoted, for example, it may produce more non-zero
+ // bits. While it would still be possible to represent the proper result
+ // in a wider type, it may require adding additional instructions (which
+ // we don't want to do).
+ Instruction *In = dyn_cast<Instruction>(Val);
+ if (!In)
+ return true;
+ // The bitwidth of the source type is smaller than the destination.
+ // Check if the individual operation can be promoted.
+ switch (In->getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::ZExt:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr: // Shift right is ok.
+ case Instruction::Select:
+ return true;
+ case Instruction::ICmp:
+ if (CmpInst *CI = cast<CmpInst>(In))
+ return CI->isEquality() || CI->isUnsigned();
+ llvm_unreachable("Cast failed unexpectedly");
+ case Instruction::Add:
+ return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
+ }
+ return false;
+}
+
+
+void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
+ IntegerType *DestTy, BasicBlock *LoopB) {
+ // Leave boolean values alone.
+ if (!In->getType()->isIntegerTy(1))
+ In->mutateType(DestTy);
+ unsigned DestBW = DestTy->getBitWidth();
+
+ // Handle PHIs.
+ if (PHINode *P = dyn_cast<PHINode>(In)) {
+ unsigned N = P->getNumIncomingValues();
+ for (unsigned i = 0; i != N; ++i) {
+ BasicBlock *InB = P->getIncomingBlock(i);
+ if (InB == LoopB)
+ continue;
+ Value *InV = P->getIncomingValue(i);
+ IntegerType *Ty = cast<IntegerType>(InV->getType());
+ // Do not promote values in PHI nodes of type i1.
+ if (Ty != P->getType()) {
+ // If the value type does not match the PHI type, the PHI type
+ // must have been promoted.
+ assert(Ty->getBitWidth() < DestBW);
+ InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
+ P->setIncomingValue(i, InV);
+ }
+ }
+ } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
+ Value *Op = Z->getOperand(0);
+ if (Op->getType() == Z->getType())
+ Z->replaceAllUsesWith(Op);
+ Z->eraseFromParent();
+ return;
+ }
+
+ // Promote immediates.
+ for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
+ if (CI->getType()->getBitWidth() < DestBW)
+ In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
+ }
+}
+
+
+bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
+ BasicBlock *ExitB) {
+ assert(LoopB);
+ // Skip loops where the exit block has more than one predecessor. The values
+ // coming from the loop block will be promoted to another type, and so the
+ // values coming into the exit block from other predecessors would also have
+ // to be promoted.
+ if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
+ return false;
+ IntegerType *DestTy = getPmpyType();
+ // Check if the exit values have types that are no wider than the type
+ // that we want to promote to.
+ unsigned DestBW = DestTy->getBitWidth();
+ for (Instruction &In : *ExitB) {
+ PHINode *P = dyn_cast<PHINode>(&In);
+ if (!P)
+ break;
+ if (P->getNumIncomingValues() != 1)
+ return false;
+ assert(P->getIncomingBlock(0) == LoopB);
+ IntegerType *T = dyn_cast<IntegerType>(P->getType());
+ if (!T || T->getBitWidth() > DestBW)
+ return false;
+ }
+
+ // Check all instructions in the loop.
+ for (Instruction &In : *LoopB)
+ if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
+ return false;
+
+ // Perform the promotion.
+ std::vector<Instruction*> LoopIns;
+ std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
+ [](Instruction &In) { return &In; });
+ for (Instruction *In : LoopIns)
+ promoteTo(In, DestTy, LoopB);
+
+ // Fix up the PHI nodes in the exit block.
+ Instruction *EndI = ExitB->getFirstNonPHI();
+ BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
+ for (auto I = ExitB->begin(); I != End; ++I) {
+ PHINode *P = dyn_cast<PHINode>(I);
+ if (!P)
+ break;
+ Type *Ty0 = P->getIncomingValue(0)->getType();
+ Type *PTy = P->getType();
+ if (PTy != Ty0) {
+ assert(Ty0 == DestTy);
+ // In order to create the trunc, P must have the promoted type.
+ P->mutateType(Ty0);
+ Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
+ // In order for the RAUW to work, the types of P and T must match.
+ P->mutateType(PTy);
+ P->replaceAllUsesWith(T);
+ // Final update of the P's type.
+ P->mutateType(Ty0);
+ cast<Instruction>(T)->setOperand(0, P);
+ }
+ }
+
+ return true;
+}
+
+
bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
ValueSeq &Cycle) {
// Out = ..., In, ...
@@ -699,6 +1189,7 @@ bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
case Instruction::Select:
case Instruction::ICmp:
case Instruction::PHI:
+ case Instruction::ZExt:
return true;
}
}
@@ -985,13 +1476,170 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
}
+void PolynomialMultiplyRecognize::setupSimplifier() {
+ Simp.addRule(
+ // Sink zext past bitwise operations.
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::ZExt)
+ return nullptr;
+ Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
+ if (!T)
+ return nullptr;
+ switch (T->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ return nullptr;
+ }
+ IRBuilder<> B(Ctx);
+ return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
+ B.CreateZExt(T->getOperand(0), I->getType()),
+ B.CreateZExt(T->getOperand(1), I->getType()));
+ });
+ Simp.addRule(
+ // (xor (and x a) (and y a)) -> (and (xor x y) a)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::Xor)
+ return nullptr;
+ Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
+ Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!And0 || !And1)
+ return nullptr;
+ if (And0->getOpcode() != Instruction::And ||
+ And1->getOpcode() != Instruction::And)
+ return nullptr;
+ if (And0->getOperand(1) != And1->getOperand(1))
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
+ And0->getOperand(1));
+ });
+ Simp.addRule(
+ // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
+ // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
+ if (!BO)
+ return nullptr;
+ Instruction::BinaryOps Op = BO->getOpcode();
+ if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
+ IRBuilder<> B(Ctx);
+ Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
+ Value *Z = BO->getOperand(1);
+ return B.CreateSelect(Sel->getCondition(),
+ B.CreateBinOp(Op, X, Z),
+ B.CreateBinOp(Op, Y, Z));
+ }
+ if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
+ IRBuilder<> B(Ctx);
+ Value *X = BO->getOperand(0);
+ Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
+ return B.CreateSelect(Sel->getCondition(),
+ B.CreateBinOp(Op, X, Y),
+ B.CreateBinOp(Op, X, Z));
+ }
+ return nullptr;
+ });
+ Simp.addRule(
+ // (select c (select c x y) z) -> (select c x z)
+ // (select c x (select c y z)) -> (select c x z)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ SelectInst *Sel = dyn_cast<SelectInst>(I);
+ if (!Sel)
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ Value *C = Sel->getCondition();
+ if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
+ if (Sel0->getCondition() == C)
+ return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
+ }
+ if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
+ if (Sel1->getCondition() == C)
+ return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
+ }
+ return nullptr;
+ });
+ Simp.addRule(
+ // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::Or)
+ return nullptr;
+ Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
+ if (!LShr || LShr->getOpcode() != Instruction::LShr)
+ return nullptr;
+ ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
+ if (!One || One->getZExtValue() != 1)
+ return nullptr;
+ ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
+ return nullptr;
+ return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+ });
+ Simp.addRule(
+ // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::LShr)
+ return nullptr;
+ BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
+ if (!BitOp)
+ return nullptr;
+ switch (BitOp->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ return nullptr;
+ }
+ IRBuilder<> B(Ctx);
+ Value *S = I->getOperand(1);
+ return B.CreateBinOp(BitOp->getOpcode(),
+ B.CreateLShr(BitOp->getOperand(0), S),
+ B.CreateLShr(BitOp->getOperand(1), S));
+ });
+ Simp.addRule(
+ // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ auto IsBitOp = [](unsigned Op) -> bool {
+ switch (Op) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ }
+ return false;
+ };
+ BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
+ if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
+ return nullptr;
+ BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
+ if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
+ return nullptr;
+ ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
+ ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
+ if (!CA || !CB)
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ Value *X = BitOp2->getOperand(0);
+ return B.CreateBinOp(BitOp2->getOpcode(), X,
+ B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
+ });
+}
+
+
bool PolynomialMultiplyRecognize::recognize() {
+ DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+ << *CurLoop << '\n');
// Restrictions:
// - The loop must consist of a single block.
// - The iteration count must be known at compile-time.
// - The loop must have an induction variable starting from 0, and
// incremented in each iteration of the loop.
BasicBlock *LoopB = CurLoop->getHeader();
+ DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+
if (LoopB != CurLoop->getLoopLatch())
return false;
BasicBlock *ExitB = CurLoop->getExitBlock();
@@ -1011,30 +1659,65 @@ bool PolynomialMultiplyRecognize::recognize() {
Value *CIV = getCountIV(LoopB);
ParsedValues PV;
PV.IterCount = IterCount;
+ DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
+
+ setupSimplifier();
+
+ // Perform a preliminary scan of select instructions to see if any of them
+ // looks like a generator of the polynomial multiply steps. Assume that a
+ // loop can only contain a single transformable operation, so stop the
+ // traversal after the first reasonable candidate was found.
+ // XXX: Currently this approach can modify the loop before being 100% sure
+ // that the transformation can be carried out.
+ bool FoundPreScan = false;
+ for (Instruction &In : *LoopB) {
+ SelectInst *SI = dyn_cast<SelectInst>(&In);
+ if (!SI)
+ continue;
- // Test function to see if a given select instruction is a part of the
- // pmpy pattern. The argument PreScan set to "true" indicates that only
- // a preliminary scan is needed, "false" indicated an exact match.
- auto CouldBePmpy = [this, LoopB, EntryB, CIV, &PV] (bool PreScan)
- -> std::function<bool (Instruction &I)> {
- return [this, LoopB, EntryB, CIV, &PV, PreScan] (Instruction &I) -> bool {
- if (auto *SelI = dyn_cast<SelectInst>(&I))
- return scanSelect(SelI, LoopB, EntryB, CIV, PV, PreScan);
- return false;
- };
- };
- auto PreF = std::find_if(LoopB->begin(), LoopB->end(), CouldBePmpy(true));
- if (PreF == LoopB->end())
+ Simplifier::Context C(SI);
+ Value *T = Simp.simplify(C);
+ SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
+ DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+ if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
+ FoundPreScan = true;
+ if (SelI != SI) {
+ Value *NewSel = C.materialize(LoopB, SI->getIterator());
+ SI->replaceAllUsesWith(NewSel);
+ RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+ }
+ break;
+ }
+ }
+
+ if (!FoundPreScan) {
+ DEBUG(dbgs() << "Have not found candidates for pmpy\n");
return false;
+ }
if (!PV.Left) {
+ // The right shift version actually only returns the higher bits of
+ // the result (each iteration discards the LSB). If we want to convert it
+ // to a left-shifting loop, the working data type must be at least as
+ // wide as the target's pmpy instruction.
+ if (!promoteTypes(LoopB, ExitB))
+ return false;
convertShiftsToLeft(LoopB, ExitB, IterCount);
cleanupLoopBody(LoopB);
}
- auto PostF = std::find_if(LoopB->begin(), LoopB->end(), CouldBePmpy(false));
- if (PostF == LoopB->end())
- return false;
+ // Scan the loop again, find the generating select instruction.
+ bool FoundScan = false;
+ for (Instruction &In : *LoopB) {
+ SelectInst *SelI = dyn_cast<SelectInst>(&In);
+ if (!SelI)
+ continue;
+ DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+ FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
+ if (FoundScan)
+ break;
+ }
+ assert(FoundScan);
DEBUG({
StringRef PP = (PV.M ? "(P+M)" : "P");