//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains an optimization for div and rem on architectures that // execute short instructions significantly faster than longer instructions. // For example, on Intel Atom 32-bit divides are slow enough that during // runtime it is profitable to check the value of the operands, and if they are // positive and less than 256 use an unsigned 8-bit divide. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/KnownBits.h" #include #include using namespace llvm; #define DEBUG_TYPE "bypass-slow-division" namespace { struct QuotRemPair { Value *Quotient; Value *Remainder; QuotRemPair(Value *InQuotient, Value *InRemainder) : Quotient(InQuotient), Remainder(InRemainder) {} }; /// A quotient and remainder, plus a BB from which they logically "originate". /// If you use Quotient or Remainder in a Phi node, you should use BB as its /// corresponding predecessor. struct QuotRemWithBB { BasicBlock *BB = nullptr; Value *Quotient = nullptr; Value *Remainder = nullptr; }; using DivCacheTy = DenseMap; using BypassWidthsTy = DenseMap; using VisitedSetTy = SmallPtrSet; enum ValueRange { /// Operand definitely fits into BypassType. No runtime checks are needed. VALRNG_KNOWN_SHORT, /// A runtime check is required, as value range is unknown. VALRNG_UNKNOWN, /// Operand is unlikely to fit into BypassType. The bypassing should be /// disabled. VALRNG_LIKELY_LONG }; class FastDivInsertionTask { bool IsValidTask = false; Instruction *SlowDivOrRem = nullptr; IntegerType *BypassType = nullptr; BasicBlock *MainBB = nullptr; bool isHashLikeValue(Value *V, VisitedSetTy &Visited); ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); QuotRemWithBB createSlowBB(BasicBlock *Successor); QuotRemWithBB createFastBB(BasicBlock *Successor); QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, BasicBlock *PhiBB); Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); Optional insertFastDivAndRem(); bool isSignedOp() { return SlowDivOrRem->getOpcode() == Instruction::SDiv || SlowDivOrRem->getOpcode() == Instruction::SRem; } bool isDivisionOp() { return SlowDivOrRem->getOpcode() == Instruction::SDiv || SlowDivOrRem->getOpcode() == Instruction::UDiv; } Type *getSlowType() { return SlowDivOrRem->getType(); } public: FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); Value *getReplacement(DivCacheTy &Cache); }; } // end anonymous namespace FastDivInsertionTask::FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths) { switch (I->getOpcode()) { case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: SlowDivOrRem = I; break; default: // I is not a div/rem operation. return; } // Skip division on vector types. Only optimize integer instructions. IntegerType *SlowType = dyn_cast(SlowDivOrRem->getType()); if (!SlowType) return; // Skip if this bitwidth is not bypassed. auto BI = BypassWidths.find(SlowType->getBitWidth()); if (BI == BypassWidths.end()) return; // Get type for div/rem instruction with bypass bitwidth. IntegerType *BT = IntegerType::get(I->getContext(), BI->second); BypassType = BT; // The original basic block. MainBB = I->getParent(); // The instruction is indeed a slow div or rem operation. IsValidTask = true; } /// Reuses previously-computed dividend or remainder from the current BB if /// operands and operation are identical. Otherwise calls insertFastDivAndRem to /// perform the optimization and caches the resulting dividend and remainder. /// If no replacement can be generated, nullptr is returned. Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { // First, make sure that the task is valid. if (!IsValidTask) return nullptr; // Then, look for a value in Cache. Value *Dividend = SlowDivOrRem->getOperand(0); Value *Divisor = SlowDivOrRem->getOperand(1); DivRemMapKey Key(isSignedOp(), Dividend, Divisor); auto CacheI = Cache.find(Key); if (CacheI == Cache.end()) { // If previous instance does not exist, try to insert fast div. Optional OptResult = insertFastDivAndRem(); // Bail out if insertFastDivAndRem has failed. if (!OptResult) return nullptr; CacheI = Cache.insert({Key, *OptResult}).first; } QuotRemPair &Value = CacheI->second; return isDivisionOp() ? Value.Quotient : Value.Remainder; } /// Check if a value looks like a hash. /// /// The routine is expected to detect values computed using the most common hash /// algorithms. Typically, hash computations end with one of the following /// instructions: /// /// 1) MUL with a constant wider than BypassType /// 2) XOR instruction /// /// And even if we are wrong and the value is not a hash, it is still quite /// unlikely that such values will fit into BypassType. /// /// To detect string hash algorithms like FNV we have to look through PHI-nodes. /// It is implemented as a depth-first search for values that look neither long /// nor hash-like. bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { Instruction *I = dyn_cast(V); if (!I) return false; switch (I->getOpcode()) { case Instruction::Xor: return true; case Instruction::Mul: { // After Constant Hoisting pass, long constants may be represented as // bitcast instructions. As a result, some constants may look like an // instruction at first, and an additional check is necessary to find out if // an operand is actually a constant. Value *Op1 = I->getOperand(1); ConstantInt *C = dyn_cast(Op1); if (!C && isa(Op1)) C = dyn_cast(cast(Op1)->getOperand(0)); return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); } case Instruction::PHI: // Stop IR traversal in case of a crazy input code. This limits recursion // depth. if (Visited.size() >= 16) return false; // Do not visit nodes that have been visited already. We return true because // it means that we couldn't find any value that doesn't look hash-like. if (Visited.find(I) != Visited.end()) return true; Visited.insert(I); return llvm::all_of(cast(I)->incoming_values(), [&](Value *V) { // Ignore undef values as they probably don't affect the division // operands. return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || isa(V); }); default: return false; } } /// Check if an integer value fits into our bypass type. ValueRange FastDivInsertionTask::getValueRange(Value *V, VisitedSetTy &Visited) { unsigned ShortLen = BypassType->getBitWidth(); unsigned LongLen = V->getType()->getIntegerBitWidth(); assert(LongLen > ShortLen && "Value type must be wider than BypassType"); unsigned HiBits = LongLen - ShortLen; const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); KnownBits Known(LongLen); computeKnownBits(V, Known, DL); if (Known.countMinLeadingZeros() >= HiBits) return VALRNG_KNOWN_SHORT; if (Known.countMaxLeadingZeros() < HiBits) return VALRNG_LIKELY_LONG; // Long integer divisions are often used in hashtable implementations. It's // not worth bypassing such divisions because hash values are extremely // unlikely to have enough leading zeros. The call below tries to detect // values that are unlikely to fit BypassType (including hashes). if (isHashLikeValue(V, Visited)) return VALRNG_LIKELY_LONG; return VALRNG_UNKNOWN; } /// Add new basic block for slow div and rem operations and put it before /// SuccessorBB. QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { QuotRemWithBB DivRemPair; DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", MainBB->getParent(), SuccessorBB); IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); Value *Dividend = SlowDivOrRem->getOperand(0); Value *Divisor = SlowDivOrRem->getOperand(1); if (isSignedOp()) { DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); } else { DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); } Builder.CreateBr(SuccessorBB); return DivRemPair; } /// Add new basic block for fast div and rem operations and put it before /// SuccessorBB. QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { QuotRemWithBB DivRemPair; DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", MainBB->getParent(), SuccessorBB); IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); Value *Dividend = SlowDivOrRem->getOperand(0); Value *Divisor = SlowDivOrRem->getOperand(1); Value *ShortDivisorV = Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); Value *ShortDividendV = Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); // udiv/urem because this optimization only handles positive numbers. Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); DivRemPair.Quotient = Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); DivRemPair.Remainder = Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); Builder.CreateBr(SuccessorBB); return DivRemPair; } /// Creates Phi nodes for result of Div and Rem. QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, BasicBlock *PhiBB) { IRBuilder<> Builder(PhiBB, PhiBB->begin()); PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); QuoPhi->addIncoming(LHS.Quotient, LHS.BB); QuoPhi->addIncoming(RHS.Quotient, RHS.BB); PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); RemPhi->addIncoming(LHS.Remainder, LHS.BB); RemPhi->addIncoming(RHS.Remainder, RHS.BB); return QuotRemPair(QuoPhi, RemPhi); } /// Creates a runtime check to test whether both the divisor and dividend fit /// into BypassType. The check is inserted at the end of MainBB. True return /// value means that the operands fit. Either of the operands may be NULL if it /// doesn't need a runtime check. Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { assert((Op1 || Op2) && "Nothing to check"); IRBuilder<> Builder(MainBB, MainBB->end()); Value *OrV; if (Op1 && Op2) OrV = Builder.CreateOr(Op1, Op2); else OrV = Op1 ? Op1 : Op2; // BitMask is inverted to check if the operands are // larger than the bypass type uint64_t BitMask = ~BypassType->getBitMask(); Value *AndV = Builder.CreateAnd(OrV, BitMask); // Compare operand values Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); return Builder.CreateICmpEQ(AndV, ZeroV); } /// Substitutes the div/rem instruction with code that checks the value of the /// operands and uses a shorter-faster div/rem instruction when possible. Optional FastDivInsertionTask::insertFastDivAndRem() { Value *Dividend = SlowDivOrRem->getOperand(0); Value *Divisor = SlowDivOrRem->getOperand(1); VisitedSetTy SetL; ValueRange DividendRange = getValueRange(Dividend, SetL); if (DividendRange == VALRNG_LIKELY_LONG) return None; VisitedSetTy SetR; ValueRange DivisorRange = getValueRange(Divisor, SetR); if (DivisorRange == VALRNG_LIKELY_LONG) return None; bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); if (DividendShort && DivisorShort) { // If both operands are known to be short then just replace the long // division with a short one in-place. Since we're not introducing control // flow in this case, narrowing the division is always a win, even if the // divisor is a constant (and will later get replaced by a multiplication). IRBuilder<> Builder(SlowDivOrRem); Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); return QuotRemPair(ExtDiv, ExtRem); } if (isa(Divisor)) { // If the divisor is not a constant, DAGCombiner will convert it to a // multiplication by a magic constant. It isn't clear if it is worth // introducing control flow to get a narrower multiply. return None; } // After Constant Hoisting pass, long constants may be represented as // bitcast instructions. As a result, some constants may look like an // instruction at first, and an additional check is necessary to find out if // an operand is actually a constant. if (auto *BCI = dyn_cast(Divisor)) if (BCI->getParent() == SlowDivOrRem->getParent() && isa(BCI->getOperand(0))) return None; if (DividendShort && !isSignedOp()) { // If the division is unsigned and Dividend is known to be short, then // either // 1) Divisor is less or equal to Dividend, and the result can be computed // with a short division. // 2) Divisor is greater than Dividend. In this case, no division is needed // at all: The quotient is 0 and the remainder is equal to Dividend. // // So instead of checking at runtime whether Divisor fits into BypassType, // we emit a runtime check to differentiate between these two cases. This // lets us entirely avoid a long div. // Split the basic block before the div/rem. BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); // Remove the unconditional branch from MainBB to SuccessorBB. MainBB->getInstList().back().eraseFromParent(); QuotRemWithBB Long; Long.BB = MainBB; Long.Quotient = ConstantInt::get(getSlowType(), 0); Long.Remainder = Dividend; QuotRemWithBB Fast = createFastBB(SuccessorBB); QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); IRBuilder<> Builder(MainBB, MainBB->end()); Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); return Result; } else { // General case. Create both slow and fast div/rem pairs and choose one of // them at runtime. // Split the basic block before the div/rem. BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); // Remove the unconditional branch from MainBB to SuccessorBB. MainBB->getInstList().back().eraseFromParent(); QuotRemWithBB Fast = createFastBB(SuccessorBB); QuotRemWithBB Slow = createSlowBB(SuccessorBB); QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, DivisorShort ? nullptr : Divisor); IRBuilder<> Builder(MainBB, MainBB->end()); Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); return Result; } } /// This optimization identifies DIV/REM instructions in a BB that can be /// profitably bypassed and carried out with a shorter, faster divide. bool llvm::bypassSlowDivision(BasicBlock *BB, const BypassWidthsTy &BypassWidths) { DivCacheTy PerBBDivCache; bool MadeChange = false; Instruction* Next = &*BB->begin(); while (Next != nullptr) { // We may add instructions immediately after I, but we want to skip over // them. Instruction* I = Next; Next = Next->getNextNode(); FastDivInsertionTask Task(I, BypassWidths); if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { I->replaceAllUsesWith(Replacement); I->eraseFromParent(); MadeChange = true; } } // Above we eagerly create divs and rems, as pairs, so that we can efficiently // create divrem machine instructions. Now erase any unused divs / rems so we // don't leave extra instructions sitting around. for (auto &KV : PerBBDivCache) for (Value *V : {KV.second.Quotient, KV.second.Remainder}) RecursivelyDeleteTriviallyDeadInstructions(V); return MadeChange; }