#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
}
namespace {
+/// Describes how an operand of a slow div/rem relates to BypassType.
+enum ValueRange {
+  /// The operand is known to fit into BypassType, so it needs no runtime
+  /// check.
+  VALRNG_SHORT,
+  /// Nothing is known about the operand's range; a runtime check is required.
+  VALRNG_UNKNOWN,
+  /// The operand is not expected to fit into BypassType, so bypassing should
+  /// be disabled.
+  VALRNG_LONG,
+};
+
class FastDivInsertionTask {
bool IsValidTask = false;
Instruction *SlowDivOrRem = nullptr;
IntegerType *BypassType = nullptr;
BasicBlock *MainBB = nullptr;
+ ValueRange getValueRange(Value *Op);
QuotRemWithBB createSlowBB(BasicBlock *Successor);
QuotRemWithBB createFastBB(BasicBlock *Successor);
QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
BasicBlock *PhiBB);
- Value *insertOperandRuntimeCheck();
+ Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
Optional<QuotRemPair> insertFastDivAndRem();
bool isSignedOp() {
return isDivisionOp() ? Value.Quotient : Value.Remainder;
}
+/// Classify \p V by what is statically known about the bits above the width of
+/// BypassType: VALRNG_SHORT if they are all known to be zero, VALRNG_LONG if at
+/// least one of them is known to be one, and VALRNG_UNKNOWN otherwise.
+ValueRange FastDivInsertionTask::getValueRange(Value *V) {
+  unsigned ShortLen = BypassType->getBitWidth();
+  unsigned LongLen = V->getType()->getIntegerBitWidth();
+  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+
+  // Number of high bits of V that BypassType cannot represent.
+  unsigned ExcessBits = LongLen - ShortLen;
+
+  APInt KnownZero(LongLen, 0);
+  APInt KnownOne(LongLen, 0);
+  computeKnownBits(V, KnownZero, KnownOne,
+                   SlowDivOrRem->getModule()->getDataLayout());
+
+  // Every excess bit is known to be zero: V fits without a runtime check.
+  bool HighBitsKnownZero = KnownZero.countLeadingOnes() >= ExcessBits;
+  // Some excess bit is known to be one: V cannot fit.
+  bool HighBitKnownOne = KnownOne.countLeadingZeros() < ExcessBits;
+
+  if (HighBitsKnownZero)
+    return VALRNG_SHORT;
+  return HighBitKnownOne ? VALRNG_LONG : VALRNG_UNKNOWN;
+}
+
/// Add new basic block for slow div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. True return
-/// value means that the operands fit.
-Value *FastDivInsertionTask::insertOperandRuntimeCheck() {
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+ assert((Op1 || Op2) && "Nothing to check");
IRBuilder<> Builder(MainBB, MainBB->end());
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
-
- // We should have bailed out above if the divisor is a constant, but the
- // dividend may still be a constant. Set OrV to our non-constant operands
- // OR'ed together.
- assert(!isa<ConstantInt>(Divisor));
Value *OrV;
- if (!isa<ConstantInt>(Dividend))
- OrV = Builder.CreateOr(Dividend, Divisor);
+ if (Op1 && Op2)
+ OrV = Builder.CreateOr(Op1, Op2);
else
- OrV = Divisor;
+ OrV = Op1 ? Op1 : Op2;
// BitMask is inverted to check if the operands are
// larger than the bypass type
return None;
}
- // If the numerator is a constant, bail if it doesn't fit into BypassType.
- if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
- if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
- return None;
-
- // Split the basic block before the div/rem.
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
- // Remove the unconditional branch from MainBB to SuccessorBB.
- MainBB->getInstList().back().eraseFromParent();
- QuotRemWithBB Fast = createFastBB(SuccessorBB);
- QuotRemWithBB Slow = createSlowBB(SuccessorBB);
- QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
- Value *CmpV = insertOperandRuntimeCheck();
- IRBuilder<> Builder(MainBB, MainBB->end());
- Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
- return Result;
+ ValueRange DividendRange = getValueRange(Dividend);
+ if (DividendRange == VALRNG_LONG)
+ return None;
+
+ ValueRange DivisorRange = getValueRange(Divisor);
+ if (DivisorRange == VALRNG_LONG)
+ return None;
+
+ bool DividendShort = (DividendRange == VALRNG_SHORT);
+ bool DivisorShort = (DivisorRange == VALRNG_SHORT);
+
+ if (DividendShort && DivisorShort) {
+ // If both operands are known to be short then just replace the long
+ // division with a short one in-place.
+
+ IRBuilder<> Builder(SlowDivOrRem);
+ Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+ Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+ Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+ Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+ Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+ Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+ return QuotRemPair(ExtDiv, ExtRem);
+ } else if (DividendShort && !isSignedOp()) {
+ // If the division is unsigned and Dividend is known to be short, then
+ // either
+ // 1) Divisor is less than or equal to Dividend, so it also fits into
+ // BypassType and the result can be computed with a short division.
+ // 2) Divisor is greater than Dividend. In this case, no division is needed
+ // at all: The quotient is 0 and the remainder is equal to Dividend.
+ //
+ // So instead of checking at runtime whether Divisor fits into BypassType,
+ // we emit a runtime check to differentiate between these two cases. This
+ // lets us entirely avoid a long div.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Long;
+ Long.BB = MainBB;
+ Long.Quotient = ConstantInt::get(getSlowType(), 0);
+ Long.Remainder = Dividend;
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+ return Result;
+ } else {
+ // General case. Create both slow and fast div/rem pairs and choose one of
+ // them at runtime.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+ DivisorShort ? nullptr : Divisor);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+ return Result;
+ }
}
/// This optimization identifies DIV/REM instructions in a BB that can be
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; No bypassing should be done if an operand is known not to fit in 32 bits.
+define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_no_bypassing(
+; CHECK-NEXT: [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT: [[A_2:%.*]] = sub i64 -1, [[A_1]]
+; CHECK-NEXT: [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+ %a.1 = zext i32 %a to i64
+ ; %a.2 == ~%a.1 has all upper 32 bits set (always negative): no bypassing.
+ %a.2 = sub i64 -1, %a.1
+ %res = srem i64 %a.2, %b
+ store i64 %res, i64* %retptr
+ ret void
+}
+
+; Only the dividend needs a runtime check (no OR instruction) because the
+; divisor is zero-extended from i32 and thus known to fit into 32 bits.
+define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_one_operand(
+; CHECK-NEXT: [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK: [[TMP4:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: br label [[TMP10:%.*]]
+; CHECK: [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]]
+; CHECK-NEXT: br label [[TMP10]]
+; CHECK: [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT: store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+ %b.1 = zext i32 %b to i64
+ %res = sdiv i64 %a, %b.1
+ store i64 %res, i64* %retptr
+ ret void
+}
+
+; Both operands are known to fit into 32 bits (dividend is masked, divisor is
+; zero-extended), so the division is replaced in-place with no CFG changes.
+define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_none(
+; CHECK-NEXT: [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT: [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: store i64 [[TMP4]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+ %a.1 = and i64 %a, 4294967295
+ %b.1 = zext i32 %b to i64
+ %res = udiv i64 %a.1, %b.1
+ store i64 %res, i64* %retptr
+ ret void
+}
+
+; For an unsigned division with a short dividend no long division is needed:
+; when divisor > dividend the quotient is 0 and the remainder is the dividend.
+define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_special_case(
+; CHECK-NEXT: [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]]
+; CHECK: [[TMP3:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: br label [[TMP9]]
+; CHECK: [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ]
+; CHECK-NEXT: [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT: ret void
+;
+ %a.1 = zext i32 %a to i64
+ %div = udiv i64 %a.1, %b
+ %rem = urem i64 %a.1, %b
+ %res = add i64 %div, %rem
+ store i64 %res, i64* %retptr
+ ret void
+}