[BypassSlowDivision] Use ValueTracking to simplify run-time checks

author Nikolai Bozhenov <nikolai.bozhenov@intel.com>

Thu, 2 Mar 2017 22:12:15 +0000 (22:12 +0000)

committer Nikolai Bozhenov <nikolai.bozhenov@intel.com>

Thu, 2 Mar 2017 22:12:15 +0000 (22:12 +0000)
author Nikolai Bozhenov <nikolai.bozhenov@intel.com>
Thu, 2 Mar 2017 22:12:15 +0000 (22:12 +0000)
committer Nikolai Bozhenov <nikolai.bozhenov@intel.com>
Thu, 2 Mar 2017 22:12:15 +0000 (22:12 +0000)
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp

index ed663de01998bc69300a79e0f2733d5dfc4b15af..9d8cb3187eec9c2526277a050129d1c5b649b00d 100644 (file)
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,6 +17,7 @@
  
  #include "llvm/Transforms/Utils/BypassSlowDivision.h"
  #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
@@ -83,17 +84,28 @@ namespace llvm {
  }
  
  namespace {
+enum ValueRange {
+  /// Operand definitely fits into BypassType. No runtime checks are needed.
+  VALRNG_SHORT,
+  /// A runtime check is required, as value range is unknown.
+  VALRNG_UNKNOWN,
+  /// Operand is known not to fit into BypassType (a high bit is known to be
+  /// set). The bypassing should be disabled.
+  VALRNG_LONG
+};
+
  class FastDivInsertionTask {
    bool IsValidTask = false;
    Instruction *SlowDivOrRem = nullptr;
    IntegerType *BypassType = nullptr;
    BasicBlock *MainBB = nullptr;
  
+  ValueRange getValueRange(Value *Op);
    QuotRemWithBB createSlowBB(BasicBlock *Successor);
    QuotRemWithBB createFastBB(BasicBlock *Successor);
    QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                     BasicBlock *PhiBB);
-  Value *insertOperandRuntimeCheck();
+  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
    Optional<QuotRemPair> insertFastDivAndRem();
  
    bool isSignedOp() {
@@ -175,6 +187,28 @@ Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
    return isDivisionOp() ? Value.Quotient : Value.Remainder;
  }
  
+/// Classify an integer value by whether it is known to fit into BypassType.
+ValueRange FastDivInsertionTask::getValueRange(Value *V) {
+  unsigned ShortLen = BypassType->getBitWidth();
+  unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+  unsigned HiBits = LongLen - ShortLen;
+
+  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+  APInt Zeros(LongLen, 0), Ones(LongLen, 0);
+
+  computeKnownBits(V, Zeros, Ones, DL);
+
+  if (Zeros.countLeadingOnes() >= HiBits)
+    return VALRNG_SHORT;
+
+  if (Ones.countLeadingZeros() < HiBits)
+    return VALRNG_LONG;
+
+  return VALRNG_UNKNOWN;
+}
+
  /// Add new basic block for slow div and rem operations and put it before
  /// SuccessorBB.
  QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
@@ -241,22 +275,17 @@ QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
  
  /// Creates a runtime check to test whether both the divisor and dividend fit
  /// into BypassType. The check is inserted at the end of MainBB. True return
-/// value means that the operands fit.
-Value *FastDivInsertionTask::insertOperandRuntimeCheck() {
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+  assert((Op1 || Op2) && "Nothing to check");
    IRBuilder<> Builder(MainBB, MainBB->end());
-  Value *Dividend = SlowDivOrRem->getOperand(0);
-  Value *Divisor = SlowDivOrRem->getOperand(1);
-
-  // We should have bailed out above if the divisor is a constant, but the
-  // dividend may still be a constant.  Set OrV to our non-constant operands
-  // OR'ed together.
-  assert(!isa<ConstantInt>(Divisor));
  
    Value *OrV;
-  if (!isa<ConstantInt>(Dividend))
-    OrV = Builder.CreateOr(Dividend, Divisor);
+  if (Op1 && Op2)
+    OrV = Builder.CreateOr(Op1, Op2);
    else
-    OrV = Divisor;
+    OrV = Op1 ? Op1 : Op2;
  
    // BitMask is inverted to check if the operands are
    // larger than the bypass type
@@ -279,22 +308,72 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
      return None;
    }
  
-  // If the numerator is a constant, bail if it doesn't fit into BypassType.
-  if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
-    if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
-      return None;
-
-  // Split the basic block before the div/rem.
-  BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
-  // Remove the unconditional branch from MainBB to SuccessorBB.
-  MainBB->getInstList().back().eraseFromParent();
-  QuotRemWithBB Fast = createFastBB(SuccessorBB);
-  QuotRemWithBB Slow = createSlowBB(SuccessorBB);
-  QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
-  Value *CmpV = insertOperandRuntimeCheck();
-  IRBuilder<> Builder(MainBB, MainBB->end());
-  Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
-  return Result;
+  ValueRange DividendRange = getValueRange(Dividend);
+  if (DividendRange == VALRNG_LONG)
+    return None;
+
+  ValueRange DivisorRange = getValueRange(Divisor);
+  if (DivisorRange == VALRNG_LONG)
+    return None;
+
+  bool DividendShort = (DividendRange == VALRNG_SHORT);
+  bool DivisorShort = (DivisorRange == VALRNG_SHORT);
+
+  if (DividendShort && DivisorShort) {
+    // If both operands are known to be short then just replace the long
+    // division with a short one in-place.
+
+    IRBuilder<> Builder(SlowDivOrRem);
+    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+    return QuotRemPair(ExtDiv, ExtRem);
+  } else if (DividendShort && !isSignedOp()) {
+    // If the division is unsigned and Dividend is known to be short, then
+    // either
+    // 1) Divisor is less than or equal to Dividend, and the result can be
+    //    computed with a short division.
+    // 2) Divisor is greater than Dividend. In this case, no division is needed
+    //    at all: The quotient is 0 and the remainder is equal to Dividend.
+    //
+    // So instead of checking at runtime whether Divisor fits into BypassType,
+    // we emit a runtime check to differentiate between these two cases. This
+    // lets us entirely avoid a long div.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Long;
+    Long.BB = MainBB;
+    Long.Quotient = ConstantInt::get(getSlowType(), 0);
+    Long.Remainder = Dividend;
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+    return Result;
+  } else {
+    // General case. Create both slow and fast div/rem pairs and choose one of
+    // them at runtime.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+                                            DivisorShort ? nullptr : Divisor);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+    return Result;
+  }
  }
  
  /// This optimization identifies DIV/REM instructions in a BB that can be
diff --git a/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll

new file mode 100644 (file)

index 0000000..c820ad7
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; No bypassing should be done in apparently unsuitable cases.
+define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_no_bypassing(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[A_2:%.*]] = sub i64 -1, [[A_1]]
+; CHECK-NEXT:    [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  ; %a.2 is always negative so the division cannot be bypassed.
+  %a.2 = sub i64 -1, %a.1
+  %res = srem i64 %a.2, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; No OR instruction is needed if one of the operands (divisor) is known
+; to fit into 32 bits.
+define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_one_operand(
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK:         [[TMP4:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP10:%.*]]
+; CHECK:         [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]]
+; CHECK-NEXT:    br label [[TMP10]]
+; CHECK:         [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT:    store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %b.1 = zext i32 %b to i64
+  %res = sdiv i64 %a, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; If both operands are known to fit into 32 bits, then replace the division
+; in-place without CFG modification.
+define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_none(
+; CHECK-NEXT:    [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = and i64 %a, 4294967295
+  %b.1 = zext i32 %b to i64
+  %res = udiv i64 %a.1, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; In case of unsigned long division with a short dividend,
+; the long division is not needed any more.
+define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_special_case(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]]
+; CHECK:         [[TMP3:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP9]]
+; CHECK:         [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  %div = udiv i64 %a.1, %b
+  %rem = urem i64 %a.1, %b
+  %res = add i64 %div, %rem
+  store i64 %res, i64* %retptr
+  ret void
+}
author	Nikolai Bozhenov <nikolai.bozhenov@intel.com>
	Thu, 2 Mar 2017 22:12:15 +0000 (22:12 +0000)
committer	Nikolai Bozhenov <nikolai.bozhenov@intel.com>
	Thu, 2 Mar 2017 22:12:15 +0000 (22:12 +0000)
lib/Transforms/Utils/BypassSlowDivision.cpp		patch \| blob \| history
test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll	[new file with mode: 0644]	patch \| blob