bool tryToSinkFreeOperands(Instruction *I);
bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
Intrinsic::ID IID);
- bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
- bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
- bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+ bool optimizeCmp(CmpInst *Cmp);
+ bool combineToUSubWithOverflow(CmpInst *Cmp);
+ bool combineToUAddWithOverflow(CmpInst *Cmp);
};
} // end anonymous namespace
if (!LargeOffsetGEPMap.empty())
MadeChange |= splitLargeGEPOffsets();
- // Really free removed instructions during promotion.
- for (Instruction *I : RemovedInsts)
+ // Really free instructions removed during promotion or kept around to
+ // improve efficiency (see comments in overflow intrinsic transforms).
+ for (Instruction *I : RemovedInsts) {
+ if (I->getParent())
+ I->removeFromParent();
I->deleteValue();
+ }
EverMadeChange |= MadeChange;
SeenChainsForSExt.clear();
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
CmpInst *Cmp,
Intrinsic::ID IID) {
+ if (BO->getParent() != Cmp->getParent()) {
+ // We used to use a dominator tree here to allow multi-block optimization.
+ // But that was problematic because:
+ // 1. It could cause a perf regression by hoisting the math op into the
+ // critical path.
+ // 2. It could cause a perf regression by creating a value that was live
+ // across multiple blocks and increasing register pressure.
+ // 3. Use of a dominator tree could cause a large compile-time regression.
+ // This is because we recompute the DT on every change in the main CGP
+ // run-loop. The recomputing is probably unnecessary in many cases, so if
+ // that were fixed, using a DT here would be ok.
+ return false;
+ }
+
// We allow matching the canonical IR (add X, C) back to (usubo X, -C).
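// For example (operand values here are only illustrative), IR canonicalized as
//   %math = add i8 %x, -42
//   %ov = icmp ult i8 %x, 42
// can be rebuilt as usub.with.overflow(i8 %x, i8 42).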
Value *Arg0 = BO->getOperand(0);
Value *Arg1 = BO->getOperand(1);
Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
}
- Instruction *InsertPt;
- if (BO->hasOneUse() && BO->user_back() == Cmp) {
- // If the math is only used by the compare, insert at the compare to keep
- // the condition in the same block as its users. (CGP aggressively sinks
- // compares to help out SDAG.)
- InsertPt = Cmp;
- } else {
- // The math and compare may be independent instructions. Check dominance to
- // determine the insertion point for the intrinsic.
- bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
- if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
- return false;
-
- BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
- if (MathBB != CmpBB) {
- // Avoid hoisting an extra op into a dominating block and creating a
- // potentially longer critical path.
- if (!MathDominates)
- return false;
- // Check that the insertion doesn't create a value that is live across
- // more than two blocks, so to minimise the increase in register pressure.
- BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
- BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
- auto Successors = successors(Dominator);
- if (llvm::find(Successors, Dominated) == Successors.end())
- return false;
+ // Insert at the first instruction of the pair.
+ Instruction *InsertPt = nullptr;
+ for (Instruction &Iter : *Cmp->getParent()) {
+ if (&Iter == BO || &Iter == Cmp) {
+ InsertPt = &Iter;
+ break;
}
-
- InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
}
+ assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
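// Build the overflow intrinsic in place of the math/compare pair and split its
// aggregate result into the math value and the overflow bit.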
IRBuilder<> Builder(InsertPt);
Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
+
+ // Delay the actual removal/deletion of the binop and compare for efficiency.
+ // If we delete those now, we would invalidate the instruction iterator and
+ // trigger dominator tree updates.
BO->replaceAllUsesWith(Math);
Cmp->replaceAllUsesWith(OV);
- BO->eraseFromParent();
- Cmp->eraseFromParent();
+ RemovedInsts.insert(BO);
+ RemovedInsts.insert(Cmp);
return true;
}
/// Try to combine the compare into a call to the llvm.uadd.with.overflow
/// intrinsic. Return true if any changes were made.
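/// For example (operand names are illustrative), a pattern such as
///   %add = add i64 %x, %y
///   %ov = icmp ult i64 %add, %x
/// can be rewritten so that both values are extracted from
/// uadd.with.overflow(%x, %y).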
-bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
- bool &ModifiedDT) {
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp) {
Value *A, *B;
BinaryOperator *Add;
if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
return false;
- // Reset callers - do not crash by iterating over a dead instruction.
- ModifiedDT = true;
return true;
}
-bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
- bool &ModifiedDT) {
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp) {
// We are not expecting non-canonical/degenerate code. Just bail out.
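// (A degenerate case would be a compare of two constants, e.g.
// (icmp ult i64 1, 2), which earlier passes are expected to have folded.)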
Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
if (isa<Constant>(A) && isa<Constant>(B))
if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
return false;
- // Reset callers - do not crash by iterating over a dead instruction.
- ModifiedDT = true;
return true;
}
return MadeChange;
}
-bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
- if (combineToUAddWithOverflow(Cmp, ModifiedDT))
+ if (combineToUAddWithOverflow(Cmp))
return true;
- if (combineToUSubWithOverflow(Cmp, ModifiedDT))
+ if (combineToUSubWithOverflow(Cmp))
return true;
return false;
}
if (auto *Cmp = dyn_cast<CmpInst>(I))
- if (TLI && optimizeCmp(Cmp, ModifiedDT))
+ if (TLI && optimizeCmp(Cmp))
return true;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
ret i64 %Q
}
+; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+
define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
; CHECK-LABEL: @uaddo4(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
; CHECK: next:
-; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
+; CHECK-NEXT: [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
; CHECK-NEXT: ret i64 [[Q]]
; CHECK: exit:
; CHECK-NEXT: ret i64 0
ret i1 %ov
}
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
declare void @call(i1)
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
; CHECK: t:
-; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
-; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT: [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT: store i64 [[MATH]], i64* [[P:%.*]]
+; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: store i64 [[S]], i64* [[P:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[F]]
; CHECK: f:
; CHECK-NEXT: ret i1 [[COND]]
; CHECK: end:
-; CHECK-NEXT: ret i1 [[OV1]]
+; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[OV]]
;
entry:
br i1 %cond, label %t, label %f