/// no memory dependence cycles.
bool canVectorizeMemory() const { return CanVecMem; }
+ /// Return true if the loop contains a convergent operation. Runtime pointer
+ /// checks may still be reported as required, but it is not legal to insert
+ /// them.
+ bool hasConvergentOp() const { return HasConvergentOp; }
+
const RuntimePointerChecking *getRuntimePointerChecking() const {
return PtrRtChecking.get();
}
/// Cache the result of analyzeLoop.
bool CanVecMem;
+ bool HasConvergentOp;
/// Indicator that there are non-vectorizable stores to a uniform address.
bool HasDependenceInvolvingLoopInvariantAddress;
unsigned NumReads = 0;
unsigned NumReadWrites = 0;
+ bool HasComplexMemInst = false;
+
+ // A runtime check is only legal to insert if there are no convergent calls.
+ HasConvergentOp = false;
+
PtrRtChecking->Pointers.clear();
PtrRtChecking->Need = false;
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
- // Scan the BB and collect legal loads and stores.
+ // Scan the BB and collect legal loads and stores. Also detect any
+ // convergent instructions.
for (Instruction &I : *BB) {
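+ // Convergent operations may not be made control-dependent on any
+ // additional values, so inserting a runtime-check branch around them
+ // would be illegal; remember whether the loop contains one.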
+ if (auto *Call = dyn_cast<CallBase>(&I)) {
+ if (Call->isConvergent())
+ HasConvergentOp = true;
+ }
+
+ // With both a non-vectorizable memory instruction and a convergent
+ // operation found in this loop, there is no reason to continue the search.
+ if (HasComplexMemInst && HasConvergentOp) {
+ CanVecMem = false;
+ return;
+ }
+
+ // Avoid hitting recordAnalysis multiple times.
+ if (HasComplexMemInst)
+ continue;
+
// If this is a load, save it. If this instruction can read from memory
// but is not a load, then we quit. Notice that we don't handle function
// calls that read or write.
continue;
auto *Ld = dyn_cast<LoadInst>(&I);
- if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+ if (!Ld) {
+ recordAnalysis("CantVectorizeInstruction", Ld)
+ << "instruction cannot be vectorized";
+ HasComplexMemInst = true;
+ continue;
+ }
+ if (!Ld->isSimple() && !IsAnnotatedParallel) {
recordAnalysis("NonSimpleLoad", Ld)
<< "read with atomic ordering or volatile read";
LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
- CanVecMem = false;
- return;
+ HasComplexMemInst = true;
+ continue;
}
NumLoads++;
Loads.push_back(Ld);
if (!St) {
recordAnalysis("CantVectorizeInstruction", St)
<< "instruction cannot be vectorized";
- CanVecMem = false;
- return;
+ HasComplexMemInst = true;
+ continue;
}
if (!St->isSimple() && !IsAnnotatedParallel) {
recordAnalysis("NonSimpleStore", St)
<< "write with atomic ordering or volatile write";
LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
- CanVecMem = false;
- return;
+ HasComplexMemInst = true;
+ continue;
}
NumStores++;
Stores.push_back(St);
} // Next instr.
} // Next block.
+ if (HasComplexMemInst) {
+ CanVecMem = false;
+ return;
+ }
+
// Now we have two lists that hold the loads and the stores.
// Next, we find the pointers that they use.
}
LLVM_DEBUG(
- dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+ dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
CanVecMem = true;
if (Accesses.isDependencyCheckNeeded()) {
}
}
+ if (HasConvergentOp) {
+ recordAnalysis("CantInsertRuntimeCheckWithConvergent")
+ << "cannot add control dependency to convergent operation";
+ LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
+ "would be needed with a convergent operation\n");
+ CanVecMem = false;
+ return;
+ }
+
if (CanVecMem)
LLVM_DEBUG(
dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
+ HasConvergentOp(false),
HasDependenceInvolvingLoopInvariantAddress(false) {
if (canAnalyzeLoop())
analyzeLoop(AA, LI, TLI, DT);
OS << "\n";
}
+ if (HasConvergentOp)
+ OS.indent(Depth) << "Has convergent operation in loop\n";
+
if (Report)
OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
"cannot isolate unsafe dependencies");
}
- // Don't distribute the loop if we need too many SCEV run-time checks.
+ // Don't distribute the loop if we need too many SCEV run-time checks, or
+ // any at all if inserting them is illegal.
const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
+ return fail("RuntimeCheckWithConvergent",
+ "may not insert runtime check with convergent operation");
+ }
+
if (Pred.getComplexity() > (IsForced.getValueOr(false)
? PragmaDistributeSCEVCheckThreshold
: DistributeSCEVCheckThreshold))
auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
RtPtrChecking);
+ if (LAI->hasConvergentOp() && !Checks.empty()) {
+ return fail("RuntimeCheckWithConvergent",
+ "may not insert runtime check with convergent operation");
+ }
+
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+ assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
+
MDNode *OrigLoopID = L->getLoopID();
LLVM_DEBUG(dbgs() << "\nPointers:\n");
--- /dev/null
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s
+
+; Analyze this loop:
+; for (i = 0; i < n; i++)
+; A[i + 1] = A[i] * B[i] * C[i];
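+;
+; The loop body also contains a convergent call, so the runtime pointer
+; checks that would be required cannot legally be inserted; the analysis
+; reports this but still lists the checks it would have needed.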
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: for.body:
+; CHECK: Has convergent operation in loop
+; CHECK: Report: cannot add control dependency to convergent operation
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %loadA = load i16, i16* %arrayidxA, align 2 ->
+; CHECK-NEXT: store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() #1 {
+entry:
+ %a = load i16*, i16** @A, align 8
+ %b = load i16*, i16** @B, align 8
+ %c = load i16*, i16** @C, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+ %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+ %loadA = load i16, i16* %arrayidxA, align 2
+
+ %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+ %loadB = load i16, i16* %arrayidxB, align 2
+
+ %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+ %loadC = load i16, i16* %arrayidxC, align 2
+
+ call void @llvm.convergent()
+
+ %mul = mul i16 %loadB, %loadA
+ %mul1 = mul i16 %mul, %loadC
+
+ %add = add nuw nsw i64 %storemerge3, 1
+ %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+ store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
; RUN: -verify-loop-info -verify-dom-info -S < %s | \
; RUN: FileCheck --check-prefix=VECTORIZE %s
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
+; RUN: -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS
+
; The memcheck version of basic.ll. We should distribute and vectorize the
; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
;
for.end:
ret void
}
+
+declare i32 @llvm.convergent(i32) #0
+
+; This is the same as f, and would require the same bounds
+; check. However, it is not OK to introduce new control dependencies
+; on the convergent call.
+
+; CHECK-LABEL: @f_with_convergent(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
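+; The CHECK-NOT verifies that the convergent call is not duplicated, i.e.
+; the loop has not been versioned with runtime checks.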
+
+; ANALYSIS: for.body:
+; ANALYSIS: Report: cannot add control dependency to convergent operation
+define void @f_with_convergent() #1 {
+entry:
+ %a = load i32*, i32** @A, align 8
+ %b = load i32*, i32** @B, align 8
+ %c = load i32*, i32** @C, align 8
+ %d = load i32*, i32** @D, align 8
+ %e = load i32*, i32** @E, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %mulA = mul i32 %loadB, %loadA
+
+ %add = add nuw nsw i64 %ind, 1
+ %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+ store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+ %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+ %loadD = load i32, i32* %arrayidxD, align 4
+
+ %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+ %loadE = load i32, i32* %arrayidxE, align 4
+
+ %convergentD = call i32 @llvm.convergent(i32 %loadD)
+ %mulC = mul i32 %convergentD, %loadE
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+ store i32 %mulC, i32* %arrayidxC, align 4
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Make sure an explicit request for distribution is ignored if it
+; requires possibly illegal checks.
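+; The loop carries llvm.loop.distribute.enable metadata (!0 below), but the
+; convergent call still makes the required runtime checks illegal to insert.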
+
+; CHECK-LABEL: @f_with_convergent_forced_distribute(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+define void @f_with_convergent_forced_distribute() #1 {
+entry:
+ %a = load i32*, i32** @A, align 8
+ %b = load i32*, i32** @B, align 8
+ %c = load i32*, i32** @C, align 8
+ %d = load i32*, i32** @D, align 8
+ %e = load i32*, i32** @E, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %mulA = mul i32 %loadB, %loadA
+
+ %add = add nuw nsw i64 %ind, 1
+ %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+ store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+ %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+ %loadD = load i32, i32* %arrayidxD, align 4
+
+ %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+ %loadE = load i32, i32* %arrayidxE, align 4
+
+ %convergentD = call i32 @llvm.convergent(i32 %loadD)
+ %mulC = mul i32 %convergentD, %loadE
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+ store i32 %mulC, i32* %arrayidxC, align 4
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
+; CHECK-LABEL: @f(
define void @f(i32* noalias %a,
i32* noalias %b,
i32* noalias %c,
for.end: ; preds = %for.body
ret void
}
+
+declare i32 @llvm.convergent(i32) #0
+
+; It is OK to distribute with a convergent operation, since in each
+; new loop the convergent operation has the same control dependency.
+; CHECK-LABEL: @f_with_convergent(
+define void @f_with_convergent(i32* noalias %a,
+ i32* noalias %b,
+ i32* noalias %c,
+ i32* noalias %d,
+ i32* noalias %e) {
+entry:
+ br label %for.body
+
+; Verify the two distributed loops.
+
+; CHECK: entry.split.ldist1:
+; CHECK: br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK: br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+
+; CHECK: entry.split:
+; CHECK: br label %for.body
+; CHECK: for.body:
+; CHECK: %convergentD = call i32 @llvm.convergent(i32 %loadD)
+; CHECK: %mulC = mul i32 %convergentD, %loadE
+; CHECK: for.end:
+
+
+; ANALYSIS: for.body:
+; ANALYSIS-NEXT: Has convergent operation in loop
+; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation
+; ANALYSIS: for.body.ldist1:
+; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
+
+; The convergent instruction happens to block vectorization.
+; VECTORIZE: call i32 @llvm.convergent
+; VECTORIZE: mul i32
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %mulA = mul i32 %loadB, %loadA
+
+ %add = add nuw nsw i64 %ind, 1
+ %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+ store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+ %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+ %loadD = load i32, i32* %arrayidxD, align 4
+
+ %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+ %loadE = load i32, i32* %arrayidxE, align 4
+
+ %convergentD = call i32 @llvm.convergent(i32 %loadD)
+ %mulC = mul i32 %convergentD, %loadE
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+ store i32 %mulC, i32* %arrayidxC, align 4
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute \
+; RUN: -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+
+; Derived from crash-in-memcheck-generation.ll
+
+; Make sure the loop is distributed even with a convergent
+; op. LoopAccessAnalysis says that runtime checks are necessary, but
+; none are cross-partition, so none are truly needed.
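+; Distribution can therefore proceed without versioning, and the convergent
+; call keeps its original control dependence in the second loop.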
+
+define void @f(i32* %a, i32* %b, i32* noalias %c, i32* noalias %d, i32* noalias %e) #1 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_LDIST1:%.*]]
+; CHECK: entry.split.ldist1:
+; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]]
+; CHECK: for.body.ldist1:
+; CHECK-NEXT: [[IND_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[ADD_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT: [[ARRAYIDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT: [[LOADA_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXA_LDIST1]], align 4
+; CHECK-NEXT: [[ARRAYIDXB_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT: [[LOADB_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXB_LDIST1]], align 4
+; CHECK-NEXT: [[MULA_LDIST1:%.*]] = mul i32 [[LOADB_LDIST1]], [[LOADA_LDIST1]]
+; CHECK-NEXT: [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1
+; CHECK-NEXT: [[ARRAYIDXA_PLUS_4_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD_LDIST1]]
+; CHECK-NEXT: store i32 [[MULA_LDIST1]], i32* [[ARRAYIDXA_PLUS_4_LDIST1]], align 4
+; CHECK-NEXT: [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20
+; CHECK-NEXT: br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT: [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[IND]]
+; CHECK-NEXT: [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT: [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT: [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[IND]]
+; CHECK-NEXT: [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT: [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[IND]]
+; CHECK-NEXT: store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %mulA = mul i32 %loadB, %loadA
+
+ %add = add nuw nsw i64 %ind, 1
+ %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+ store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+ %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+ %loadD = load i32, i32* %arrayidxD, align 4
+ %convergentD = call i32 @llvm.convergent(i32 %loadD)
+
+ %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+ %loadE = load i32, i32* %arrayidxE, align 4
+
+ %mulC = mul i32 %convergentD, %loadE
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+ store i32 %mulC, i32* %arrayidxC, align 4
+
+ %exitcond = icmp eq i64 %add, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
ret void, !dbg !34
}
+; MISSED_REMARKS: /tmp/t.c:27:5: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ANALYSIS_REMARKS: /tmp/t.c:27:5: loop not distributed: may not insert runtime check with convergent operation
+; ALWAYS: warning: /tmp/t.c:27:5: loop not distributed: failed explicitly specified loop distribution
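+; The convergent call in the loop body makes the runtime checks required for
+; distribution illegal to insert, so the loop is left undistributed.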
+define void @convergent(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) #1 !dbg !45 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !46
+ br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !47
+
+ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !49
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !49, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !50
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !50, !tbaa !13
+ %add = add i8 %1, %0, !dbg !51
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !57
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !52
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !53, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !54
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !54, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !55
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !55, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !56
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !57
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !58, !tbaa !13
+ call void @llvm.convergent()
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !57
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !57
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !20, !dbg !57
+
+for.cond.cleanup:
+ ret void, !dbg !58
+}
+
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!42 = !DILocation(line: 17, column: 17, scope: !31)
!43 = !DILocation(line: 17, column: 5, scope: !31)
!44 = !DILocation(line: 17, column: 10, scope: !31)
+!45 = distinct !DISubprogram(name: "convergent", scope: !1, file: !1, line: 24, type: !8, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!46 = !DILocation(line: 25, column: 20, scope: !45)
+!47 = !DILocation(line: 25, column: 3, scope: !45)
+!48 = !DILocation(line: 29, column: 1, scope: !45)
+!49 = !DILocation(line: 26, column: 16, scope: !45)
+!50 = !DILocation(line: 26, column: 23, scope: !45)
+!51 = !DILocation(line: 26, column: 21, scope: !45)
+!52 = !DILocation(line: 26, column: 5, scope: !45)
+!53 = !DILocation(line: 26, column: 14, scope: !45)
+!54 = !DILocation(line: 27, column: 12, scope: !45)
+!55 = !DILocation(line: 27, column: 19, scope: !45)
+!56 = !DILocation(line: 27, column: 17, scope: !45)
+!57 = !DILocation(line: 27, column: 5, scope: !45)
+!58 = !DILocation(line: 27, column: 10, scope: !45)
; not based on memory access.
define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) {
-
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
+;
entry:
br label %for.body
for.end: ; preds = %for.body
ret void
}
+
+; Can't add a control dependency because of the convergent call in the loop body.
+define void @f_with_convergent(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) #1 {
+; CHECK-LABEL: @f_with_convergent(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[IND1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[IND1]], 2
+; CHECK-NEXT: [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT: [[LOADA:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
+; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT: [[LOADB:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
+; CHECK-NEXT: [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]]
+; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT: [[INC1]] = add i32 [[IND1]], 1
+; CHECK-NEXT: [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD]]
+; CHECK-NEXT: store i32 [[MULA]], i32* [[ARRAYIDXA_PLUS_4]], align 4
+; CHECK-NEXT: [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT: [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT: [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT: [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT: [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT: [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT: store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ %ind1 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+
+ %mul = mul i32 %ind1, 2
+ %mul_ext = zext i32 %mul to i64
+
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %mul_ext
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %mul_ext
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %mulA = mul i32 %loadB, %loadA
+
+ %add = add nuw nsw i64 %ind, 1
+ %inc1 = add i32 %ind1, 1
+
+ %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+ store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+ %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %mul_ext
+ %loadD = load i32, i32* %arrayidxD, align 4
+
+ %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %mul_ext
+ %loadE = load i32, i32* %arrayidxE, align 4
+
+ %convergentD = call i32 @llvm.convergent(i32 %loadD)
+ %mulC = mul i32 %convergentD, %loadE
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %mul_ext
+ store i32 %mulC, i32* %arrayidxC, align 4
+
+ %exitcond = icmp eq i64 %add, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }