LoopDistribute/LAA: Respect convergent

author Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 12 Jun 2019 13:34:19 +0000 (13:34 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 12 Jun 2019 13:34:19 +0000 (13:34 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 12 Jun 2019 13:34:19 +0000 (13:34 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 12 Jun 2019 13:34:19 +0000 (13:34 +0000)
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h

index fa34afaf9589ba73307147eb375b30602c65f260..9e9aaa32c64f2711a412fc63b0f34d752a87809d 100644 (file)
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -522,6 +522,11 @@ public:
    /// no memory dependence cycles.
    bool canVectorizeMemory() const { return CanVecMem; }
  
+  /// Return true if there is a convergent operation in the loop. There may
+  /// still be reported runtime pointer checks that would be required, but it is
+  /// not legal to insert them.
+  bool hasConvergentOp() const { return HasConvergentOp; }
+
    const RuntimePointerChecking *getRuntimePointerChecking() const {
      return PtrRtChecking.get();
    }
@@ -642,6 +647,7 @@ private:
  
    /// Cache the result of analyzeLoop.
    bool CanVecMem;
+  bool HasConvergentOp;
  
    /// Indicator that there are non vectorizable stores to a uniform address.
    bool HasDependenceInvolvingLoopInvariantAddress;
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp

index d6fbf6f2827fa8a6eca9ad217d5f2738b0e15a38..36bd9a8b7ea7990dac15ea1006afaa64f96cfaf3 100644 (file)
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1778,6 +1778,11 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
    unsigned NumReads = 0;
    unsigned NumReadWrites = 0;
  
+  bool HasComplexMemInst = false;
+
+  // A runtime check is only legal to insert if there are no convergent calls.
+  HasConvergentOp = false;
+
    PtrRtChecking->Pointers.clear();
    PtrRtChecking->Need = false;
  
@@ -1785,8 +1790,25 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
  
    // For each block.
    for (BasicBlock *BB : TheLoop->blocks()) {
-    // Scan the BB and collect legal loads and stores.
+    // Scan the BB and collect legal loads and stores. Also detect any
+    // convergent instructions.
      for (Instruction &I : *BB) {
+      if (auto *Call = dyn_cast<CallBase>(&I)) {
+        if (Call->isConvergent())
+          HasConvergentOp = true;
+      }
+
+      // With both a non-vectorizable memory instruction and a convergent
+      // operation found in this loop, no reason to continue the search.
+      if (HasComplexMemInst && HasConvergentOp) {
+        CanVecMem = false;
+        return;
+      }
+
+      // Avoid hitting recordAnalysis multiple times.
+      if (HasComplexMemInst)
+        continue;
+
        // If this is a load, save it. If this instruction can read from memory
        // but is not a load, then we quit. Notice that we don't handle function
        // calls that read or write.
@@ -1805,12 +1827,18 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
            continue;
  
          auto *Ld = dyn_cast<LoadInst>(&I);
-        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+        if (!Ld) {
+          recordAnalysis("CantVectorizeInstruction", Ld)
+            << "instruction cannot be vectorized";
+          HasComplexMemInst = true;
+          continue;
+        }
+        if (!Ld->isSimple() && !IsAnnotatedParallel) {
            recordAnalysis("NonSimpleLoad", Ld)
                << "read with atomic ordering or volatile read";
            LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
-          CanVecMem = false;
-          return;
+          HasComplexMemInst = true;
+          continue;
          }
          NumLoads++;
          Loads.push_back(Ld);
@@ -1826,15 +1854,15 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
          if (!St) {
            recordAnalysis("CantVectorizeInstruction", St)
                << "instruction cannot be vectorized";
-          CanVecMem = false;
-          return;
+          HasComplexMemInst = true;
+          continue;
          }
          if (!St->isSimple() && !IsAnnotatedParallel) {
            recordAnalysis("NonSimpleStore", St)
                << "write with atomic ordering or volatile write";
            LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
-          CanVecMem = false;
-          return;
+          HasComplexMemInst = true;
+          continue;
          }
          NumStores++;
          Stores.push_back(St);
@@ -1845,6 +1873,11 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
      } // Next instr.
    } // Next block.
  
+  if (HasComplexMemInst) {
+    CanVecMem = false;
+    return;
+  }
+
    // Now we have two lists that hold the loads and the stores.
    // Next, we find the pointers that they use.
  
@@ -1962,7 +1995,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
    }
  
    LLVM_DEBUG(
-      dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
+    dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");
  
    CanVecMem = true;
    if (Accesses.isDependencyCheckNeeded()) {
@@ -1997,6 +2030,15 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
      }
    }
  
+  if (HasConvergentOp) {
+    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
+      << "cannot add control dependency to convergent operation";
+    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
+                         "would be needed with a convergent operation\n");
+    CanVecMem = false;
+    return;
+  }
+
    if (CanVecMem)
      LLVM_DEBUG(
          dbgs() << "LAA: No unsafe dependent memory operations in loop.  We"
@@ -2285,6 +2327,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
        PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
        DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
        NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
+      HasConvergentOp(false),
        HasDependenceInvolvingLoopInvariantAddress(false) {
    if (canAnalyzeLoop())
      analyzeLoop(AA, LI, TLI, DT);
@@ -2301,6 +2344,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
      OS << "\n";
    }
  
+  if (HasConvergentOp)
+    OS.indent(Depth) << "Has convergent operation in loop\n";
+
    if (Report)
      OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
  
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp

index 4f5344b65b7c6a36c9ddc65bdd0194377d908543..f45e5fd0f50b4fd3b21f074bb86caccf43550c7e 100644 (file)
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -766,8 +766,14 @@ public:
                      "cannot isolate unsafe dependencies");
      }
  
-    // Don't distribute the loop if we need too many SCEV run-time checks.
+    // Don't distribute the loop if we need too many SCEV run-time checks, or
+    // any checks at all if inserting them is illegal.
      const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+    if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
      if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                      ? PragmaDistributeSCEVCheckThreshold
                                      : DistributeSCEVCheckThreshold))
@@ -795,7 +801,14 @@ public:
      auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                    RtPtrChecking);
  
+    if (LAI->hasConvergentOp() && !Checks.empty()) {
+      return fail("RuntimeCheckWithConvergent",
+                  "may not insert runtime check with convergent operation");
+    }
+
      if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+      assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
+
        MDNode *OrigLoopID = L->getLoopID();
  
        LLVM_DEBUG(dbgs() << "\nPointers:\n");
diff --git a/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll

new file mode 100644 (file)

index 0000000..7f42e27
--- /dev/null
+++ b/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck %s
+
+; Analyze this loop:
+;   for (i = 0; i < n; i++)
+;    A[i + 1] = A[i] * B[i] * C[i];
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: for.body:
+; CHECK: Has convergent operation in loop
+; CHECK: Report: cannot add control dependency to convergent operation
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT:   Backward:
+; CHECK-NEXT:     %loadA = load i16, i16* %arrayidxA, align 2 ->
+; CHECK-NEXT:     store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+; CHECK: Run-time memory checks:
+; CHECK-NEXT: 0:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT:   %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT:   %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+; CHECK-NEXT: 1:
+; CHECK-NEXT: Comparing group
+; CHECK-NEXT:   %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+; CHECK-NEXT:   %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+; CHECK-NEXT: Against group
+; CHECK-NEXT:   %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+
+@B = common global i16* null, align 8
+@A = common global i16* null, align 8
+@C = common global i16* null, align 8
+
+define void @f() #1 {
+entry:
+  %a = load i16*, i16** @A, align 8
+  %b = load i16*, i16** @B, align 8
+  %c = load i16*, i16** @C, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
+  %loadA = load i16, i16* %arrayidxA, align 2
+
+  %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
+  %loadB = load i16, i16* %arrayidxB, align 2
+
+  %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
+  %loadC = load i16, i16* %arrayidxC, align 2
+
+  call void @llvm.convergent()
+
+  %mul = mul i16 %loadB, %loadA
+  %mul1 = mul i16 %mul, %loadC
+
+  %add = add nuw nsw i64 %storemerge3, 1
+  %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
+  store i16 %mul1, i16* %arrayidxA_plus_2, align 2
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
diff --git a/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/test/Transforms/LoopDistribute/basic-with-memchecks.ll

index 301103eb5ec0bf543aec7b10bd6929bb523c5590..51a4fe8f60e616a4dcdf7d57537b8e72cfe7ed55 100644 (file)
--- a/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -5,6 +5,9 @@
  ; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
  ; RUN:   FileCheck --check-prefix=VECTORIZE %s
  
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
+; RUN:   -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS
+
  ; The memcheck version of basic.ll.  We should distribute and vectorize the
  ; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
  ;
@@ -173,3 +176,113 @@ for.body:
  for.end:
    ret void
  }
+
+declare i32 @llvm.convergent(i32) #0
+
+; This is the same as f, and would require the same bounds
+; check. However, it is not OK to introduce new control dependencies
+; on the convergent call.
+
+; CHECK-LABEL: @f_with_convergent(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+
+; ANALYSIS: for.body:
+; ANALYSIS: Report: cannot add control dependency to convergent operation
+define void @f_with_convergent() #1 {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure an explicit request for distribution is ignored if it
+; requires possibly illegal checks.
+
+; CHECK-LABEL: @f_with_convergent_forced_distribute(
+; CHECK: call i32 @llvm.convergent
+; CHECK-NOT: call i32 @llvm.convergent
+define void @f_with_convergent_forced_distribute() #1 {
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
diff --git a/test/Transforms/LoopDistribute/basic.ll b/test/Transforms/LoopDistribute/basic.ll

index 97296c2d123ee0dcc6e925a0c03ac78c2c086919..0d7b6f21824cd689d537dd0134b272f69eab7d73 100644 (file)
--- a/test/Transforms/LoopDistribute/basic.ll
+++ b/test/Transforms/LoopDistribute/basic.ll
@@ -18,6 +18,7 @@
  target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-apple-macosx10.10.0"
  
+; CHECK-LABEL: @f(
  define void @f(i32* noalias %a,
                 i32* noalias %b,
                 i32* noalias %c,
@@ -81,3 +82,78 @@ for.body:                                         ; preds = %for.body, %entry
  for.end:                                          ; preds = %for.body
    ret void
  }
+
+declare i32 @llvm.convergent(i32) #0
+
+; It is OK to distribute with a convergent operation, since in each
+; new loop the convergent operation has the same control dependency.
+; CHECK-LABEL: @f_with_convergent(
+define void @f_with_convergent(i32* noalias %a,
+                               i32* noalias %b,
+                               i32* noalias %c,
+                               i32* noalias %d,
+                               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; CHECK: entry.split.ldist1:
+; CHECK:    br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK:    %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK:    br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+
+; CHECK: entry.split:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    %convergentD = call i32 @llvm.convergent(i32 %loadD)
+; CHECK:    %mulC = mul i32 %convergentD, %loadE
+; CHECK: for.end:
+
+
+; ANALYSIS: for.body:
+; ANALYSIS-NEXT: Has convergent operation in loop
+; ANALYSIS-NEXT: Report: cannot add control dependency to convergent operation
+; ANALYSIS: for.body.ldist1:
+; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
+
+; convergent instruction happens to block vectorization
+; VECTORIZE: call i32 @llvm.convergent
+; VECTORIZE: mul i32
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
diff --git a/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll b/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll

new file mode 100644 (file)

index 0000000..1ea5b71
--- /dev/null
+++ b/test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute \
+; RUN:   -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+
+; Derived from crash-in-memcheck-generation.ll
+
+; Make sure the loop is distributed even with a convergent
+; op. LoopAccessAnalysis says that runtime checks are necessary, but
+; none are cross partition, so none are truly needed.
+
+define void @f(i32* %a, i32* %b, i32* noalias %c, i32* noalias %d, i32* noalias %e) #1 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_LDIST1:%.*]]
+; CHECK:       entry.split.ldist1:
+; CHECK-NEXT:    br label [[FOR_BODY_LDIST1:%.*]]
+; CHECK:       for.body.ldist1:
+; CHECK-NEXT:    [[IND_LDIST1:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT_LDIST1]] ], [ [[ADD_LDIST1:%.*]], [[FOR_BODY_LDIST1]] ]
+; CHECK-NEXT:    [[ARRAYIDXA_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT:    [[LOADA_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXA_LDIST1]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[IND_LDIST1]]
+; CHECK-NEXT:    [[LOADB_LDIST1:%.*]] = load i32, i32* [[ARRAYIDXB_LDIST1]], align 4
+; CHECK-NEXT:    [[MULA_LDIST1:%.*]] = mul i32 [[LOADB_LDIST1]], [[LOADA_LDIST1]]
+; CHECK-NEXT:    [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4_LDIST1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD_LDIST1]]
+; CHECK-NEXT:    store i32 [[MULA_LDIST1]], i32* [[ARRAYIDXA_PLUS_4_LDIST1]], align 4
+; CHECK-NEXT:    [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND_LDIST1]], label [[ENTRY_SPLIT:%.*]], label [[FOR_BODY_LDIST1]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, [[ENTRY_SPLIT]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[IND]]
+; CHECK-NEXT:    store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
diff --git a/test/Transforms/LoopDistribute/diagnostics.ll b/test/Transforms/LoopDistribute/diagnostics.ll

index 92b516f8981bd57988dde3c817ad15442cdc119f..d7b2b1088efea65f7bc1fddda86b3a3eed0aaade 100644 (file)
--- a/test/Transforms/LoopDistribute/diagnostics.ll
+++ b/test/Transforms/LoopDistribute/diagnostics.ll
@@ -131,6 +131,50 @@ for.cond.cleanup:
    ret void, !dbg !34
  }
  
+; MISSED_REMARKS: /tmp/t.c:27:5: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ANALYSIS_REMARKS: /tmp/t.c:27:5: loop not distributed: may not insert runtime check with convergent operation
+; ALWAYS: warning: /tmp/t.c:27:5: loop not distributed: failed explicitly specified loop distribution
+define void @convergent(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) #1 !dbg !45 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !46
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !47
+
+ph:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !49
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !49, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !50
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !50, !tbaa !13
+  %add = add i8 %1, %0, !dbg !51
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !57
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !52
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !53, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !54
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !54, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !55
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !55, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !56
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !57
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !58, !tbaa !13
+  call void @llvm.convergent()
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !57
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !57
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !20, !dbg !57
+
+for.cond.cleanup:
+  ret void, !dbg !58
+}
+
+
+declare void @llvm.convergent() #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+
  !llvm.dbg.cu = !{!0}
  !llvm.module.flags = !{!3, !4}
  
@@ -177,3 +221,17 @@ for.cond.cleanup:
  !42 = !DILocation(line: 17, column: 17, scope: !31)
  !43 = !DILocation(line: 17, column: 5, scope: !31)
  !44 = !DILocation(line: 17, column: 10, scope: !31)
+!45 = distinct !DISubprogram(name: "convergent", scope: !1, file: !1, line: 24, type: !8, isLocal: false, isDefinition: true, scopeLine: 24, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!46 = !DILocation(line: 25, column: 20, scope: !45)
+!47 = !DILocation(line: 25, column: 3, scope: !45)
+!48 = !DILocation(line: 29, column: 1, scope: !45)
+!49 = !DILocation(line: 26, column: 16, scope: !45)
+!50 = !DILocation(line: 26, column: 23, scope: !45)
+!51 = !DILocation(line: 26, column: 21, scope: !45)
+!52 = !DILocation(line: 26, column: 5, scope: !45)
+!53 = !DILocation(line: 26, column: 14, scope: !45)
+!54 = !DILocation(line: 27, column: 12, scope: !45)
+!55 = !DILocation(line: 27, column: 19, scope: !45)
+!56 = !DILocation(line: 27, column: 17, scope: !45)
+!57 = !DILocation(line: 27, column: 5, scope: !45)
+!58 = !DILocation(line: 27, column: 10, scope: !45)
diff --git a/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll b/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll

index 2b1fc7c7d14f2210587ee0bdc2d2c3c2a6c06eac..8ad19cb47075b0126f0a9e2e7fdd82ae7122fa5f 100644 (file)
--- a/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
+++ b/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
@@ -7,7 +7,6 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
  ; not based on memory access.
  
  define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) {
-
  ; CHECK-LABEL: @f(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[A2:%.*]] = ptrtoint i32* [[A:%.*]] to i64
@@ -101,6 +100,7 @@ define void @f(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %
  ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
  ; CHECK:       for.end:
  ; CHECK-NEXT:    ret void
+;
  entry:
    br label %for.body
  
@@ -143,3 +143,84 @@ for.body:                                         ; preds = %for.body, %entry
  for.end:                                          ; preds = %for.body
    ret void
  }
+
+; Can't add control dependency with convergent in loop body.
+define void @f_with_convergent(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) #1 {
+; CHECK-LABEL: @f_with_convergent(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IND1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[IND1]], 2
+; CHECK-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64
+; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADA:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADB:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
+; CHECK-NEXT:    [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[INC1]] = add i32 [[IND1]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD]]
+; CHECK-NEXT:    store i32 [[MULA]], i32* [[ARRAYIDXA_PLUS_4]], align 4
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, i32* [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, i32* [[E:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, i32* [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[CONVERGENTD:%.*]] = call i32 @llvm.convergent(i32 [[LOADD]])
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[CONVERGENTD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[MUL_EXT]]
+; CHECK-NEXT:    store i32 [[MULC]], i32* [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %ind1 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+
+  %mul = mul i32 %ind1, 2
+  %mul_ext = zext i32 %mul to i64
+
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %mul_ext
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %mul_ext
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %inc1 = add i32 %ind1, 1
+
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %mul_ext
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %mul_ext
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %convergentD = call i32 @llvm.convergent(i32 %loadD)
+  %mulC = mul i32 %convergentD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %mul_ext
+  store i32 %mulC, i32* %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.convergent(i32) #0
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 12 Jun 2019 13:34:19 +0000 (13:34 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 12 Jun 2019 13:34:19 +0000 (13:34 +0000)
include/llvm/Analysis/LoopAccessAnalysis.h		patch \| blob \| history
lib/Analysis/LoopAccessAnalysis.cpp		patch \| blob \| history
lib/Transforms/Scalar/LoopDistribute.cpp		patch \| blob \| history
test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/LoopDistribute/basic-with-memchecks.ll		patch \| blob \| history
test/Transforms/LoopDistribute/basic.ll		patch \| blob \| history
test/Transforms/LoopDistribute/convergent-no-cross-partition-checks.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/LoopDistribute/diagnostics.ll		patch \| blob \| history
test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll		patch \| blob \| history