[LV] Process pointer IVs with PHINodes in collectLoopUniforms

author Matthew Simpson <mssimpso@codeaurora.org>

Wed, 14 Sep 2016 14:47:40 +0000 (14:47 +0000)

committer Matthew Simpson <mssimpso@codeaurora.org>

Wed, 14 Sep 2016 14:47:40 +0000 (14:47 +0000)
author Matthew Simpson <mssimpso@codeaurora.org>
Wed, 14 Sep 2016 14:47:40 +0000 (14:47 +0000)
committer Matthew Simpson <mssimpso@codeaurora.org>
Wed, 14 Sep 2016 14:47:40 +0000 (14:47 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index a08c8cc145afe13ac7ce9e3b734abc6ac16ca4a7..0b368d1df1dc277bbf2b15599821a9d990ed7e5b 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5393,9 +5393,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
        if (!Ptr)
          continue;
  
+      // True if all users of Ptr are memory accesses that have Ptr as their
+      // pointer operand.
+      auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
+        return getPointerOperand(U) == Ptr;
+      });
+
        // Ensure the memory instruction will not be scalarized, making its
-      // pointer operand non-uniform.
-      if (memoryInstructionMustBeScalarized(&I))
+      // pointer operand non-uniform. If the pointer operand is used by some
+      // instruction other than a memory access, we're not going to check if
+      // that other instruction may be scalarized here. Thus, conservatively
+      // assume the pointer operand may be non-uniform.
+      if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
          PossibleNonUniformPtrs.insert(Ptr);
  
        // If the memory instruction will be vectorized and its pointer operand
@@ -5433,11 +5442,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
      }
    }
  
+  // Returns true if Ptr is the pointer operand of a memory access instruction
+  // I, and I is known to not require scalarization.
+  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
+    return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+  };
+
    // For an instruction to be added into Worklist above, all its users inside
    // the loop should also be in Worklist. However, this condition cannot be
    // true for phi nodes that form a cyclic dependence. We must process phi
    // nodes separately. An induction variable will remain uniform if all users
    // of the induction variable and induction variable update remain uniform.
+  // The code below handles both pointer and non-pointer induction variables.
    for (auto &Induction : Inductions) {
      auto *Ind = Induction.first;
      auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -5446,7 +5462,8 @@ void LoopVectorizationLegality::collectLoopUniforms() {
      // vectorization.
      auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
        auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+             isVectorizedMemAccessUse(I, Ind);
      });
      if (!UniformInd)
        continue;
@@ -5455,7 +5472,8 @@ void LoopVectorizationLegality::collectLoopUniforms() {
      // uniform after vectorization.
      auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
        auto *I = cast<Instruction>(U);
-      return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+             isVectorizedMemAccessUse(I, IndUpdate);
      });
      if (!UniformIndUpdate)
        continue;
diff --git a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

index f1fc1aa683639a9ce7223df84c6391386b17679e..a462d35ba01659789ce9d193c489f413e6d438e6 100644 (file)
--- a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@@ -269,3 +269,172 @@ for.body:
  for.end:
    ret void
  }
+
+; CHECK-LABEL: pointer_iv_uniform
+;
+; Check that a pointer induction variable is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK:     LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NOT:   getelementptr
+; CHECK:       %next.gep = getelementptr i32, i32* %a, i64 %index
+; CHECK-NOT:   getelementptr
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @pointer_iv_uniform(i32* %a, i32 %x, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+  store i32 %x, i32* %p, align 8
+  %tmp03 = getelementptr inbounds i32, i32* %p, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; INTER-LABEL: pointer_iv_non_uniform_0
+;
+; Check that a pointer induction variable with a non-uniform user is not
+; recognized as uniform and is not uniform after vectorization. The pointer
+; induction variable is used by getelementptr instructions that are non-uniform
+; due to scalarization of the stores.
+;
+; INTER-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; INTER:     vector.body
+; INTER:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER:       %[[I0:.+]] = shl i64 %index, 2
+; INTER:       %next.gep = getelementptr i32, i32* %a, i64 %[[I0]]
+; INTER:       %[[S1:.+]] = shl i64 %index, 2
+; INTER:       %[[I1:.+]] = or i64 %[[S1]], 4
+; INTER:       %next.gep2 = getelementptr i32, i32* %a, i64 %[[I1]]
+; INTER:       %[[S2:.+]] = shl i64 %index, 2
+; INTER:       %[[I2:.+]] = or i64 %[[S2]], 8
+; INTER:       %next.gep3 = getelementptr i32, i32* %a, i64 %[[I2]]
+; INTER:       %[[S3:.+]] = shl i64 %index, 2
+; INTER:       %[[I3:.+]] = or i64 %[[S3]], 12
+; INTER:       %next.gep4 = getelementptr i32, i32* %a, i64 %[[I3]]
+; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @pointer_iv_non_uniform_0(i32* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+  %tmp00 = load i32, i32* %p, align 8
+  %tmp01 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp02 = load i32, i32* %tmp01, align 8
+  %tmp03 = getelementptr inbounds i32, i32* %p, i32 4
+  %tmp04 = load i32, i32* %tmp03, align 8
+  %tmp05 = getelementptr inbounds i32, i32* %p, i32 5
+  %tmp06 = load i32, i32* %tmp05, align 8
+  %tmp07 = sub i32 %tmp04, %tmp00
+  %tmp08 = sub i32 %tmp02, %tmp02
+  %tmp09 = getelementptr inbounds i32, i32* %p, i32 2
+  store i32 %tmp07, i32* %tmp09, align 8
+  %tmp10 = getelementptr inbounds i32, i32* %p, i32 3
+  store i32 %tmp08, i32* %tmp10, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: pointer_iv_non_uniform_1
+;
+; Check that a pointer induction variable with a non-uniform user is not
+; recognized as uniform and is not uniform after vectorization. The pointer
+; induction variable is used by a store that will be scalarized.
+;
+; CHECK-NOT: LV: Found uniform instruction: %p = phi x86_fp80* [ %tmp1, %for.body ], [ %a, %entry ]
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %next.gep = getelementptr x86_fp80, x86_fp80* %a, i64 %index
+; CHECK:       %[[I1:.+]] = or i64 %index, 1
+; CHECK:       %next.gep2 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I1]]
+; CHECK:       %[[I2:.+]] = or i64 %index, 2
+; CHECK:       %next.gep3 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I2]]
+; CHECK:       %[[I3:.+]] = or i64 %index, 3
+; CHECK:       %next.gep4 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I3]]
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @pointer_iv_non_uniform_1(x86_fp80* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry]
+  %tmp0 = sitofp i32 1 to x86_fp80
+  store x86_fp80 %tmp0, x86_fp80* %p, align 16
+  %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %p, i32 1
+  %i.next = add i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: pointer_iv_mixed
+;
+; Check multiple pointer induction variables where only one is recognized as
+; uniform and remains uniform after vectorization. The other pointer induction
+; variable is not recognized as uniform and is not uniform after vectorization
+; because it is stored to memory.
+;
+; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+; CHECK:     LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+; CHECK:     vector.body
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %next.gep = getelementptr i32, i32* %a, i64 %index
+; CHECK:       %[[I1:.+]] = or i64 %index, 1
+; CHECK:       %next.gep10 = getelementptr i32, i32* %a, i64 %[[I1]]
+; CHECK:       %[[I2:.+]] = or i64 %index, 2
+; CHECK:       %next.gep11 = getelementptr i32, i32* %a, i64 %[[I2]]
+; CHECK:       %[[I3:.+]] = or i64 %index, 3
+; CHECK:       %next.gep12 = getelementptr i32, i32* %a, i64 %[[I3]]
+; CHECK:       %[[V0:.+]] = insertelement <4 x i32*> undef, i32* %next.gep, i32 0
+; CHECK:       %[[V1:.+]] = insertelement <4 x i32*> %[[V0]], i32* %next.gep10, i32 1
+; CHECK:       %[[V2:.+]] = insertelement <4 x i32*> %[[V1]], i32* %next.gep11, i32 2
+; CHECK:       %[[V3:.+]] = insertelement <4 x i32*> %[[V2]], i32* %next.gep12, i32 3
+; CHECK-NOT:   getelementptr
+; CHECK:       %next.gep13 = getelementptr i32*, i32** %b, i64 %index
+; CHECK-NOT:   getelementptr
+; CHECK:       %[[B0:.+]] = bitcast i32** %next.gep13 to <4 x i32*>*
+; CHECK:       store <4 x i32*> %[[V3]], <4 x i32*>* %[[B0]], align 8
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @pointer_iv_mixed(i32* %a, i32** %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+  %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+  %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp1 = load i32, i32* %p, align 8
+  %tmp2 = add i32 %tmp1, %tmp0
+  store i32* %p, i32** %q, align 8
+  %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp5 = phi i32 [ %tmp2, %for.body ]
+  ret i32 %tmp5
+}
author	Matthew Simpson <mssimpso@codeaurora.org>
	Wed, 14 Sep 2016 14:47:40 +0000 (14:47 +0000)
committer	Matthew Simpson <mssimpso@codeaurora.org>
	Wed, 14 Sep 2016 14:47:40 +0000 (14:47 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll		patch \| blob \| history