From: Alexey Bataev Date: Wed, 1 Mar 2017 10:50:44 +0000 (+0000) Subject: [SLP] Fix for PR32038: extra add of PHI node when it is not required. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=145ace26f579aec16cb58068732c7664695664ee;p=llvm [SLP] Fix for PR32038: extra add of PHI node when it is not required. Summary: If horizontal reduction tree starts from the binary operation that is used in PHI node, but this PHI is not used in horizontal reduction, we may end up with extra addition of this PHI node after vectorization. Here is an example: ``` %phi = phi i32 [ %tmp, %end], ... ... %tmp = add i32 %tmp1, %tmp2 end: ``` after vectorization we always have something like: ``` %phi = phi i32 [ %tmp, %end], ... ... %red = extractelement <8 x 32> %vec.red, 0 %tmp = add i32 %red, %phi end: ``` even if `%phi` is not used in reduction tree. Patch considers these PHI nodes as extra arguments and considers them in the final result iff they really used in reduction. Reviewers: mkuper, hfinkel, mzolotukhin Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D30409 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296606 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9c6cc503ef0..c6d26abc76a 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4250,14 +4250,6 @@ class HorizontalReduction { MapVector ExtraArgs; BinaryOperator *ReductionRoot = nullptr; - // After successfull horizontal reduction vectorization attempt for PHI node - // vectorizer tries to update root binary op by combining vectorized tree and - // the ReductionPHI node. But during vectorization this ReductionPHI can be - // vectorized itself and replaced by the undef value, while the instruction - // itself is marked for deletion. This 'marked for deletion' PHI node then can - // be used in new binary operation, causing "Use still stuck around after Def - // is destroyed" crash upon PHI node deletion. - WeakVH ReductionPHI; /// The opcode of the reduction. Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd; @@ -4318,7 +4310,6 @@ public: ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; ReductionRoot = B; - ReductionPHI = Phi; // We currently only support adds. if ((ReductionOpcode != Instruction::Add && @@ -4406,9 +4397,9 @@ public: Stack.push_back(std::make_pair(I, 0)); continue; } - // NextV is an extra argument for TreeN (its parent operation). - markExtraArg(Stack.back(), NextV); } + // NextV is an extra argument for TreeN (its parent operation). + markExtraArg(Stack.back(), NextV); } return true; } @@ -4497,12 +4488,7 @@ public: } } // Update users. - if (ReductionPHI && !isa(ReductionPHI)) { - assert(ReductionRoot && "Need a reduction operation"); - ReductionRoot->setOperand(0, VectorizedTree); - ReductionRoot->setOperand(1, ReductionPHI); - } else - ReductionRoot->replaceAllUsesWith(VectorizedTree); + ReductionRoot->replaceAllUsesWith(VectorizedTree); } return VectorizedTree != nullptr; } diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll index 4085e099b90..b7fa5452f25 100644 --- a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -9,7 +9,7 @@ target triple = "aarch64--linux-gnu" @a = common global [80 x i8] zeroinitializer, align 16 ; DEFAULT-LABEL: @PR28330( -; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] +; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ] ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> , <8 x i32> ; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> ; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]] @@ -18,10 +18,10 @@ target triple = "aarch64--linux-gnu" ; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> ; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] ; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 -; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17 +; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17 ; ; GATHER-LABEL: @PR28330( -; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ] +; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ] ; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80 ; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80 ; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80 @@ -45,7 +45,7 @@ target triple = "aarch64--linux-gnu" ; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> ; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]] ; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0 -; GATHER: %tmp34 = add i32 %[[R6]], %tmp17 +; GATHER: %bin.extra = add i32 %[[R6]], %tmp17 ; ; MAX-COST-LABEL: @PR28330( ; MAX-COST-NOT: shufflevector @@ -98,7 +98,7 @@ define void @PR32038(i32 %n) { ; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: for.body: -; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> ; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 -5, undef ; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef @@ -114,8 +114,8 @@ define void @PR32038(i32 %n) { ; DEFAULT-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> ; DEFAULT-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; DEFAULT-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP3]], -5 -; DEFAULT-NEXT: [[TMP34]] = add i32 [[BIN_EXTRA]], [[TMP17]] +; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5 +; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef ; DEFAULT-NEXT: br label [[FOR_BODY]] ; ; GATHER-LABEL: @PR32038( @@ -138,7 +138,7 @@ define void @PR32038(i32 %n) { ; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80 ; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP19]] ; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80 @@ -169,8 +169,8 @@ define void @PR32038(i32 %n) { ; GATHER-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> ; GATHER-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; GATHER-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP8]], -5 -; GATHER-NEXT: [[TMP34]] = add i32 [[BIN_EXTRA]], [[TMP17]] +; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5 +; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( diff --git a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll index 1ffbe0d87bf..8370c52e2ea 100644 --- a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -14,7 +14,7 @@ define i32 @test(i32* nocapture readonly %p) { ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* %p, i64 7 ; CHECK-NEXT: br label %for.body ; CHECK: for.body: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %add.7, %for.body ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %bin.extra, %for.body ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* %p to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> , [[TMP1]] @@ -32,10 +32,10 @@ define i32 @test(i32* nocapture readonly %p) { ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]] -; CHECK-NEXT: br i1 true, label %for.end, label %for.body +; CHECK-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP4]], [[SUM]] +; CHECK: br i1 true, label %for.end, label %for.body ; CHECK: for.end: -; CHECK-NEXT: ret i32 [[ADD_7]] +; CHECK-NEXT: ret i32 [[BIN_EXTRA]] ; entry: %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1 diff --git a/test/Transforms/SLPVectorizer/X86/scheduling.ll b/test/Transforms/SLPVectorizer/X86/scheduling.ll index 5377ee82cf9..a01646b9dbe 100644 --- a/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -12,7 +12,7 @@ define i32 @foo(i32* nocapture readonly %diff) #0 { ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; CHECK-NEXT: [[ADD52:%.*]] = add nsw i32 [[TMP15]], +; CHECK: [[ADD52:%.*]] = add i32 [[TMP15]], ; CHECK: ret i32 [[ADD52]] ; entry: