From 8d23745f59b90c6229cb58fb65ef8cf2725826ea Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 1 Mar 2017 12:43:39 +0000 Subject: [PATCH] [SLP] Preserve IR flags when vectorizing horizontal reductions. Summary: The SLP vectorizer should propagate IR-level optimization hints/flags (nsw, nuw, exact, fast-math) when converting scalar horizontal reductions instructions into vectors, just like for other vectorized instructions. It doe not include IR propagation for extra arguments, we need to handle original scalar operations for extra args to propagate correct flags. Reviewers: mkuper, mzolotukhin, hfinkel Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D30418 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296614 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 +++-- .../SLPVectorizer/X86/horizontal.ll | 28 +++++++++---------- .../SLPVectorizer/X86/scheduling.ll | 4 +-- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index de18a654046..b0c70285dd0 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4459,11 +4459,12 @@ public: // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth); + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, ReducedSubTree, "bin.rdx"); + propagateIRFlags(VectorizedTree, ReductionOps); } else VectorizedTree = ReducedSubTree; i += ReduxWidth; @@ -4477,6 +4478,7 @@ public: Builder.SetCurrentDebugLocation(I->getDebugLoc()); VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); + propagateIRFlags(VectorizedTree, ReductionOps); } for (auto &Pair : ExternallyUsedValues) { assert(!Pair.second.empty() && @@ -4527,7 +4529,7 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth) { + unsigned ReduxWidth, ArrayRef RedOps) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); @@ -4554,6 +4556,7 @@ private: TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); } + propagateIRFlags(TmpVec, RedOps); } // The result is in the first element of the vector. diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll index 715da3e3abf..080f850f91c 100644 --- a/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -624,9 +624,9 @@ define void @i32_red_example4(i32* %res) { ; STORE-LABEL: @i32_red_example4( ; STORE: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]] +; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 ; STORE: store i32 [[TMP1]], i32* %res, align 16 ; STORE-NEXT: ret void @@ -647,11 +647,11 @@ define void @i32_red_example8(i32* %res) { ; STORE-LABEL: @i32_red_example8( ; STORE: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]] +; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; STORE-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 ; STORE: store i32 [[TMP1]], i32* %res, align 16 ; STORE-NEXT: ret void @@ -680,13 +680,13 @@ define void @i32_red_example16(i32* %res) { ; STORE-LABEL: @i32_red_example16( ; STORE: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP0]], [[RDX_SHUF]] +; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <16 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> -; STORE-NEXT: [[BIN_RDX4:%.*]] = add <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> -; STORE-NEXT: [[BIN_RDX6:%.*]] = add <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] ; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 ; STORE: store i32 [[TMP1]], i32* %res, align 16 ; STORE-NEXT: ret void @@ -731,15 +731,15 @@ define void @i32_red_example32(i32* %res) { ; STORE-LABEL: @i32_red_example32( ; STORE: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16 ; STORE: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> -; STORE-NEXT: [[BIN_RDX:%.*]] = add <32 x i32> [[TMP0]], [[RDX_SHUF]] +; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <32 x i32> [[TMP0]], [[RDX_SHUF]] ; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> -; STORE-NEXT: [[BIN_RDX2:%.*]] = add <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> -; STORE-NEXT: [[BIN_RDX4:%.*]] = add <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> -; STORE-NEXT: [[BIN_RDX6:%.*]] = add <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] ; STORE-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> -; STORE-NEXT: [[BIN_RDX8:%.*]] = add <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] +; STORE-NEXT: [[BIN_RDX8:%.*]] = add nsw <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] ; STORE-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0 ; STORE: store i32 [[TMP1]], i32* %res, align 16 ; STORE-NEXT: ret void diff --git a/test/Transforms/SLPVectorizer/X86/scheduling.ll b/test/Transforms/SLPVectorizer/X86/scheduling.ll index 8395401c5df..c4f521c8963 100644 --- a/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -8,9 +8,9 @@ define i32 @foo(i32* nocapture readonly %diff) #0 { ; CHECK: [[S1:%.+]] = add nsw <4 x i32> ; CHECK: store <4 x i32> [[S1]], ; CHECK: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[S1]], [[RDX_SHUF]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 ; CHECK: [[ADD52:%.*]] = add nsw i32 [[TMP15]], ; CHECK: ret i32 [[ADD52]] -- 2.50.1