From 106bcf1838fd7a38c3da1ecdcfe1cf4b122ef484 Mon Sep 17 00:00:00 2001
From: Alina Sbirlea
Date: Wed, 23 Nov 2016 17:43:15 +0000
Subject: [PATCH] [LoadStoreVectorizer] Enable vectorization of stores in the
 presence of an aliasing load

Summary:
The "getVectorizablePrefix" method would give up if it found an aliasing load
for a store chain. In practice, the aliasing load can be treated as a memory
barrier, and all stores that precede it form a valid vectorizable prefix.

Issue found by volkan in D26962. The test case is a pruned version of the one
in the original patch.

Reviewers: jlebar, arsenm, tstellarAMD

Subscribers: mzolotukhin, wdng, nhaehnle, anna, volkan, llvm-commits

Differential Revision: https://reviews.llvm.org/D27008

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287781 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Vectorize/LoadStoreVectorizer.cpp         | 28 ++++++++-
 .../AMDGPU/insertion-point.ll                 |  8 +--
 .../AMDGPU/store_with_aliasing_load.ll        | 58 +++++++++++++++++++
 3 files changed, 86 insertions(+), 8 deletions(-)
 create mode 100644 test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll

diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index ead17d01a96..c44a393cf84 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -477,10 +477,23 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
 
   // Loop until we find an instruction in ChainInstrs that we can't vectorize.
   unsigned ChainInstrIdx = 0;
+  Instruction *BarrierMemoryInstr = nullptr;
+
   for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
     Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-    bool AliasFound = false;
+
+    // If a barrier memory instruction was found, chain instructions that follow
+    // will not be added to the valid prefix.
+    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+      break;
+
+    // Check (in BB order) if any instruction prevents ChainInstr from being
+    // vectorized. Find and store the first such "conflicting" instruction.
     for (Instruction *MemInstr : MemoryInstrs) {
+      // If a barrier memory instruction was found, do not check past it.
+      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+        break;
+
       if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
         continue;
 
@@ -508,12 +521,21 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
                  << "  " << *ChainInstr << '\n'
                  << "  " << *getPointerOperand(ChainInstr) << '\n';
         });
-        AliasFound = true;
+        // Save this aliasing memory instruction as a barrier, but allow other
+        // instructions that precede the barrier to be vectorized with this one.
+        BarrierMemoryInstr = MemInstr;
         break;
       }
     }
-    if (AliasFound)
+    // Continue the search only for store chains, since vectorizing stores that
+    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+    // up to an aliasing store, but should not pull loads from further down in
+    // the basic block.
+    if (IsLoadChain && BarrierMemoryInstr) {
+      // The BarrierMemoryInstr is a store that precedes ChainInstr.
+      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
       break;
+    }
   }
 
   // Find the largest prefix of Chain whose elements are all in
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
index 7078a9b5c49..2b2f9cbcf50 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -61,13 +61,11 @@ entry:
 }
 
 ; Here we have four stores, with an aliasing load before the last one. We can
-; vectorize the first two stores as <2 x float>, but this vectorized store must
-; be inserted at the location of the second scalar store, not the fourth one.
+; vectorize the first three stores as <3 x float>, but this vectorized store must
+; be inserted at the location of the third scalar store, not the fourth one.
 ;
 ; CHECK-LABEL: @insert_store_point_alias
-; CHECK: store <2 x float>
-; CHECK: store float
-; CHECK-SAME: %a.idx.2
+; CHECK: store <3 x float>
 ; CHECK: load float, float addrspace(1)* %a.idx.2
 ; CHECK: store float
 ; CHECK-SAME: %a.idx.3
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
new file mode 100644
index 00000000000..d70c449e14d
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
@@ -0,0 +1,58 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+; Check that, in the presence of an aliasing load, the stores preceding the
+; aliasing load are safe to vectorize.
+
+; CHECK-LABEL: store_vectorize_with_alias
+; CHECK: store <4 x float>
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+
+; Function Attrs: nounwind
+define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
+bb:
+  %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
+  %tmp1 = load float, float addrspace(1)* %tmp, align 4
+
+  %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp2, align 4
+  %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4
+  %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp4, align 4
+  %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8
+  %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp6, align 4
+  %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12
+  %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp8, align 4
+
+  %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16
+  %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)*
+  %tmp11 = load float, float addrspace(1)* %tmp10, align 4
+  %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20
+  %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)*
+  %tmp14 = load float, float addrspace(1)* %tmp13, align 4
+  %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24
+  %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
+  %tmp17 = load float, float addrspace(1)* %tmp16, align 4
+  %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28
+  %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)*
+  %tmp20 = load float, float addrspace(1)* %tmp19, align 4
+
+  %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16
+  %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)*
+  store float %tmp11, float addrspace(1)* %tmp22, align 4
+  %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20
+  %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)*
+  store float %tmp14, float addrspace(1)* %tmp24, align 4
+  %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24
+  %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)*
+  store float %tmp17, float addrspace(1)* %tmp26, align 4
+  %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28
+  %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)*
+  store float %tmp20, float addrspace(1)* %tmp28, align 4
+
+  ret void
+}
+
+attributes #0 = { argmemonly nounwind }
-- 
2.50.1
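
The barrier rule described in the Summary can be sketched as a small standalone
C++17 program. The MemOp struct, its BBPos/InChain/MayAlias fields, and this
simplified getVectorizablePrefix are hypothetical stand-ins, not LLVM's
Instruction, OrderedBasicBlock, or AliasAnalysis API; the sketch only models
the prefix computation, assuming chain order matches program order. While
scanning a store chain, the first aliasing load is recorded as a barrier
instead of aborting the scan, so chain stores that precede it stay in the
prefix; a load chain still gives up at the first aliasing store.

    // Simplified model of the "aliasing load as barrier" rule (not LLVM code).
    #include <cstdio>
    #include <optional>
    #include <vector>

    struct MemOp {
      unsigned BBPos;   // position of this instruction within the basic block
      bool IsLoad;      // load (true) or store (false)
      bool InChain;     // member of the chain being vectorized?
      bool MayAlias;    // would alias analysis report a conflict with the chain?
    };

    // Returns the number of leading chain elements that remain vectorizable.
    static unsigned getVectorizablePrefix(const std::vector<MemOp> &BB,
                                          bool IsLoadChain) {
      std::optional<unsigned> BarrierPos; // BB position of the first conflict
      unsigned Prefix = 0;
      for (const MemOp &Op : BB) {
        if (Op.InChain) {
          // Chain members that follow the barrier are not added to the prefix.
          if (BarrierPos && *BarrierPos < Op.BBPos)
            break;
          ++Prefix;
          continue;
        }
        if (Op.MayAlias && !BarrierPos) {
          if (IsLoadChain)
            break;               // a load chain gives up at an aliasing store
          BarrierPos = Op.BBPos; // a store chain records the aliasing load as a
                                 // barrier and keeps the stores that precede it
        }
      }
      return Prefix;
    }

    int main() {
      // Mirrors the updated insertion-point.ll expectation: three chain stores,
      // an aliasing load, then a fourth chain store.
      std::vector<MemOp> BB = {{0, false, true, false},
                               {1, false, true, false},
                               {2, false, true, false},
                               {3, true, false, true},
                               {4, false, true, false}};
      std::printf("vectorizable store-chain prefix: %u\n",
                  getVectorizablePrefix(BB, /*IsLoadChain=*/false));
      return 0;
    }

Run on that layout it reports a prefix of 3, matching the new <3 x float>
check, whereas the pre-patch behavior (treating any conflict as a hard stop)
would have yielded 2.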