From: Matt Arsenault
Date: Thu, 23 Feb 2017 03:58:53 +0000 (+0000)
Subject: LoadStoreVectorizer: Split even sized illegal chains properly
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=210095c5c9a679a0e083f6fca8a9853dc02b16bc;p=llvm

LoadStoreVectorizer: Split even sized illegal chains properly

Implement isLegalToVectorizeLoadChain for AMDGPU to avoid producing
private address space accesses that will need to be split up later.
The splitting was doing the wrong thing when the queried chain had an
even number of elements. A possible <4 x i32> store was being split
into

    store <2 x i32>
    store i32
    store i32

rather than

    store <2 x i32>
    store <2 x i32>

when the latter is legal.
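The corrected split computation can be modeled in isolation. The
following standalone sketch mirrors the new splitOddVectorElts logic
from the diff below; pickSplitPoint is a hypothetical name used only
for this illustration and is not part of the patch:

    #include <cassert>

    // Given a chain of ChainSize elements of ElementSizeBytes each,
    // return how many elements stay in the first piece of the split.
    static unsigned pickSplitPoint(unsigned ChainSize,
                                   unsigned ElementSizeBytes) {
      unsigned SizeBytes = ElementSizeBytes * ChainSize;
      // Largest prefix whose size in bytes is a multiple of 4.
      unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
      if (NumLeft == ChainSize) {
        if ((NumLeft & 1) == 0)
          NumLeft /= 2; // Even number of elements: split evenly in half.
        else
          --NumLeft;    // Odd number of elements: peel off the last one.
      } else if (NumLeft == 0)
        NumLeft = 1;
      return NumLeft;
    }

    int main() {
      // A rejected 4 x i32 chain now splits 2 + 2 (two <2 x i32> stores)
      // instead of 3 + 1, which re-split into 2 + 1 + 1.
      assert(pickSplitPoint(4, 4) == 2);
      // An odd-length chain still peels the last element: 3 x i32 -> 2 + 1.
      assert(pickSplitPoint(3, 4) == 2);
      return 0;
    }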
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295933 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6d2e2f0bbbb..a780a76c86a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -129,6 +129,31 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
   }
 }
 
+bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+                                               unsigned Alignment,
+                                               unsigned AddrSpace) const {
+  // We allow vectorization of flat stores, even though we may need to decompose
+  // them later if they may access private memory. We don't have enough context
+  // here, and legalization can handle it.
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
+      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
+  }
+  return true;
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                                unsigned Alignment,
+                                                unsigned AddrSpace) const {
+  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                                 unsigned Alignment,
+                                                 unsigned AddrSpace) const {
+  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
 unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // Semi-arbitrary large amount.
   return 64;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3f72a63679f..c64c4bf5f6a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -78,6 +78,17 @@ public:
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+
+  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+                                  unsigned Alignment,
+                                  unsigned AddrSpace) const;
+  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                   unsigned Alignment,
+                                   unsigned AddrSpace) const;
+  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                    unsigned Alignment,
+                                    unsigned AddrSpace) const;
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getArithmeticInstrCost(
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 20fbcf4533d..4409d7a404f 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -432,9 +432,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
   unsigned ElementSizeBytes = ElementSizeBits / 8;
   unsigned SizeBytes = ElementSizeBytes * Chain.size();
   unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
-  if (NumLeft == Chain.size())
-    --NumLeft;
-  else if (NumLeft == 0)
+  if (NumLeft == Chain.size()) {
+    if ((NumLeft & 1) == 0)
+      NumLeft /= 2; // Split even in half
+    else
+      --NumLeft;    // Split off last element
+  } else if (NumLeft == 0)
     NumLeft = 1;
   return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
 }
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index 4369dafa425..d8f72a8e1df 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -64,7 +64,10 @@ define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32
 ; ALL: alloca [128 x i32], align 16
 
 ; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
-; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
+
+; FIXME: Should change alignment
+; ALIGNED: load i32
+; ALIGNED: load i32
 define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
index fd0aaa615db..c85be874376 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -1,8 +1,9 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
@@ -28,6 +29,60 @@ define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32 9, i32* %out, align 1
+; ALIGNED: store i32 1, i32* %out.gep.1, align 1
+; ALIGNED: store i32 23, i32* %out.gep.2, align 1
+; ALIGNED: store i32 19, i32* %out.gep.3, align 1
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1
+
+; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1
+; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32* %out, i32 3
+
+  store i32 9, i32* %out, align 1
+  store i32 1, i32* %out.gep.1, align 1
+  store i32 23, i32* %out.gep.2, align 1
+  store i32 19, i32* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
+; ALIGNED: store i32 9, i32* %out, align 2
+; ALIGNED: store i32 1, i32* %out.gep.1, align 2
+; ALIGNED: store i32 23, i32* %out.gep.2, align 2
+; ALIGNED: store i32 19, i32* %out.gep.3, align 2
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32* %out, i32 3
+
+  store i32 9, i32* %out, align 2
+  store i32 1, i32* %out.gep.1, align 2
+  store i32 23, i32* %out.gep.2, align 2
+  store i32 19, i32* %out.gep.3, align 2
+  ret void
+}
+
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; ALL: store <4 x i8>
 define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
@@ -42,6 +97,25 @@ define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
+define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8* %out, i32 1
+  %out.gep.2 = getelementptr i8, i8* %out, i32 2
+  %out.gep.3 = getelementptr i8, i8* %out, i32 3
+
+  store i8 9, i8* %out, align 1
+  store i8 1, i8* %out.gep.1, align 1
+  store i8 23, i8* %out.gep.2, align 1
+  store i8 19, i8* %out.gep.3, align 1
+  ret void
+}
+
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
 ; ALL: store <2 x i16>
 define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
@@ -52,4 +126,106 @@ define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
+define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 2
+  store i16 12, i16* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
+define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 1
+  store i16 12, i16* %out.gep.1, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
+; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
+define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 8
+  store i16 12, i16* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
+; ELT4: store i32
+; ELT4: store i32
+; ELT4: store i32
+
+; ELT8-ALIGNED: store i32
+; ELT8-ALIGNED: store i32
+; ELT8-ALIGNED: store i32
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store i32
+
+; ELT16-ALIGNED: store i32
+; ELT16-ALIGNED: store i32
+; ELT16-ALIGNED: store i32
+
+; ELT16-UNALIGNED: store <3 x i32>
+define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+
+  store i32 9, i32* %out
+  store i32 1, i32* %out.gep.1
+  store i32 23, i32* %out.gep.2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store i32
+
+; ELT16-UNALIGNED: store <3 x i32>
+define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+
+  store i32 9, i32* %out, align 1
+  store i32 1, i32* %out.gep.1, align 1
+  store i32 23, i32* %out.gep.2, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <3 x i8>
+define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8* %out, i8 2
+
+  store i8 9, i8* %out, align 1
+  store i8 1, i8* %out.gep.1, align 1
+  store i8 23, i8* %out.gep.2, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }