From cf10da4c220d7dc4ec9c0abb7814cd3fbe78b66f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 10 Jul 2019 10:46:36 +0000 Subject: [PATCH] [X86][SSE] EltsFromConsecutiveLoads - add basic dereferenceable support This patch checks to see if the vector element loads are based off a dereferenceable pointer that covers the entire vector width, in which case we don't need to have element loads at both extremes of the vector width - just the start (base pointer) of it. Another step towards partial vector loads...... Differential Revision: https://reviews.llvm.org/D64205 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365614 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++---- test/CodeGen/X86/load-partial.ll | 57 ++++++------------------------ 2 files changed, 26 insertions(+), 53 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2812b088b61..6fee0bb8764 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7595,11 +7595,19 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, return NewLd; }; - // LOAD - all consecutive load/undefs (must start/end with a load). - // If we have found an entire vector of loads and undefs, then return a large - // load of the entire vector width starting at the base pointer. - // If the vector contains zeros, then attempt to shuffle those elements. - if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && + // Check if the base load is entirely dereferenceable. + bool IsDereferenceable = + LDBase && + LDBase->getPointerInfo().isDereferenceable( + VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); + + // LOAD - all consecutive load/undefs (must start/end with a load or be + // entirely dereferenceable). If we have found an entire vector of loads and + // undefs, then return a large load of the entire vector width starting at the + // base pointer. If the vector contains zeros, then attempt to shuffle those + // elements. + if (FirstLoadedElt == 0 && + (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { assert(LDBase && "Did not find base load for merging consecutive loads"); EVT EltVT = LDBase->getValueType(0); @@ -7620,12 +7628,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, if (NumElems == 1) return DAG.getBitcast(VT, Elts[FirstLoadedElt]); - if (IsConsecutiveLoad) + if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. - if (!isAfterLegalize && VT.isVector() && NumElems == VT.getVectorNumElements()) { + if (!isAfterLegalize && VT.isVector()) { SmallVector ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) diff --git a/test/CodeGen/X86/load-partial.ll b/test/CodeGen/X86/load-partial.ll index 6d8d6632bd9..a0a94b96f7d 100644 --- a/test/CodeGen/X86/load-partial.ll +++ b/test/CodeGen/X86/load-partial.ll @@ -10,30 +10,14 @@ ; define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE2-LABEL: load_float4_float3: -; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_float4_float3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_float4_float3: -; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: retq +; SSE-LABEL: load_float4_float3: +; SSE: # %bb.0: +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: load_float4_float3: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 @@ -48,30 +32,14 @@ define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferen } define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE2-LABEL: load_float8_float3: -; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_float8_float3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_float8_float3: -; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: retq +; SSE-LABEL: load_float8_float3: +; SSE: # %bb.0: +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: load_float8_float3: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 @@ -199,10 +167,7 @@ define <4 x double> @load_double4_0u2u(double* nocapture readonly dereferenceabl ; ; AVX-LABEL: load_double4_0u2u: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2] ; AVX-NEXT: retq %2 = load double, double* %0, align 8 %3 = insertelement <4 x double> undef, double %2, i32 0 -- 2.50.0