From 42864a7b262cafaa1f29b96676251888d2b7581a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 2 Jul 2017 14:16:25 +0000
Subject: [PATCH] [X86][SSE] Attempt to combine 64-bit and 32-bit shuffles to
 unary shuffles before bit shifts

We are currently combining shuffles to bit shifts before trying unary
permutes, which means we can't fold loads and the bit shift
destructively overwrites its source register.
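
As a rough illustration (a hand-written sketch, not output copied from
the test changes below; the (%rdi) load is hypothetical), extracting the
upper half of a <2 x i64> previously went through a destructive byte
shift:

  movdqa (%rdi), %xmm0        # SSE2 psrldq only shifts a register, so the
  psrldq $8, %xmm0            # load can't be folded, and xmm0 is clobbered

whereas the unary permute can fold the load and pick any destination:

  pshufd $0xee, (%rdi), %xmm0 # xmm0 = mem[2,3,2,3]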

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306978 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 53 ++++++++++++------------------
 test/CodeGen/X86/cast-vsel.ll      |  2 +-
 test/CodeGen/X86/extract-store.ll  |  2 +-
 3 files changed, 23 insertions(+), 34 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c72f195ab65..45407114541 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27156,6 +27156,26 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
+  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
+  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
+      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
+    SmallVector<int, 4> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+      // Narrow the repeated mask to create 32-bit element permutes.
+      SmallVector<int, 4> WordMask = RepeatedMask;
+      if (MaskScalarSizeInBits == 64)
+        scaleShuffleMask(2, RepeatedMask, WordMask);
+
+      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
+      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
+      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+      PermuteImm = getV4X86ShuffleImm(WordMask);
+      return true;
+    }
+  }
+
   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
     SmallVector<int, 4> RepeatedMask;
@@ -27201,38 +27221,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
-  // Ensure we don't contain any zero elements.
-  if (ContainsZeros)
-    return false;
-
-  assert(llvm::all_of(Mask, [&](int M) {
-           return SM_SentinelUndef <= M && M < (int)NumMaskElts;
-         }) && "Expected unary shuffle");
-
-  // We only support permutation of 32/64 bit elements after this.
-  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
-    return false;
-
-  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
-  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
-  if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
-    return false;
-
-  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
-  SmallVector<int, 4> RepeatedMask;
-  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
-    return false;
-
-  // Narrow the repeated mask for 32-bit element permutes.
-  SmallVector<int, 4> WordMask = RepeatedMask;
-  if (MaskScalarSizeInBits == 64)
-    scaleShuffleMask(2, RepeatedMask, WordMask);
-
-  Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
-  ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
-  ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
-  PermuteImm = getV4X86ShuffleImm(WordMask);
-  return true;
+  return false;
 }
 
 // Attempt to match a combined unary shuffle mask against supported binary
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index 83ab2fac2f1..260535985e2 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -148,7 +148,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
 ; SSE2-NEXT:    andnps %xmm5, %xmm0
 ; SSE2-NEXT:    orps %xmm4, %xmm0
 ; SSE2-NEXT:    cvtps2pd %xmm0, %xmm2
-; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    cvtps2pd %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
index 48cb8d70b97..4ea6b7801fb 100644
--- a/test/CodeGen/X86/extract-store.ll
+++ b/test/CodeGen/X86/extract-store.ll
@@ -345,7 +345,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
 ; SSE-X32-LABEL: extract_i64_1:
 ; SSE-X32:       # BB#0:
 ; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-X32-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE-X32-NEXT:    movq %xmm0, (%eax)
 ; SSE-X32-NEXT:    retl
 ;
-- 
2.40.0