From 685d8c452b691271ad0deaf80116a2de89532c0d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 27 Nov 2016 21:08:19 +0000
Subject: [PATCH] [X86][SSE] Add support for combining target shuffles to 128/256-bit PSLL/PSRL bit shifts

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288006 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp               | 71 ++++++-------
 .../X86/vector-shuffle-combining-avx2.ll         | 12 ++--
 .../X86/vector-shuffle-combining-ssse3.ll        | 12 ++--
 3 files changed, 34 insertions(+), 61 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 74687213856..25ad59f919b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25480,63 +25480,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
   unsigned NumMaskElts = Mask.size();
-  unsigned NumLanes = MaskVT.getSizeInBits() / 128;
-  unsigned NumEltsPerLane = NumMaskElts / NumLanes;
   bool FloatDomain = MaskVT.isFloatingPoint();
 
-  // Attempt to match against PSLLDQ/PSRLDQ byte shifts.
-  // TODO: Share common code with lowerVectorShuffleAsShift?
-  //
-  // PSLLDQ : (little-endian) left byte shift
-  // [ zz,  0,  1,  2,  3,  4,  5,  6]
-  // [ zz, zz, -1, -1,  2,  3,  4, -1]
-  // [ zz, zz, zz, zz, zz, zz, -1,  1]
-  // PSRLDQ : (little-endian) right byte shift
-  // [  5,  6,  7, zz, zz, zz, zz, zz]
-  // [ -1,  5,  6,  7, zz, zz, zz, zz]
-  // [  1,  2, -1, -1, -1, -1, zz, zz]
+  bool ContainsZeros = false;
+  SmallBitVector Zeroable(NumMaskElts, false);
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    Zeroable[i] = isUndefOrZero(M);
+    ContainsZeros |= (M == SM_SentinelZero);
+  }
+
+  // Attempt to match against byte/bit shifts.
+  // FIXME: Add 512-bit support.
   if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                        (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
-    for (unsigned Shift = 1; Shift != NumEltsPerLane; ++Shift) {
-      bool IsVSHLDQ = true;
-      bool IsVSRLDQ = true;
-
-      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-        unsigned Base = Lane * NumEltsPerLane;
-        unsigned Ofs = NumEltsPerLane - Shift;
-
-        IsVSHLDQ &= isUndefOrZeroInRange(Mask, Base, Shift);
-        IsVSHLDQ &= isSequentialOrUndefInRange(Mask, Base + Shift, Ofs, Base);
-
-        IsVSRLDQ &= isUndefOrZeroInRange(Mask, Base + Ofs, Shift);
-        IsVSRLDQ &= isSequentialOrUndefInRange(Mask, Base, Ofs, Base + Shift);
-
-        if (!IsVSHLDQ && !IsVSRLDQ)
-          break;
-      }
-
-      if (IsVSHLDQ) {
-        Shuffle = X86ISD::VSHLDQ;
-        ShuffleVT = MVT::getVectorVT(MVT::i8, NumLanes * 16);
-        PermuteImm = Shift * (MaskVT.getScalarSizeInBits() / 8);
-        return true;
-      }
-      if (IsVSRLDQ) {
-        Shuffle = X86ISD::VSRLDQ;
-        ShuffleVT = MVT::getVectorVT(MVT::i8, NumLanes * 16);
-        PermuteImm = Shift * (MaskVT.getScalarSizeInBits() / 8);
-        return true;
-      }
+    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+                                             MaskVT.getScalarSizeInBits(), Mask,
+                                             0, Zeroable, Subtarget);
+    if (0 < ShiftAmt) {
+      PermuteImm = (unsigned)ShiftAmt;
+      return true;
     }
   }
 
   // Ensure we don't contain any zero elements.
-  for (int M : Mask) {
-    if (M == SM_SentinelZero)
-      return false;
-    assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
-           "Expected unary shuffle");
-  }
+  if (ContainsZeros)
+    return false;
+
+  assert(llvm::all_of(Mask, [&](int M) {
+           return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+         }) && "Expected unary shuffle");
 
   unsigned InputSizeInBits = MaskVT.getSizeInBits();
   unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 4fd220702fc..89194c3e00c 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -514,12 +514,12 @@ define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_psrlw:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero
+; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_psrlw:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero
+; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   ret <32 x i8> %res0
@@ -528,12 +528,12 @@ define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_pslld:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28]
+; X32-NEXT:    vpslld $24, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_pslld:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28]
+; X64-NEXT:    vpslld $24, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   ret <32 x i8> %res0
@@ -542,12 +542,12 @@ define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_psrlq:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[5,6,7],zero,zero,zero,zero,zero,ymm0[13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23],zero,zero,zero,zero,zero,ymm0[29,30,31],zero,zero,zero,zero,zero
+; X32-NEXT:    vpsrlq $40, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_psrlq:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[5,6,7],zero,zero,zero,zero,zero,ymm0[13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23],zero,zero,zero,zero,zero,ymm0[29,30,31],zero,zero,zero,zero,zero
+; X64-NEXT:    vpsrlq $40, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   ret <32 x i8> %res0
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 97bae0550c8..7676e8309f2 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -299,12 +299,12 @@ define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_psrlw:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero
+; SSE-NEXT:    psrlw $8, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_psrlw:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
@@ -313,12 +313,12 @@ define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_pslld:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12]
+; SSE-NEXT:    pslld $24, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_pslld:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12]
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
@@ -327,12 +327,12 @@ define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_psrlq:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,6,7],zero,zero,zero,zero,zero,xmm0[13,14,15],zero,zero,zero,zero,zero
+; SSE-NEXT:    psrlq $40, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_psrlq:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,6,7],zero,zero,zero,zero,zero,xmm0[13,14,15],zero,zero,zero,zero,zero
+; AVX-NEXT:    vpsrlq $40, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
-- 
2.50.1
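
Illustrative sketch (not part of the committed patch): the new path hands the unary shuffle mask, together with its per-element zeroable bits, to matchVectorShuffleAsShift, which recognises masks where each element keeps a contiguous run of its own bytes moved towards one end and zeroes the vacated bytes. For an element width of W bytes and a byte shift of S, byte i of the result must be zero when its offset inside the element lies in the vacated range, and must otherwise come from byte i-S (left shift) or i+S (right shift) of the same element; the S*8-bit amount then becomes the PSLL*/PSRL* immediate seen in the updated tests. The standalone C++ below re-implements that matching rule for pshufb-style byte masks; the helper name, the -1-means-zero mask convention, and the explicit element-width parameter are assumptions of this sketch and do not mirror LLVM's actual interfaces.

// Standalone illustration of matching a byte-shuffle mask as a per-element
// logical shift. Mask entries are source byte indices, or -1 for a byte that
// must be zero. Returns the shift amount in bits (0 if no match); IsLeft
// reports whether the match is a left (PSLL*) or right (PSRL*) shift.
#include <cstdio>
#include <vector>

static int matchByteMaskAsElementShift(const std::vector<int> &Mask,
                                       int EltSizeInBytes, bool &IsLeft) {
  const int NumBytes = static_cast<int>(Mask.size());
  for (int Shift = 1; Shift < EltSizeInBytes; ++Shift) {
    bool Left = true, Right = true;
    for (int i = 0; i < NumBytes; ++i) {
      const int Lane = i % EltSizeInBytes; // byte offset inside its element
      // Left shift (little endian): the low Shift bytes of every element are
      // zeroed, the remaining bytes come from Shift bytes lower.
      if (Lane < Shift ? (Mask[i] != -1) : (Mask[i] != i - Shift))
        Left = false;
      // Right shift: the high Shift bytes are zeroed, the remaining bytes
      // come from Shift bytes higher in the same element.
      if (Lane >= EltSizeInBytes - Shift ? (Mask[i] != -1)
                                         : (Mask[i] != i + Shift))
        Right = false;
    }
    if (Left || Right) {
      IsLeft = Left;
      return Shift * 8; // the PSLL*/PSRL* immediate is expressed in bits
    }
  }
  return 0; // not expressible as a per-element shift
}

int main() {
  // The pshufb mask from combine_pshufb_as_psrlw, written as byte indices:
  // keep the high byte of every 16-bit element in the low position and zero
  // the rest, which is exactly PSRLW $8.
  std::vector<int> Mask = {1, -1, 3,  -1, 5,  -1, 7,  -1,
                           9, -1, 11, -1, 13, -1, 15, -1};
  bool IsLeft = false;
  int Bits = matchByteMaskAsElementShift(Mask, /*EltSizeInBytes=*/2, IsLeft);
  std::printf("%s shift by %d bits\n", IsLeft ? "left" : "right", Bits);
  return 0;
}

Compiled as C++11 or later, the example prints "right shift by 8 bits", matching the vpsrlw $8 the updated tests expect; feeding it the pslld and psrlq masks with element widths of 4 and 8 bytes yields 24 and 40 bits, the other two immediates above.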