From a88ec277ba483d6548fa89f874ca815f36d62b5e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 1 Dec 2016 13:47:02 +0000
Subject: [PATCH] [X86][SSE] Add support for combining target shuffles to AND bitmasks.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288335 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp                 | 31 +++++++++++++++++++
 .../X86/vector-shuffle-combining-avx2.ll           | 12 +++----
 .../X86/vector-shuffle-combining-ssse3.ll          |  2 +-
 test/CodeGen/X86/vector-zext.ll                    |  2 +-
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 63f191fa8dd..4b48537f701 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26257,6 +26257,37 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return false;
   }
 
+  // See if we can combine a single input shuffle with zeros to a bit-mask,
+  // which is much simpler than any shuffle.
+  if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
+      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
+    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
+    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+    SmallBitVector UndefElts(NumMaskElts, false);
+    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
+    for (unsigned i = 0; i != NumMaskElts; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef) {
+        UndefElts[i] = true;
+        continue;
+      }
+      if (M == SM_SentinelZero)
+        continue;
+      EltBits[i] = AllOnes;
+    }
+    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
+    DCI.AddToWorklist(BitMask.getNode());
+    Res = DAG.getBitcast(MaskVT, V1);
+    DCI.AddToWorklist(Res.getNode());
+    unsigned AndOpcode = FloatDomain ? X86ISD::FAND : ISD::AND;
+    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
   // If we have a single input shuffle with different shuffle patterns in the
   // the 128-bit lanes use the variable mask to VPERMILPS.
   // TODO Combine other mask types at higher depths.
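For readers who want the transform in isolation, below is a minimal standalone C++ sketch of what the new block in combineX86ShuffleChain computes. It is not part of the patch: plain uint8_t lanes stand in for LLVM's APInt elements, the two sentinel constants mirror LLVM's SM_SentinelUndef/SM_SentinelZero values, and the example mask and 8-bit lane width are illustrative assumptions rather than anything taken from the commit.

// Standalone sketch: build the per-lane AND constant from a single-input
// shuffle mask that mixes identity, zero and undef lanes.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Sentinels following LLVM's shuffle-mask convention.
constexpr int SM_SentinelUndef = -1; // lane result is don't-care
constexpr int SM_SentinelZero = -2;  // lane result must be zero

int main() {
  // An identity-or-zero mask over 8-bit lanes: the shape the combine
  // requires (sequential, undef or zero, starting at element 0).
  std::vector<int> Mask = {0, SM_SentinelZero, 2, SM_SentinelUndef,
                           4, SM_SentinelZero, 6, SM_SentinelZero};

  std::vector<uint8_t> EltBits(Mask.size(), 0x00); // all-zeros clears a lane
  std::vector<bool> UndefElts(Mask.size(), false);

  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SM_SentinelUndef) {
      UndefElts[i] = true; // emitted as an undef constant lane
      continue;
    }
    if (Mask[i] == SM_SentinelZero)
      continue;        // stays 0x00: the AND zeroes this lane
    EltBits[i] = 0xFF; // identity lane: the AND passes the input through
  }

  // The shuffle is then replaced by 'input & constant', i.e. a single
  // pand/vandps against this vector instead of a vpshufb.
  for (std::size_t i = 0; i != EltBits.size(); ++i)
    std::cout << (UndefElts[i] ? "undef" : EltBits[i] ? "0xFF" : "0x00")
              << (i + 1 != EltBits.size() ? ", " : "\n");
  return 0;
}

The printed vector is the kind of constant the patch materialises via getConstVector and feeds to the AND/FAND node, which is why the regression tests below now match a single pand/vandps against a constant-pool load instead of a vpshufb.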
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 4fddbac7502..68fceef285c 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -72,12 +72,12 @@ define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
 define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
 ; X32-LABEL: combine_and_pshufb:
 ; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[24,25],zero,zero,zero,zero,zero,zero
+; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_and_pshufb:
 ; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[24,25],zero,zero,zero,zero,zero,zero
+; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT: retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )
@@ -669,12 +669,12 @@ define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
 define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
 ; X32-LABEL: combine_psrlw_pshufb:
 ; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
+; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_psrlw_pshufb:
 ; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
+; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT: retq
   %1 = lshr <16 x i16> %a0,
   %2 = bitcast <16 x i16> %1 to <32 x i8>
@@ -685,12 +685,12 @@ define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
 define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
 ; X32-LABEL: combine_pslld_pshufb:
 ; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
+; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_pslld_pshufb:
 ; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
+; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT: retq
   %1 = shl <8 x i32> %a0,
   %2 = bitcast <8 x i32> %1 to <32 x i8>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index b7aa81964c3..f38373b26ec 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -277,7 +277,7 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
 define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
 ; SSSE3-LABEL: combine_and_pshufb:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_and_pshufb:
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 8d8d7aa4448..1febf559bde 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -1674,7 +1674,7 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
 ; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
 ; SSSE3: # BB#0: # %entry
 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
 ; SSSE3-NEXT: pxor %xmm2, %xmm2
 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-- 
2.50.1