From: Simon Pilgrim
Date: Mon, 5 Dec 2016 11:25:13 +0000 (+0000)
Subject: [X86][SSE] Add support for combining target shuffles to UNPCKL/UNPCKH.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=26d70f7d778419378f7beb536d87a77348718912;p=llvm

[X86][SSE] Add support for combining target shuffles to UNPCKL/UNPCKH.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288663 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5ca3831ccb0..bd8c6a2302d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25846,8 +25846,10 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                      SDValue &V1, SDValue &V2,
                                      const X86Subtarget &Subtarget,
-                                     unsigned &Shuffle, MVT &ShuffleVT) {
+                                     unsigned &Shuffle, MVT &ShuffleVT,
+                                     bool IsUnary) {
   bool FloatDomain = MaskVT.isFloatingPoint();
+  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   if (MaskVT.is128BitVector()) {
     if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
@@ -25875,33 +25877,65 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKL;
-      ShuffleVT = MVT::v4f32;
-      return true;
-    }
-    if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKH;
-      ShuffleVT = MVT::v4f32;
-      return true;
-    }
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
-        isTargetShuffleEquivalent(
-            Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKL;
-      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
-      return true;
-    }
-    if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
-        isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
-                                         13, 14, 14, 15, 15})) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKH;
-      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
-      return true;
+  }
+
+  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
+  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
+      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+    MVT LegalVT = MaskVT;
+    if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
+      LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+
+    SmallVector Unpckl, Unpckh;
+    if (IsUnary) {
+      createUnpackShuffleMask(MaskVT, Unpckl, true, true);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        V2 = V1;
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      createUnpackShuffleMask(MaskVT, Unpckh, false, true);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        V2 = V1;
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+    } else {
+      createUnpackShuffleMask(MaskVT, Unpckl, true, false);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      createUnpackShuffleMask(MaskVT, Unpckh, false, false);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      ShuffleVectorSDNode::commuteMask(Unpckl);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        std::swap(V1, V2);
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      ShuffleVectorSDNode::commuteMask(Unpckh);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        std::swap(V1, V2);
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
     }
   }
 
@@ -26167,7 +26201,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   }
 
   if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
-                               ShuffleVT)) {
+                               ShuffleVT, UnaryShuffle)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
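As a rough standalone sketch of what the new matching logic compares against (this is not LLVM's createUnpackShuffleMask or ShuffleVectorSDNode::commuteMask implementation, and the buildUnpackMask/commuteTwoInputMask names below are purely illustrative): an unpack mask interleaves the low or high half of each 128-bit lane, taking alternating elements either from a single operand (unary) or from both operands (binary), and commuting a two-input mask flips which operand each index refers to so a binary unpack can also be matched with V1/V2 swapped.

#include <cassert>
#include <vector>

// Build an UNPCKL/UNPCKH-style mask for NumElts elements split into 128-bit
// lanes of LaneElts elements each. Indices >= NumElts refer to the second
// operand, matching LLVM's two-input shuffle mask convention.
static std::vector<int> buildUnpackMask(int NumElts, int LaneElts, bool Lo,
                                        bool Unary) {
  std::vector<int> Mask;
  for (int Lane = 0; Lane != NumElts; Lane += LaneElts) {
    int Half = Lane + (Lo ? 0 : LaneElts / 2);
    for (int I = 0; I != LaneElts / 2; ++I) {
      Mask.push_back(Half + I);                         // element from operand 0
      Mask.push_back(Half + I + (Unary ? 0 : NumElts)); // from operand 0 (unary) or 1
    }
  }
  return Mask;
}

// Swap which operand each element refers to, mirroring the
// ShuffleVectorSDNode::commuteMask + std::swap(V1, V2) step in the patch.
static void commuteTwoInputMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = (M < NumElts) ? M + NumElts : M - NumElts;
}

int main() {
  // v4i32 in one 128-bit lane: binary UNPCKL = {0,4,1,5}, unary = {0,0,1,1}.
  assert((buildUnpackMask(4, 4, /*Lo=*/true, /*Unary=*/false) ==
          std::vector<int>{0, 4, 1, 5}));
  assert((buildUnpackMask(4, 4, /*Lo=*/true, /*Unary=*/true) ==
          std::vector<int>{0, 0, 1, 1}));
  // v8i32 across two 128-bit lanes: binary UNPCKH = {2,10,3,11,6,14,7,15}.
  assert((buildUnpackMask(8, 4, /*Lo=*/false, /*Unary=*/false) ==
          std::vector<int>{2, 10, 3, 11, 6, 14, 7, 15}));

  // Commuting {0,4,1,5} gives {4,0,5,1}: the same unpack with operands swapped.
  std::vector<int> M = buildUnpackMask(4, 4, /*Lo=*/true, /*Unary=*/false);
  commuteTwoInputMask(M);
  assert((M == std::vector<int>{4, 0, 5, 1}));
  return 0;
}

For sixteen 32-bit elements in a 512-bit vector, the binary low-unpack mask this pattern produces is {0,16,1,17,4,20,5,21,8,24,9,25,12,28,13,29}, which is exactly the vpermt2d/vpermt2ps index vector that the AVX-512 tests below no longer need.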
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index e4cf821e003..f400781c420 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -56,12 +56,11 @@ define <4 x i32> @combine_vec_srem_by_pos1(<4 x i32> %x) {
 ; SSE-NEXT:    andl $7, %eax
 ; SSE-NEXT:    movd %eax, %xmm2
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE-NEXT:    pextrd $1, %xmm0, %eax
 ; SSE-NEXT:    andl $3, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_srem_by_pos1:
@@ -74,12 +73,11 @@ define <4 x i32> @combine_vec_srem_by_pos1(<4 x i32> %x) {
 ; AVX-NEXT:    andl $7, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm2
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX-NEXT:    andl $3, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %x,
   %2 = srem <4 x i32> %1,
diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll
index 0dc10164a5c..0c39bb280e8 100644
--- a/test/CodeGen/X86/combine-urem.ll
+++ b/test/CodeGen/X86/combine-urem.ll
@@ -54,12 +54,11 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
 ; SSE-NEXT:    andl $7, %eax
 ; SSE-NEXT:    movd %eax, %xmm2
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE-NEXT:    pextrd $1, %xmm0, %eax
 ; SSE-NEXT:    andl $3, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_urem_by_pow2b:
@@ -71,12 +70,11 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
 ; AVX-NEXT:    andl $7, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm2
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX-NEXT:    andl $3, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %1 = urem <4 x i32> %x,
   ret <4 x i32> %1
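A rough worked illustration of why the pshufd+pblend pair in these srem/urem tests now folds to a single punpcklqdq (this is our own reading of the test diff, not the combiner code itself, and the A/B operand labels are just names for the pshufd'd xmm0 and the punpckldq result): resolving the pshufd through the blend yields the two-input dword mask {0,1,4,5}, which widens to the binary v2i64 unpack-low pattern {0,2} that matchBinaryVectorShuffle now recognizes.

#include <cassert>
#include <vector>

int main() {
  // Dword-level view of the old SSE sequence, with A = the pshufd'd xmm0 and
  // B = xmm2: tmp = pshufd(B, [0,1,0,1]); res = blend taking dwords 0-1 from A
  // and dwords 2-3 from tmp.
  std::vector<int> PshufdMask = {0, 1, 0, 1}; // tmp[i] = B[PshufdMask[i]]
  std::vector<int> BlendMask = {0, 1, 6, 7};  // indices 0-3 = A, 4-7 = tmp

  // Fold the pshufd into the blend so every index refers to A (0-3) or B (4-7).
  std::vector<int> Combined;
  for (int M : BlendMask)
    Combined.push_back(M < 4 ? M : PshufdMask[M - 4] + 4);
  assert((Combined == std::vector<int>{0, 1, 4, 5}));

  // Each adjacent dword pair comes from consecutive, even-aligned elements of
  // one source, so the mask widens to qwords: dwords 0-3 are A.q0/q1 (qword
  // indices 0-1), dwords 4-7 are B.q0/q1 (qword indices 2-3).
  std::vector<int> Widened;
  for (int I = 0; I < (int)Combined.size(); I += 2) {
    int D = Combined[I];
    assert(Combined[I + 1] == D + 1 && D % 2 == 0);
    Widened.push_back(D < 4 ? D / 2 : (D - 4) / 2 + 2);
  }
  assert((Widened == std::vector<int>{0, 2})); // binary v2i64 unpack-low -> punpcklqdq
  return 0;
}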
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 749c49f7859..dc9df248a76 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -907,14 +907,12 @@ define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32>
 define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
 ; X32-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [18,2,19,3,22,6,23,7,26,10,27,11,30,14,31,15]
-; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [18,2,19,3,22,6,23,7,26,10,27,11,30,14,31,15]
-; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X64-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> , <16 x float> %a1, i16 -1)
   ret <16 x float> %res0
@@ -923,14 +921,12 @@ define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
 ; X32-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,1,17,4,20,5,21,8,24,9,25,12,28,13,29]
-; X32-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X32-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,1,17,4,20,5,21,8,24,9,25,12,28,13,29]
-; X64-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; X64-NEXT:    retq
   %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> , <16 x i32> %a1, i16 -1)
   ret <16 x i32> %res0
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 33629a3288d..c5b4c50b2ca 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -78,14 +78,12 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0,
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
 ; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [20,4,21,5,22,6,23,7,28,12,29,13,30,14,31,15]
-; X32-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [20,4,21,5,22,6,23,7,28,12,29,13,30,14,31,15]
-; X64-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
 ; X64-NEXT:    retq
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %a0, <16 x i16> , <16 x i16> %a1, i16 -1)
   ret <16 x i16> %res0
@@ -94,14 +92,12 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
 ; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; X32-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; X64-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; X64-NEXT:    retq
   %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> , <16 x i16> %a0, <16 x i16> %a1, i16 -1)
   ret <16 x i16> %res0
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 3d864358fbf..707cafe9931 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -230,12 +230,12 @@ define <16 x i8> @combine_vpperm_as_unary_unpckhbw(<16 x i8> %a0, <16 x i8> %a1)
 define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
 ; X32-LABEL: combine_vpperm_as_unpckhbw:
 ; X32:       # BB#0:
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpperm_as_unpckhbw:
 ; X64:       # BB#0:
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X64-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; X64-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> )
   ret <16 x i8> %res0
@@ -244,12 +244,12 @@ define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
 define <16 x i8> @combine_vpperm_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
 ; X32-LABEL: combine_vpperm_as_unpcklbw:
 ; X32:       # BB#0:
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpperm_as_unpcklbw:
 ; X64:       # BB#0:
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> )
   ret <16 x i8> %res0