From 62a43ff365b766a30c0bc6dcac079a7988a55488 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 1 Oct 2017 17:54:55 +0000
Subject: [PATCH] [X86][SSE] Improve shuffle combining of PACKSS instructions.

Support unary packing and fix the faux shuffle mask for vectors larger than
128 bits.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@314629 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 30 +++++++++++++++----
 .../X86/vector-shuffle-combining-avx2.ll      | 14 +++------
 .../X86/vector-shuffle-combining-ssse3.ll     |  6 ++--
 3 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 424430e2826..e68a6ed2a84 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5932,16 +5932,34 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     return true;
   }
   case X86ISD::PACKSS: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
+           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
+           "Unexpected input value type");
+
     // If we know input saturation won't happen we can treat this
     // as a truncation shuffle.
-    if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
-        DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
+    if (DAG.ComputeNumSignBits(N0) <= NumBitsPerElt ||
+        DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)
       return false;
 
-    Ops.push_back(N.getOperand(0));
-    Ops.push_back(N.getOperand(1));
-    for (unsigned i = 0; i != NumElts; ++i)
-      Mask.push_back(i * 2);
+    bool IsUnary = (N0 == N1);
+    unsigned Offset = IsUnary ? 0 : NumElts;
+    unsigned NumLanes = VT.getSizeInBits() / 128;
+    unsigned NumEltsPerLane = NumElts / NumLanes;
+    unsigned HalfEltsPerLane = NumEltsPerLane / 2;
+
+    Ops.push_back(N0);
+    if (!IsUnary)
+      Ops.push_back(N1);
+
+    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+      for (unsigned Elt = 0; Elt != HalfEltsPerLane; ++Elt)
+        Mask.push_back((Elt * 2) + (Lane * NumEltsPerLane));
+      for (unsigned Elt = 0; Elt != HalfEltsPerLane; ++Elt)
+        Mask.push_back((Elt * 2) + (Lane * NumEltsPerLane) + Offset);
+    }
     return true;
   }
   case X86ISD::VSHLI:
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 0e3c4e402a8..bd1b23c45bd 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -808,15 +808,13 @@ define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
 ; X32-LABEL: shuffle_combine_packssdw_pshufb:
 ; X32:       # BB#0:
 ; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
-; X32-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuffle_combine_packssdw_pshufb:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
-; X64-NEXT:    vpackssdw %ymm0, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17]
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
 ; X64-NEXT:    retq
   %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
   %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
@@ -829,17 +827,13 @@ define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
 ; X32-LABEL: shuffle_combine_packsswb_pshufb:
 ; X32:       # BB#0:
 ; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
-; X32-NEXT:    vpsraw $15, %ymm1, %ymm1
-; X32-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,23,22,21,20,19,18,17,16,23,22,21,20,19,18,17,16]
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuffle_combine_packsswb_pshufb:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
-; X64-NEXT:    vpsraw $15, %ymm1, %ymm1
-; X64-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,23,22,21,20,19,18,17,16,23,22,21,20,19,18,17,16]
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
 ; X64-NEXT:    retq
   %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 6a88bf010e7..874d090794c 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -644,15 +644,13 @@ define <16 x i8> @shuffle_combine_packssdw_pshufb(<4 x i32> %a0) {
 ; SSE-LABEL: shuffle_combine_packssdw_pshufb:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    packssdw %xmm0, %xmm0
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_combine_packssdw_pshufb:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
 ; AVX-NEXT:    retq
   %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
   %2 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %1)
-- 
2.50.1
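
Note (not part of the patch): the per-lane mask construction added to getFauxShuffleMask above can be exercised with a small standalone sketch. This is only an illustrative approximation: the helper name packssFauxMask and the hard-coded 256-bit / 16-element parameters are assumptions for the example, and the real code operates on SDValue operands inside the DAG combiner rather than on plain integers.

// Standalone sketch: reproduces the faux shuffle mask built for
// X86ISD::PACKSS when input saturation cannot occur. Mask indices
// 0..NumElts-1 refer to the first input (reinterpreted in the narrower
// result element type), NumElts..2*NumElts-1 to the second input; in the
// unary case Offset is 0 so both halves of each lane reuse the first input.
#include <cstdio>
#include <initializer_list>
#include <vector>

static std::vector<unsigned> packssFauxMask(unsigned NumElts,
                                            unsigned SizeInBits,
                                            bool IsUnary) {
  unsigned Offset = IsUnary ? 0 : NumElts;
  unsigned NumLanes = SizeInBits / 128;
  unsigned NumEltsPerLane = NumElts / NumLanes;
  unsigned HalfEltsPerLane = NumEltsPerLane / 2;

  std::vector<unsigned> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    // Each 128-bit lane packs independently: the low half of the lane takes
    // the even (low-half) narrow elements of input 0's lane, the high half
    // takes the even narrow elements of input 1's lane (or input 0 again
    // when Offset == 0), which models truncation when no saturation occurs.
    for (unsigned Elt = 0; Elt != HalfEltsPerLane; ++Elt)
      Mask.push_back((Elt * 2) + (Lane * NumEltsPerLane));
    for (unsigned Elt = 0; Elt != HalfEltsPerLane; ++Elt)
      Mask.push_back((Elt * 2) + (Lane * NumEltsPerLane) + Offset);
  }
  return Mask;
}

int main() {
  // 256-bit PACKSSDW producing 16 x i16: two 128-bit lanes of 8 elements.
  for (bool IsUnary : {true, false}) {
    std::printf("%s:", IsUnary ? "unary " : "binary");
    for (unsigned M : packssFauxMask(/*NumElts=*/16, /*SizeInBits=*/256, IsUnary))
      std::printf(" %2u", M);
    std::printf("\n");
  }
  return 0;
}

For the unary 256-bit case this prints the lane-interleaved indices 0 2 4 6 | 0 2 4 6 | 8 10 12 14 | 8 10 12 14, which is the mask that lets the combiner fold the pack and the following pshufb in the updated tests into a single vpshufb.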