From 685d8c452b691271ad0deaf80116a2de89532c0d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 27 Nov 2016 21:08:19 +0000
Subject: [PATCH] [X86][SSE] Add support for combining target shuffles to 128/256-bit PSLL/PSRL bit shifts

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288006 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp               | 71 ++++++-------
 .../X86/vector-shuffle-combining-avx2.ll         | 12 ++--
 .../X86/vector-shuffle-combining-ssse3.ll        | 12 ++--
 3 files changed, 34 insertions(+), 61 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 74687213856..25ad59f919b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -25480,63 +25480,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
   unsigned NumMaskElts = Mask.size();
-  unsigned NumLanes = MaskVT.getSizeInBits() / 128;
-  unsigned NumEltsPerLane = NumMaskElts / NumLanes;
   bool FloatDomain = MaskVT.isFloatingPoint();
 
-  // Attempt to match against PSLLDQ/PSRLDQ byte shifts.
-  // TODO: Share common code with lowerVectorShuffleAsShift?
-  //
-  // PSLLDQ : (little-endian) left byte shift
-  // [ zz,  0,  1,  2,  3,  4,  5,  6]
-  // [ zz, zz, -1, -1,  2,  3,  4, -1]
-  // [ zz, zz, zz, zz, zz, zz, -1,  1]
-  // PSRLDQ : (little-endian) right byte shift
-  // [  5,  6,  7, zz, zz, zz, zz, zz]
-  // [ -1,  5,  6,  7, zz, zz, zz, zz]
-  // [  1,  2, -1, -1, -1, -1, zz, zz]
+  bool ContainsZeros = false;
+  SmallBitVector Zeroable(NumMaskElts, false);
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    Zeroable[i] = isUndefOrZero(M);
+    ContainsZeros |= (M == SM_SentinelZero);
+  }
+
+  // Attempt to match against byte/bit shifts.
+  // FIXME: Add 512-bit support.
   if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                        (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
-    for (unsigned Shift = 1; Shift != NumEltsPerLane; ++Shift) {
-      bool IsVSHLDQ = true;
-      bool IsVSRLDQ = true;
-
-      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
-        unsigned Base = Lane * NumEltsPerLane;
-        unsigned Ofs = NumEltsPerLane - Shift;
-
-        IsVSHLDQ &= isUndefOrZeroInRange(Mask, Base, Shift);
-        IsVSHLDQ &= isSequentialOrUndefInRange(Mask, Base + Shift, Ofs, Base);
-
-        IsVSRLDQ &= isUndefOrZeroInRange(Mask, Base + Ofs, Shift);
-        IsVSRLDQ &= isSequentialOrUndefInRange(Mask, Base, Ofs, Base + Shift);
-
-        if (!IsVSHLDQ && !IsVSRLDQ)
-          break;
-      }
-
-      if (IsVSHLDQ) {
-        Shuffle = X86ISD::VSHLDQ;
-        ShuffleVT = MVT::getVectorVT(MVT::i8, NumLanes * 16);
-        PermuteImm = Shift * (MaskVT.getScalarSizeInBits() / 8);
-        return true;
-      }
-      if (IsVSRLDQ) {
-        Shuffle = X86ISD::VSRLDQ;
-        ShuffleVT = MVT::getVectorVT(MVT::i8, NumLanes * 16);
-        PermuteImm = Shift * (MaskVT.getScalarSizeInBits() / 8);
-        return true;
-      }
+    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+                                             MaskVT.getScalarSizeInBits(), Mask,
+                                             0, Zeroable, Subtarget);
+    if (0 < ShiftAmt) {
+      PermuteImm = (unsigned)ShiftAmt;
+      return true;
     }
   }
 
   // Ensure we don't contain any zero elements.
-  for (int M : Mask) {
-    if (M == SM_SentinelZero)
-      return false;
-    assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
-           "Expected unary shuffle");
-  }
+  if (ContainsZeros)
+    return false;
+
+  assert(llvm::all_of(Mask, [&](int M) {
+           return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+         }) && "Expected unary shuffle");
 
   unsigned InputSizeInBits = MaskVT.getSizeInBits();
   unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 4fd220702fc..89194c3e00c 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -514,12 +514,12 @@ define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_psrlw:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero
+; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_psrlw:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero
+; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   ret <32 x i8> %res0
@@ -528,12 +528,12 @@ define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_pslld:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28]
+; X32-NEXT:    vpslld $24, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_pslld:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28]
+; X64-NEXT:    vpslld $24, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   ret <32 x i8> %res0
@@ -542,12 +542,12 @@ define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_psrlq:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[5,6,7],zero,zero,zero,zero,zero,ymm0[13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23],zero,zero,zero,zero,zero,ymm0[29,30,31],zero,zero,zero,zero,zero
+; X32-NEXT:    vpsrlq $40, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_psrlq:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[5,6,7],zero,zero,zero,zero,zero,ymm0[13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23],zero,zero,zero,zero,zero,ymm0[29,30,31],zero,zero,zero,zero,zero
+; X64-NEXT:    vpsrlq $40, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   ret <32 x i8> %res0
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 97bae0550c8..7676e8309f2 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -299,12 +299,12 @@ define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_psrlw:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero
+; SSE-NEXT:    psrlw $8, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_psrlw:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
@@ -313,12 +313,12 @@ define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_pslld:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12]
+; SSE-NEXT:    pslld $24, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_pslld:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12]
+; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
@@ -327,12 +327,12 @@ define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
 define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
 ; SSE-LABEL: combine_pshufb_as_psrlq:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,6,7],zero,zero,zero,zero,zero,xmm0[13,14,15],zero,zero,zero,zero,zero
+; SSE-NEXT:    psrlq $40, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_psrlq:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,6,7],zero,zero,zero,zero,zero,xmm0[13,14,15],zero,zero,zero,zero,zero
+; AVX-NEXT:    vpsrlq $40, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> )
   ret <16 x i8> %res0
-- 
2.50.1
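
Illustrative sketch (not part of the committed patch): the new path hands the unary shuffle mask, together with its per-element zeroable bits, to matchVectorShuffleAsShift, which recognises masks where each element keeps a contiguous run of its own bytes moved towards one end and zeroes the vacated bytes. For an element width of W bytes and a byte shift of S, byte i of the result must be zero when its offset inside the element lies in the vacated range, and must otherwise come from byte i-S (left shift) or i+S (right shift) of the same element; the S*8-bit amount then becomes the PSLL*/PSRL* immediate seen in the updated tests. The standalone C++ below re-implements that matching rule for pshufb-style byte masks; the helper name, the -1-means-zero mask convention, and the explicit element-width parameter are assumptions of this sketch and do not mirror LLVM's actual interfaces.

// Standalone illustration of matching a byte-shuffle mask as a per-element
// logical shift. Mask entries are source byte indices, or -1 for a byte that
// must be zero. Returns the shift amount in bits (0 if no match); IsLeft
// reports whether the match is a left (PSLL*) or right (PSRL*) shift.
#include <cstdio>
#include <vector>

static int matchByteMaskAsElementShift(const std::vector<int> &Mask,
                                       int EltSizeInBytes, bool &IsLeft) {
  const int NumBytes = static_cast<int>(Mask.size());
  for (int Shift = 1; Shift < EltSizeInBytes; ++Shift) {
    bool Left = true, Right = true;
    for (int i = 0; i < NumBytes; ++i) {
      const int Lane = i % EltSizeInBytes; // byte offset inside its element
      // Left shift (little endian): the low Shift bytes of every element are
      // zeroed, the remaining bytes come from Shift bytes lower.
      if (Lane < Shift ? (Mask[i] != -1) : (Mask[i] != i - Shift))
        Left = false;
      // Right shift: the high Shift bytes are zeroed, the remaining bytes
      // come from Shift bytes higher in the same element.
      if (Lane >= EltSizeInBytes - Shift ? (Mask[i] != -1)
                                         : (Mask[i] != i + Shift))
        Right = false;
    }
    if (Left || Right) {
      IsLeft = Left;
      return Shift * 8; // the PSLL*/PSRL* immediate is expressed in bits
    }
  }
  return 0; // not expressible as a per-element shift
}

int main() {
  // The pshufb mask from combine_pshufb_as_psrlw, written as byte indices:
  // keep the high byte of every 16-bit element in the low position and zero
  // the rest, which is exactly PSRLW $8.
  std::vector<int> Mask = {1, -1, 3,  -1, 5,  -1, 7,  -1,
                           9, -1, 11, -1, 13, -1, 15, -1};
  bool IsLeft = false;
  int Bits = matchByteMaskAsElementShift(Mask, /*EltSizeInBytes=*/2, IsLeft);
  std::printf("%s shift by %d bits\n", IsLeft ? "left" : "right", Bits);
  return 0;
}

Compiled as C++11 or later, the example prints "right shift by 8 bits", matching the vpsrlw $8 the updated tests expect; feeding it the pslld and psrlq masks with element widths of 4 and 8 bytes yields 24 and 40 bits, the other two immediates above.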