[X86][SSE] Add support for combining target shuffles to AND bitmasks.

author Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 1 Dec 2016 13:47:02 +0000 (13:47 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 1 Dec 2016 13:47:02 +0000 (13:47 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 1 Dec 2016 13:47:02 +0000 (13:47 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 1 Dec 2016 13:47:02 +0000 (13:47 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 63f191fa8ddb235c3b19d016418a406533119bfc..4b48537f701bcfaf0db977936fd3eba4a235ec2e 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26257,6 +26257,37 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      return false;
    }
  
+  // See if we can combine a single input shuffle with zeros to a bit-mask,
+  // which is much simpler than any shuffle.
+  if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
+      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
+    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
+    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+    SmallBitVector UndefElts(NumMaskElts, false);
+    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
+    for (unsigned i = 0; i != NumMaskElts; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef) {
+        UndefElts[i] = true;
+        continue;
+      }
+      if (M == SM_SentinelZero)
+        continue;
+      EltBits[i] = AllOnes;
+    }
+    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
+    DCI.AddToWorklist(BitMask.getNode());
+    Res = DAG.getBitcast(MaskVT, V1);
+    DCI.AddToWorklist(Res.getNode());
+    unsigned AndOpcode = FloatDomain ? X86ISD::FAND : ISD::AND;
+    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
    // If we have a single input shuffle with different shuffle patterns in
    // the 128-bit lanes, use the variable mask to VPERMILPS.
    // TODO Combine other mask types at higher depths.
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

index 4fddbac750233cb9036201f674ee0ec808cbab71..68fceef285c0147fedfb3e9f479d7d7006680d21 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -72,12 +72,12 @@ define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
  define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
  ; X32-LABEL: combine_and_pshufb:
  ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[24,25],zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: combine_and_pshufb:
  ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[24,25],zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
  ; X64-NEXT:    retq
    %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -669,12 +669,12 @@ define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
  define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
  ; X32-LABEL: combine_psrlw_pshufb:
  ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
+; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: combine_psrlw_pshufb:
  ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
+; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
  ; X64-NEXT:    retq
    %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    %2 = bitcast <16 x i16> %1 to <32 x i8>
@@ -685,12 +685,12 @@ define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
  define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
  ; X32-LABEL: combine_pslld_pshufb:
  ; X32:       # BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
+; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: combine_pslld_pshufb:
  ; X64:       # BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
+; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
  ; X64-NEXT:    retq
    %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
    %2 = bitcast <8 x i32> %1 to <32 x i8>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll

index b7aa81964c37bdc127bbae0a5bd6859c07fc2598..f38373b26ecae1058c19117e469550762e245b2f 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -277,7 +277,7 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
  define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
  ; SSSE3-LABEL: combine_and_pshufb:
  ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
  ; SSSE3-NEXT:    retq
  ;
  ; SSE41-LABEL: combine_and_pshufb:
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll

index 8d8d7aa4448c122aaf6f628d871e143e5adc1731..1febf559bdea84cc611f6f8f48653e869f5a5c4b 100644 (file)
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -1674,7 +1674,7 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
  ; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
  ; SSSE3:       # BB#0: # %entry
  ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
+; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
  ; SSSE3-NEXT:    pxor %xmm2, %xmm2
  ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
  ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 1 Dec 2016 13:47:02 +0000 (13:47 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 1 Dec 2016 13:47:02 +0000 (13:47 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-avx2.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-ssse3.ll		patch \| blob \| history
test/CodeGen/X86/vector-zext.ll		patch \| blob \| history