[X86][SSE] Utilize ZeroableElements for canWidenShuffleElements

author Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 12 Jul 2018 13:29:41 +0000 (13:29 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Thu, 12 Jul 2018 13:29:41 +0000 (13:29 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 12 Jul 2018 13:29:41 +0000 (13:29 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Thu, 12 Jul 2018 13:29:41 +0000 (13:29 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 2ffd917337c70ce334366071be64f73171bb4b7f..cf22461a92f8128ae081855c4098d49e1339c5a0 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15048,20 +15048,49 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
    if (Zeroable.isAllOnesValue())
      return getZeroVector(VT, Subtarget, DAG, DL);
  
+  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // Create an alternative mask with info about zeroable elements.
+  // Here we do not set undef elements as zeroable.
+  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+  if (V2IsZero) {
+    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+    for (int i = 0; i != NumElements; ++i)
+      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+        ZeroableMask[i] = SM_SentinelZero;
+  }
+
    // Try to collapse shuffles into using a vector type with fewer elements but
    // wider element types. We cap this to not form integers or floating point
    // elements wider than 64 bits, but it might be interesting to form i128
    // integers to handle flipping the low and high halves of AVX 256-bit vectors.
    SmallVector<int, 16> WidenedMask;
    if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
-      canWidenShuffleElements(Mask, WidenedMask)) {
+      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
      MVT NewEltVT = VT.isFloatingPoint()
                         ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                         : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
-    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    int NewNumElts = NumElements / 2;
+    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
      // Make sure that the new vector type is legal. For example, v2f64 isn't
      // legal on SSE1.
      if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      if (V2IsZero) {
+        // Modify the new Mask to take all zeros from the all-zero vector.
+        // Choose indices that are blend-friendly.
+        bool UsedZeroVector = false;
+        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
+               "V2's non-undef elements are used?!");
+        for (int i = 0; i != NewNumElts; ++i)
+          if (WidenedMask[i] == SM_SentinelZero) {
+            WidenedMask[i] = i + NewNumElts;
+            UsedZeroVector = true;
+          }
+        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
+        // some elements to be undef.
+        if (UsedZeroVector)
+          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
+      }
        V1 = DAG.getBitcast(NewVT, V1);
        V2 = DAG.getBitcast(NewVT, V2);
        return DAG.getBitcast(
diff --git a/test/CodeGen/X86/avx-cast.ll b/test/CodeGen/X86/avx-cast.ll

index 4240f798bf90c79227a00674c74251a4fab11167..92d9319a18771f4ec7aa36943eb267a36a63fc2f 100644 (file)
--- a/test/CodeGen/X86/avx-cast.ll
+++ b/test/CodeGen/X86/avx-cast.ll
@@ -9,9 +9,7 @@
  define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
  ; AVX-LABEL: castA:
  ; AVX:       ## %bb.0:
-; AVX-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovaps %xmm0, %xmm0
  ; AVX-NEXT:    retq
    %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
    ret <8 x float> %shuffle.i
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll

index 54094c39e460d237a996fbf64591766a22d21343..6f5b7d89dfa3e1a41b08f9f99539d6929e75e42d 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2890,31 +2890,15 @@ define <32 x i8> @zeroable_src_to_zext(<32 x i8> %a0) {
  ; AVX1-LABEL: zeroable_src_to_zext:
  ; AVX1:       # %bb.0:
  ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX1-NEXT:    retq
  ;
-; AVX2-LABEL: zeroable_src_to_zext:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: zeroable_src_to_zext:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: zeroable_src_to_zext:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,33,0,0,0,0,0,0,34,35,0,0,0,0,0,0,36,37,16,16,16,16,16,16,38,39,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVBMI-NEXT:    vpermt2b %ymm0, %ymm2, %ymm1
-; AVX512VLVBMI-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX2OR512VL-LABEL: zeroable_src_to_zext:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2OR512VL-NEXT:    retq
    %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %2 = shufflevector <32 x i8> %1, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 20, i32 21, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 22, i32 23, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
    ret <32 x i8> %2
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 12 Jul 2018 13:29:41 +0000 (13:29 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Thu, 12 Jul 2018 13:29:41 +0000 (13:29 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/avx-cast.ll		patch | blob | history
test/CodeGen/X86/vector-shuffle-256-v32.ll		patch | blob | history