assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
- // If we have a blend of two PACKUS operations an the blend aligns with the
- // low and half halves, we can just merge the PACKUS operations. This is
- // particularly important as it lets us merge shuffles that this routine itself
- // creates.
+ // If we have a blend of two same-type PACKUS operations and the blend aligns
+ // with the low and high halves, we can just merge the PACKUS operations.
+ // This is particularly important as it lets us merge shuffles that this
+ // routine itself creates.
auto GetPackNode = [](SDValue V) {
V = peekThroughBitcasts(V);
return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
};
if (SDValue V1Pack = GetPackNode(V1))
- if (SDValue V2Pack = GetPackNode(V2))
- return DAG.getBitcast(MVT::v2i64,
- DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
- Mask[0] == 0 ? V1Pack.getOperand(0)
- : V1Pack.getOperand(1),
- Mask[1] == 2 ? V2Pack.getOperand(0)
- : V2Pack.getOperand(1)));
+ if (SDValue V2Pack = GetPackNode(V2)) {
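+ // Merging is only safe when both PACKUS nodes produce the same value type;
+ // blending e.g. a PACKUSDW result with a PACKUSWB result mixes different
+ // pack element widths and cannot fold into a single pack instruction.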
+ EVT PackVT = V1Pack.getValueType();
+ if (PackVT == V2Pack.getValueType())
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, PackVT,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+ }
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
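+; Blend of the low halves of two PACKUSDW results: both packs have the same
+; type, so the shuffle should fold into a single packusdw of %a0 and %a2.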
+define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; SSE41-LABEL: blend_packusdw:
+; SSE41: # BB#0:
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packusdw:
+; AVX: # BB#0:
+; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
+ %s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %s0
+}
+
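+; Same fold for PACKUSWB: the blend selects the low half of each pack, so a
+; single packuswb of %a0 and %a2 suffices.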
+define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packuswb:
+; SSE41: # BB#0:
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packuswb:
+; AVX: # BB#0:
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+ %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+ %s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i8> %s0
+}
+
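+; Negative test: PACKUSDW and PACKUSWB pack different element widths, so the
+; blend must not be merged; expect two separate packs plus an unpack instead.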
+define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packusdw_packuswb:
+; SSE41: # BB#0:
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packusdw_packuswb:
+; AVX: # BB#0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+ %b1 = bitcast <16 x i8> %p1 to <8 x i16>
+ %s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %s0
+}
+
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)