From cdfe078a42f95b07294f711d84b965fd6ef45ad5 Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Wed, 29 Jun 2016 16:56:09 +0000 Subject: [PATCH] [X86] Lower blended PACKUSes using appropriate types. When lowering two blended PACKUS, we used to disregard the types of the PACKUS inputs, indiscriminately generating a v16i8 PACKUS. This leads to non-selectable things like: (v16i8 (PACKUS (v4i32 v0), (v4i32 v1))) Instead, check that the PACKUSes have the same type, and use that as the final result type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274138 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 25 +++++----- test/CodeGen/X86/vector-shuffle-sse41.ll | 59 ++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 test/CodeGen/X86/vector-shuffle-sse41.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 859c27afe44..7d6f5c578fb 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -8943,22 +8943,25 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); - // If we have a blend of two PACKUS operations an the blend aligns with the - // low and half halves, we can just merge the PACKUS operations. This is - // particularly important as it lets us merge shuffles that this routine itself - // creates. + // If we have a blend of two same-type PACKUS operations and the blend aligns + // with the low and high halves, we can just merge the PACKUS operations. + // This is particularly important as it lets us merge shuffles that this + // routine itself creates. auto GetPackNode = [](SDValue V) { V = peekThroughBitcasts(V); return V.getOpcode() == X86ISD::PACKUS ? 
V : SDValue(); }; if (SDValue V1Pack = GetPackNode(V1)) - if (SDValue V2Pack = GetPackNode(V2)) - return DAG.getBitcast(MVT::v2i64, - DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, - Mask[0] == 0 ? V1Pack.getOperand(0) - : V1Pack.getOperand(1), - Mask[1] == 2 ? V2Pack.getOperand(0) - : V2Pack.getOperand(1))); + if (SDValue V2Pack = GetPackNode(V2)) { + EVT PackVT = V1Pack.getValueType(); + if (PackVT == V2Pack.getValueType()) + return DAG.getBitcast(MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, PackVT, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); + } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, diff --git a/test/CodeGen/X86/vector-shuffle-sse41.ll b/test/CodeGen/X86/vector-shuffle-sse41.ll new file mode 100644 index 00000000000..be9a4b95077 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-sse41.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX + +define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) { +; SSE41-LABEL: blend_packusdw: +; SSE41: # BB#0: +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_packusdw: +; AVX: # BB#0: +; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) + %p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3) + %s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x i16> %s0 +} + +define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) { +; SSE41-LABEL: blend_packuswb: +; SSE41: # BB#0: +; SSE41-NEXT: packuswb %xmm2, 
%xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_packuswb: +; AVX: # BB#0: +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) + %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3) + %s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + ret <16 x i8> %s0 +} + +define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) { +; SSE41-LABEL: blend_packusdw_packuswb: +; SSE41: # BB#0: +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_packusdw_packuswb: +; AVX: # BB#0: +; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) + %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3) + %b1 = bitcast <16 x i8> %p1 to <8 x i16> + %s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x i16> %s0 +} + +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) -- 2.50.0