From d005962cad83b1821e318d6506351e9cadde2942 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 1 Nov 2017 21:52:29 +0000 Subject: [PATCH] [X86][SSE] Add PACKUS support to LowerTruncate Similar to the existing code to lower to PACKSS, we can use PACKUS if the input vector's leading zero bits extend all the way to the packed/truncated value. We have to account for pre-SSE41 targets not supporting PACKUSDW git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317128 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 38 ++++++++---- test/CodeGen/X86/avg.ll | 21 +++---- test/CodeGen/X86/combine-srl.ll | 5 +- test/CodeGen/X86/pr31773.ll | 26 +++----- test/CodeGen/X86/psubus.ll | 99 +++++++++++++----------------- test/CodeGen/X86/vector-trunc.ll | 23 ++++--- 6 files changed, 101 insertions(+), 111 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5a33a49536f..b178ad6c13e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16074,15 +16074,16 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } -/// Helper to recursively truncate vector elements in half with PACKSS. -/// It makes use of the fact that vectors with enough leading sign bits -/// prevent the PACKSS from saturating the results. -/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates +/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. +/// It makes use of the fact that vectors with enough leading sign/zero bits +/// prevent the PACKSS/PACKUS from saturating the results. +/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - assert(Opcode == X86ISD::PACKSS && "Unexpected PACK opcode"); + assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && + "Unexpected PACK opcode"); // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) @@ -16114,9 +16115,10 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); // Pack to the largest type possible: - // vXi64/vXi32 -> PACKSSDW and vXi16 -> PACKSSWB. + // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. EVT InVT = MVT::i16, OutVT = MVT::i8; - if (DstVT.getScalarSizeInBits() > 8) { + if (DstVT.getScalarSizeInBits() > 8 && + (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { InVT = MVT::i32; OutVT = MVT::i16; } @@ -16125,7 +16127,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); - // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors. + // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); @@ -16133,14 +16135,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, return DAG.getBitcast(DstVT, Res); } - // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors. - // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS). + // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. + // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); - // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), + // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). Res = DAG.getBitcast(MVT::v4i64, Res); Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); @@ -16208,6 +16210,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); + unsigned InNumEltBits = InVT.getScalarSizeInBits(); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); @@ -16227,11 +16230,22 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // Truncate with PACKSS if we are truncating a vector with sign-bits that // extend all the way to the packed/truncated value. unsigned NumPackedBits = std::min(VT.getScalarSizeInBits(), 16); - if ((InVT.getScalarSizeInBits() - NumPackedBits) < DAG.ComputeNumSignBits(In)) + if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; + // Truncate with PACKUS if we are truncating a vector with leading zero bits + // that extend all the way to the packed/truncated value. + // Pre-SSE41 we can only use PACKUSWB. + KnownBits Known; + DAG.computeKnownBits(In, Known); + NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; + if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros()) + if (SDValue V = + truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) + return V; + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 5b36ccf4a54..508f10e9889 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -2905,17 +2905,16 @@ define void @avg_v32i16_const(<32 x i16>* %a) nounwind { ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 1d064b36ee6..9f7f8a97dc2 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -181,9 +181,8 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; AVX-LABEL: combine_vec_lshr_trunc_lshr0: ; AVX: # BB#0: ; AVX-NEXT: vpsrlq $48, %ymm0, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = lshr <4 x i64> %x, diff --git a/test/CodeGen/X86/pr31773.ll b/test/CodeGen/X86/pr31773.ll index 5b1ea9a948a..d7ae04bf238 100644 --- a/test/CodeGen/X86/pr31773.ll +++ b/test/CodeGen/X86/pr31773.ll @@ -7,14 +7,11 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { ; AVX-LABEL: usat_trunc_wb_256: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX-NEXT: vpminuw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpminuw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -32,14 +29,11 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) { ; AVX-LABEL: usat_trunc_dw_256: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX-NEXT: vpminud %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll index e8bb3b86218..6e38f06a0f8 100644 --- a/test/CodeGen/X86/psubus.ll +++ b/test/CodeGen/X86/psubus.ll @@ -1555,14 +1555,11 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; AVX1-LABEL: psubus_8i32_max: ; AVX1: # BB#0: # %vector.ph -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1571,8 +1568,8 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { ; AVX2: # BB#0: # %vector.ph ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2101,20 +2098,15 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; ; AVX1-LABEL: psubus_16i32_max: ; AVX1: # BB#0: # %vector.ph -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpminud %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpminud %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -2130,23 +2122,22 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; ; AVX2-LABEL: psubus_16i32_max: ; AVX2: # BB#0: # %vector.ph -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpminud %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpackusdw %ymm0, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsubusw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubusw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2239,14 +2230,11 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin ; ; AVX1-LABEL: psubus_i16_i32_max_swapped: ; AVX1: # BB#0: # %vector.ph -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2255,8 +2243,8 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin ; AVX2: # BB#0: # %vector.ph ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2353,14 +2341,11 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; AVX1-LABEL: psubus_i16_i32_min: ; AVX1: # BB#0: # %vector.ph -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2369,8 +2354,8 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { ; AVX2: # BB#0: # %vector.ph ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index e9da847d0ad..dc08d88074d 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -483,9 +483,8 @@ define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) { ; AVX2-LABEL: trunc8i32_8i16_lshr: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -814,12 +813,12 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i16_lshr: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1901,8 +1900,8 @@ define void @PR34773(i16* %a0, i8* %a1) { ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1916,8 +1915,8 @@ define void @PR34773(i16* %a0, i8* %a1) { ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rsi) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -- 2.40.0