From 70a8e547383d48e14a67fd84894cb39a6c83e63a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 11 Oct 2019 00:38:51 +0000 Subject: [PATCH] [X86] Improve the AVX512 bailout in combineTruncateWithSat to allow pack instructions in more situations. If we don't have VLX we won't end up selecting a saturating truncate for 256-bit or smaller vectors so we should just use the pack lowering. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374487 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 11 +- test/CodeGen/X86/masked_store_trunc_ssat.ll | 45 ++----- test/CodeGen/X86/pmaddubsw.ll | 130 ++++++-------------- test/CodeGen/X86/vector-trunc-packus.ll | 55 +++------ test/CodeGen/X86/vector-trunc-ssat.ll | 54 +++----- 5 files changed, 97 insertions(+), 198 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b9cb85ed8d0..275e876644c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -39860,9 +39860,16 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, } } + // vXi32 truncate instructions are available with AVX512F. + // vXi16 truncate instructions are only available with AVX512BW. + // For 256-bit or smaller vectors, we require VLX. + // FIXME: We could widen truncates to 512 to remove the VLX restriction. + bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) || + (Subtarget.hasBWI() && InSVT == MVT::i16)) && + (Subtarget.hasVLX() || InVT.getSizeInBits() > 256); + if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) && - !(Subtarget.hasAVX512() && InSVT == MVT::i32) && - !(Subtarget.hasBWI() && InSVT == MVT::i16) && + !PreferAVX512 && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { if (auto USatVal = detectSSatPattern(In, VT, true)) { diff --git a/test/CodeGen/X86/masked_store_trunc_ssat.ll b/test/CodeGen/X86/masked_store_trunc_ssat.ll index 83bf33c4f7f..c170e079454 100644 --- a/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -4594,11 +4594,8 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB11_1 @@ -4665,11 +4662,8 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $24, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4977,11 +4971,9 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB12_1 @@ -5048,11 +5040,9 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5192,10 +5182,6 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; AVX512F-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] -; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -5235,10 +5221,6 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] -; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -7302,9 +7284,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kmovw %k0, %k1 -; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -7601,8 +7582,6 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, <8 x i8>* %p, <8 x i16> %mask) ; AVX512BW-NEXT: vptestmw %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 -; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git a/test/CodeGen/X86/pmaddubsw.ll b/test/CodeGen/X86/pmaddubsw.ll index 3a08e47092e..48da43c6311 100644 --- a/test/CodeGen/X86/pmaddubsw.ll +++ b/test/CodeGen/X86/pmaddubsw.ll @@ -349,53 +349,27 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { ; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: pmaddubsw_bad_extend: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpmulld %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: pmaddubsw_bad_extend: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512-NEXT: vpmovsxbd %xmm3, %ymm3 -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX512-NEXT: vpmulld %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX256-LABEL: pmaddubsw_bad_extend: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmulld %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX256-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, <16 x i8>* %Aptr %B = load <16 x i8>, <16 x i8>* %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -476,49 +450,25 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) { ; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: pmaddubsw_bad_indices: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: pmaddubsw_bad_indices: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd %xmm2, %ymm2 -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX256-LABEL: pmaddubsw_bad_indices: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] +; AVX256-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] +; AVX256-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX256-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX256-NEXT: vpmulld %ymm3, %ymm2, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, <16 x i8>* %Aptr %B = load <16 x i8>, <16 x i8>* %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> ;indices aren't all even diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll index 704ac81394b..e75b2b9d78f 100644 --- a/test/CodeGen/X86/vector-trunc-packus.ll +++ b/test/CodeGen/X86/vector-trunc-packus.ll @@ -1111,12 +1111,8 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_packus_v8i32_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1130,12 +1126,8 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_packus_v8i32_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2816,11 +2808,9 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_packus_v8i32_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2834,11 +2824,9 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_packus_v8i32_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2885,11 +2873,9 @@ define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { ; ; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2904,11 +2890,9 @@ define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { ; ; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3007,11 +2991,8 @@ define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) { ; ; AVX512BW-LABEL: trunc_packus_v16i16_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll index 54f0d35eb74..7ab62b95429 100644 --- a/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/test/CodeGen/X86/vector-trunc-ssat.ll @@ -1078,12 +1078,8 @@ define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_ssat_v8i32_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1095,12 +1091,8 @@ define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2795,11 +2787,9 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512F-LABEL: trunc_ssat_v8i32_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2811,11 +2801,9 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2860,11 +2848,9 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { ; ; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2877,11 +2863,9 @@ define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { ; ; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2976,10 +2960,8 @@ define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { ; ; AVX512BW-LABEL: trunc_ssat_v16i16_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -- 2.40.0