From 62ea4baf5fa2d922e06e01c93804a33db22fee5d Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 12 Aug 2019 22:18:23 +0000
Subject: [PATCH] [X86] Allow combineTruncateWithSat to use pack instructions
 for i16->i8 without AVX512BW.

We need AVX512BW to be able to truncate an i16 vector. If we don't have
that, we have to extend i16->i32 and then truncate i32->i8. But we won't
be able to remove the min/max if we do that, at least not without more
special handling.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368623 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          |  3 +-
 test/CodeGen/X86/avx512-trunc.ll            | 26 +++++---------
 test/CodeGen/X86/masked_store_trunc_ssat.ll | 21 +++--------
 test/CodeGen/X86/vector-trunc-packus.ll     | 40 +++++----------------
 test/CodeGen/X86/vector-trunc-ssat.ll       | 38 +++++---------------
 5 files changed, 30 insertions(+), 98 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f49a425329b..90d56f56528 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39024,7 +39024,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
   }
   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
-      !Subtarget.hasAVX512() &&
+      !(Subtarget.hasAVX512() && InSVT == MVT::i32) &&
+      !(Subtarget.hasBWI() && InSVT == MVT::i16) &&
       (SVT == MVT::i8 || SVT == MVT::i16) &&
       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
     if (auto USatVal = detectSSatPattern(In, VT, true)) {
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index e62435df8cb..572b6065b2f 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -753,11 +753,9 @@ define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
 define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-LABEL: smax_usat_trunc_wb_256_mem1:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vmovdqu %xmm0, (%rdi)
 ; KNL-NEXT: vzeroupper
 ; KNL-NEXT: retq
 ;
@@ -781,11 +779,9 @@ define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
 define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-LABEL: smax_usat_trunc_wb_256_mem2:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vmovdqu %xmm0, (%rdi)
 ; KNL-NEXT: vzeroupper
 ; KNL-NEXT: retq
 ;
@@ -808,11 +804,8 @@ define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
 define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
 ; KNL-LABEL: smax_usat_trunc_wb_256:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; KNL-NEXT: vzeroupper
 ; KNL-NEXT: retq
 ;
@@ -834,9 +827,6 @@ define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
 define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
 ; KNL-LABEL: smax_usat_trunc_wb_128_mem:
 ; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
 ; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; KNL-NEXT: vmovq %xmm0, (%rdi)
 ; KNL-NEXT: retq
diff --git a/test/CodeGen/X86/masked_store_trunc_ssat.ll b/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 34e50ef02ff..c0dd2893c79 100644
--- a/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -6452,17 +6452,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
-; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT: vpmovmskb %ymm2, %eax
 ; AVX512F-NEXT: notl %eax
 ; AVX512F-NEXT: testb $1, %al
@@ -7200,10 +7191,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT: vpmovmskb %xmm1, %eax
 ; AVX512F-NEXT: xorl $65535, %eax # imm = 0xFFFF
 ; AVX512F-NEXT: testb $1, %al
@@ -7558,8 +7547,6 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, <8 x i8>* %p, <8 x i16> %mask)
 ; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll
index db809666cd9..01e68925744 100644
--- a/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/test/CodeGen/X86/vector-trunc-packus.ll
@@ -2961,21 +2961,15 @@ define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
 ;
 ; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -3029,32 +3023,14 @@ define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
 ;
 ; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll
index 7bdacc90336..96ca65ce3c9 100644
--- a/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -2941,19 +2941,15 @@ define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) {
 ;
 ; AVX512F-LABEL: trunc_ssat_v16i16_v16i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -3004,32 +3000,14 @@ define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) {
 ;
 ; AVX512F-LABEL: trunc_ssat_v32i16_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8:
-- 
2.40.0
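
For context, a minimal illustrative LLVM IR sketch of the kind of signed-saturation pattern this combine handles (the function and value names below are made up, not taken from the test files): a <16 x i16> value is clamped to the i8 range with icmp/select, then truncated; detectSSatPattern matches the equivalent smin/smax nodes in the DAG. With this change, an AVX512F-only target (no AVX512BW) lowers it with vextracti128 + vpacksswb, as the updated trunc_ssat_v16i16_v16i8 checks show, instead of extending to i32 and going through vpmovdb.

; Illustrative only: clamp to [-128, 127], then truncate i16 -> i8.
define <16 x i8> @ssat_trunc_example(<16 x i16> %a) {
  ; smin(%a, 127)
  %c1 = icmp slt <16 x i16> %a, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %min = select <16 x i1> %c1, <16 x i16> %a, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  ; smax(smin(%a, 127), -128)
  %c2 = icmp sgt <16 x i16> %min, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  %max = select <16 x i1> %c2, <16 x i16> %min, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  ; The saturating truncate that the combine can now lower to a pack.
  %t = trunc <16 x i16> %max to <16 x i8>
  ret <16 x i8> %t
}

The same shape with a [0, 255] clamp corresponds to the unsigned-saturation cases, which lower to vpackuswb in the updated packus tests.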