From ec998a2f4e3f41ee3561833c1a8b53c13bb0786f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 12 Oct 2019 07:59:29 +0000 Subject: [PATCH] [X86] Use pack instructions for packus/ssat truncate patterns when 256-bit is the largest legal vector and the result type is at least 256 bits. Since the input type is larger than 256 bits, we'll need to do some concatenating to reassemble the results. The pack instructions' ability to concatenate while packing makes this a shorter/faster sequence. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374643 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 5 +++- test/CodeGen/X86/vector-trunc-packus.ll | 38 ++++++++----------------- test/CodeGen/X86/vector-trunc-ssat.ll | 35 +++++++---------------- 3 files changed, 27 insertions(+), 51 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 81df19b827f..6838dbbd08d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -39869,9 +39869,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, // vXi16 truncate instructions are only available with AVX512BW. // For 256-bit or smaller vectors, we require VLX. // FIXME: We could widen truncates to 512 to remove the VLX restriction. + // If the result type is 256-bits or larger and we have disable 512-bit + // registers, we should go ahead and use the pack instructions if possible. 
bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) || (Subtarget.hasBWI() && InSVT == MVT::i16)) && - (Subtarget.hasVLX() || InVT.getSizeInBits() > 256); + (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) && + !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256); if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 && diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll index 48da85dee83..8867aaaeddd 100644 --- a/test/CodeGen/X86/vector-trunc-packus.ll +++ b/test/CodeGen/X86/vector-trunc-packus.ll @@ -2095,13 +2095,8 @@ define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32>* %p0) "min-legal-vector ; ; SKX-LABEL: trunc_packus_v16i32_v16i16: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535] -; SKX-NEXT: vpminsd (%rdi), %ymm0, %ymm1 -; SKX-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 -; SKX-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 -; SKX-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovdqa (%rdi), %ymm0 +; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <16 x i32>, <16 x i32>* %p0 @@ -4943,13 +4938,8 @@ define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16>* %p0) "min-legal-vector-w ; ; SKX-LABEL: trunc_packus_v32i16_v32i8: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SKX-NEXT: vpminsw (%rdi), %ymm0, %ymm1 -; SKX-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; SKX-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 -; SKX-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovdqa (%rdi), %ymm0 +; SKX-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <32 x i16>, <32 x i16>* %p0 @@ -5015,18 
+5005,14 @@ define <32 x i8> @trunc_packus_v32i32_v32i8(<32 x i32>* %p0) "min-legal-vector-w ; ; SKX-LABEL: trunc_packus_v32i32_v32i8: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX-NEXT: vpmaxsd 96(%rdi), %ymm0, %ymm1 -; SKX-NEXT: vpmovusdb %ymm1, %xmm1 -; SKX-NEXT: vpmaxsd 64(%rdi), %ymm0, %ymm2 -; SKX-NEXT: vpmovusdb %ymm2, %xmm2 -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; SKX-NEXT: vpmaxsd 32(%rdi), %ymm0, %ymm2 -; SKX-NEXT: vpmovusdb %ymm2, %xmm2 -; SKX-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpmovusdb %ymm0, %xmm0 -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vmovdqa (%rdi), %ymm0 +; SKX-NEXT: vmovdqa 64(%rdi), %ymm1 +; SKX-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1 +; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; SKX-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; SKX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <32 x i32>, <32 x i32>* %p0 %1 = icmp slt <32 x i32> %a0, diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll index 519b917ab95..729ab00275c 100644 --- a/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/test/CodeGen/X86/vector-trunc-ssat.ll @@ -1878,13 +1878,8 @@ define <16 x i16> @trunc_ssat_v16i32_v16i16(<16 x i32>* %p0) "min-legal-vector-w ; ; SKX-LABEL: trunc_ssat_v16i32_v16i16: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767] -; SKX-NEXT: vpminsd (%rdi), %ymm0, %ymm1 -; SKX-NEXT: vpminsd 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; SKX-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 -; SKX-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 -; SKX-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovdqa (%rdi), %ymm0 +; SKX-NEXT: vpackssdw 32(%rdi), 
%ymm0, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <16 x i32>, <16 x i32>* %p0 @@ -4823,13 +4818,8 @@ define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16>* %p0) "min-legal-vector-wid ; ; SKX-LABEL: trunc_ssat_v32i16_v32i8: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa {{.*#+}} ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; SKX-NEXT: vpminsw (%rdi), %ymm0, %ymm1 -; SKX-NEXT: vpminsw 32(%rdi), %ymm0, %ymm0 -; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] -; SKX-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; SKX-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 -; SKX-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vmovdqa (%rdi), %ymm0 +; SKX-NEXT: vpacksswb 32(%rdi), %ymm0, %ymm0 ; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <32 x i16>, <32 x i16>* %p0 @@ -4895,16 +4885,13 @@ define <32 x i8> @trunc_ssat_v32i32_v32i8(<32 x i32>* %p0) "min-legal-vector-wid ; SKX-LABEL: trunc_ssat_v32i32_v32i8: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa (%rdi), %ymm0 -; SKX-NEXT: vmovdqa 32(%rdi), %ymm1 -; SKX-NEXT: vmovdqa 64(%rdi), %ymm2 -; SKX-NEXT: vmovdqa 96(%rdi), %ymm3 -; SKX-NEXT: vpmovsdb %ymm3, %xmm3 -; SKX-NEXT: vpmovsdb %ymm2, %xmm2 -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SKX-NEXT: vpmovsdb %ymm1, %xmm1 -; SKX-NEXT: vpmovsdb %ymm0, %xmm0 -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SKX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; SKX-NEXT: vmovdqa 64(%rdi), %ymm1 +; SKX-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1 +; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; SKX-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; SKX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; SKX-NEXT: retq %a0 = load <32 x i32>, <32 x i32>* %p0 %1 = icmp slt <32 x i32> %a0, -- 2.40.0