From 4f8964c97278ac61fc229690d3a6c538579d52de Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 24 Dec 2018 19:40:20 +0000
Subject: [PATCH] [X86] Use GetDemandedBits to simplify the operands of
 PMULDQ/PMULUDQ.

This is an alternative to what I attempted in D56057.

GetDemandedBits is a special version of SimplifyDemandedBits that allows
simplifications even when the operand has other uses. GetDemandedBits will
only do simplifications that allow a node to be bypassed. It won't create
new nodes or alter any of the other users.

I had to add support for bypassing SIGN_EXTEND_INREG to GetDemandedBits.

Based on a patch that Simon Pilgrim sent me in email.

Fixes PR40142.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350059 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  9 ++++
 lib/Target/X86/X86ISelLowering.cpp            |  9 ++++
 test/CodeGen/X86/avx2-intrinsics-fast-isel.ll |  9 ----
 .../X86/avx512-intrinsics-fast-isel.ll        |  9 ----
 test/CodeGen/X86/pmul.ll                      | 45 ++++---------------
 test/CodeGen/X86/sse2-intrinsics-fast-isel.ll | 28 ++----------
 .../CodeGen/X86/sse41-intrinsics-fast-isel.ll | 17 +------
 7 files changed, 32 insertions(+), 94 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b4947de31c9..f38770b773f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2118,6 +2118,15 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &Mask) {
       return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
     break;
   }
+  case ISD::SIGN_EXTEND_INREG:
+    EVT ExVT = cast<VTSDNode>(V.getOperand(1))->getVT();
+    unsigned ExVTBits = ExVT.getScalarSizeInBits();
+
+    // If none of the extended bits are demanded, eliminate the sextinreg.
+    if (Mask.getActiveBits() <= ExVTBits)
+      return V.getOperand(0);
+
+    break;
   }
   return SDValue();
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1fb1aa12de6..978a7adb304 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -41212,6 +41212,15 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;

+  // Aggressively peek through ops to get at the demanded low bits.
+  APInt DemandedMask = APInt::getLowBitsSet(64, 32);
+  SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
+  SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
+  if (DemandedLHS || DemandedRHS)
+    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+                       DemandedLHS ? DemandedLHS : LHS,
+                       DemandedRHS ? DemandedRHS : RHS);
+
   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index e43e2d57b26..d230ffe7073 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1823,12 +1823,6 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm2
-; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm2
-; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
@@ -1843,9 +1837,6 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epu32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 2b2424db196..53eb64f7b11 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1651,10 +1651,6 @@ define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
-; CHECK-NEXT:    vpsraq $32, %zmm0, %zmm0
-; CHECK-NEXT:    vpsllq $32, %zmm1, %zmm1
-; CHECK-NEXT:    vpsraq $32, %zmm1, %zmm1
 ; CHECK-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -1718,11 +1714,6 @@ entry:
 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epu32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k0
-; CHECK-NEXT:    knotw %k0, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
 ; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 72ddc673517..f65d37a7e5e 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1388,29 +1388,13 @@ define <2 x i64> @pmuldq_square(<2 x i64> %x) {
 ;
 ; SSE41-LABEL: pmuldq_square:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psllq $32, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    pmuldq %xmm1, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pmuldq %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: pmuldq_square:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: pmuldq_square:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: pmuldq_square:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shl <2 x i64> %x, <i64 32, i64 32>
   %2 = ashr exact <2 x i64> %1, <i64 32, i64 32>
   %3 = mul nsw <2 x i64> %2, %2
@@ -1418,24 +1402,13 @@ define <2 x i64> @pmuldq_square(<2 x i64> %x) {
 }

 define <2 x i64> @pmuludq_square(<2 x i64> %x) {
-; SSE2-LABEL: pmuludq_square:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pmuludq %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: pmuludq_square:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    pmuludq %xmm1, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: pmuludq_square:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmuludq %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: pmuludq_square:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; AVX-NEXT:    vpmuludq %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 72a4bf34676..7fd3dc59cf1 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2758,23 +2758,13 @@ define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone

 define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
-; X86-SSE-LABEL: test_mm_mul_epu32:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X86-SSE-NEXT:    # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; X86-SSE-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
-; X86-SSE-NEXT:    pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
-; X86-SSE-NEXT:    pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
-; X86-SSE-NEXT:    retl # encoding: [0xc3]
+; SSE-LABEL: test_mm_mul_epu32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
+; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX1-LABEL: test_mm_mul_epu32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xef,0xd2]
-; AVX1-NEXT:    vpblendw $204, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xcc]
-; AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpblendw $204, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xcc]
-; AVX1-NEXT:    # xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
@@ -2787,16 +2777,6 @@ define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; AVX512-NEXT:    # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; X64-SSE-LABEL: test_mm_mul_epu32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X64-SSE-NEXT:    # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; X64-SSE-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
-; X64-SSE-NEXT:    pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
-; X64-SSE-NEXT:    pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
-; X64-SSE-NEXT:    retq # encoding: [0xc3]
   %A = and <2 x i64> %a0, <i64 4294967295, i64 4294967295>
   %B = and <2 x i64> %a1, <i64 4294967295, i64 4294967295>
   %res = mul nuw <2 x i64> %A, %B
diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index dd82bef1d11..9990ac00eb0 100644
--- a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -832,26 +832,11 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
 define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_mm_mul_epi32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psllq $32, %xmm2
-; SSE-NEXT:    psrad $31, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psllq $32, %xmm0
-; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE-NEXT:    pmuldq %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pmuldq %xmm1, %xmm0
 ; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test_mm_mul_epi32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    ret{{[l|q]}}
 ;
-- 
2.50.1
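
Illustrative sketch of the multi-use case the commit message describes (a hypothetical IR function, not taken from the patch or its test files): the store gives %s a second user, so an in-place SimplifyDemandedBits rewrite of %s is not possible, but GetDemandedBits lets the PMULDQ combine bypass the shl/ashr sign-extension pair for the multiply's operands, since PMULDQ/PMULUDQ only read the low 32 bits of each 64-bit element.

define <2 x i64> @pmuldq_with_extra_use(<2 x i64> %x, <2 x i64>* %p) {
  ; shl+ashr by 32 is the sext_inreg pattern on each 64-bit element.
  %shl = shl <2 x i64> %x, <i64 32, i64 32>
  %s = ashr exact <2 x i64> %shl, <i64 32, i64 32>
  ; Extra use of %s: any rewrite of %s itself would have to preserve this.
  store <2 x i64> %s, <2 x i64>* %p
  ; Lowered with PMULDQ, which only demands the low 32 bits of each element,
  ; so the sign-extension can be bypassed for this one use.
  %mul = mul nsw <2 x i64> %s, %s
  ret <2 x i64> %mul
}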