From 4f8964c97278ac61fc229690d3a6c538579d52de Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 24 Dec 2018 19:40:20 +0000
Subject: [PATCH] [X86] Use GetDemandedBits to simplify the operands of
 PMULDQ/PMULUDQ.

This is an alternative to what I attempted in D56057.

GetDemandedBits is a special version of SimplifyDemandedBits that allows
simplifications even when the operand has other uses. GetDemandedBits will
only do simplifications that allow a node to be bypassed. It won't create
new nodes or alter any of the other users.

I had to add support for bypassing SIGN_EXTEND_INREG to GetDemandedBits.

Based on a patch that Simon Pilgrim sent me in email.

Fixes PR40142.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350059 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  9 ++++
 lib/Target/X86/X86ISelLowering.cpp            |  9 ++++
 test/CodeGen/X86/avx2-intrinsics-fast-isel.ll |  9 ----
 .../X86/avx512-intrinsics-fast-isel.ll        |  9 ----
 test/CodeGen/X86/pmul.ll                      | 45 ++++---------------
 test/CodeGen/X86/sse2-intrinsics-fast-isel.ll | 28 ++----------
 .../CodeGen/X86/sse41-intrinsics-fast-isel.ll | 17 +------
 7 files changed, 32 insertions(+), 94 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b4947de31c9..f38770b773f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2118,6 +2118,15 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &Mask) {
       return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
     break;
   }
+  case ISD::SIGN_EXTEND_INREG:
+    EVT ExVT = cast<VTSDNode>(V.getOperand(1))->getVT();
+    unsigned ExVTBits = ExVT.getScalarSizeInBits();
+
+    // If none of the extended bits are demanded, eliminate the sextinreg.
+    if (Mask.getActiveBits() <= ExVTBits)
+      return V.getOperand(0);
+
+    break;
   }
   return SDValue();
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1fb1aa12de6..978a7adb304 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -41212,6 +41212,15 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;

+  // Aggressively peek through ops to get at the demanded low bits.
+  APInt DemandedMask = APInt::getLowBitsSet(64, 32);
+  SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
+  SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
+  if (DemandedLHS || DemandedRHS)
+    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+                       DemandedLHS ? DemandedLHS : LHS,
+                       DemandedRHS ? DemandedRHS : RHS);
+
   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index e43e2d57b26..d230ffe7073 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1823,12 +1823,6 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm2
-; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm2
-; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
@@ -1843,9 +1837,6 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epu32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 2b2424db196..53eb64f7b11 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1651,10 +1651,6 @@ define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
-; CHECK-NEXT:    vpsraq $32, %zmm0, %zmm0
-; CHECK-NEXT:    vpsllq $32, %zmm1, %zmm1
-; CHECK-NEXT:    vpsraq $32, %zmm1, %zmm1
 ; CHECK-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -1718,11 +1714,6 @@ entry:
 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epu32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k0
-; CHECK-NEXT:    knotw %k0, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm1 {%k1} {z}
 ; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 72ddc673517..f65d37a7e5e 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1388,29 +1388,13 @@ define <2 x i64> @pmuldq_square(<2 x i64> %x) {
 ;
 ; SSE41-LABEL: pmuldq_square:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psllq $32, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    pmuldq %xmm1, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pmuldq %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: pmuldq_square:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: pmuldq_square:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: pmuldq_square:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = shl <2 x i64> %x, <i64 32, i64 32>
   %2 = ashr exact <2 x i64> %1, <i64 32, i64 32>
   %3 = mul nsw <2 x i64> %2, %2
@@ -1418,24 +1402,13 @@ define <2 x i64> @pmuldq_square(<2 x i64> %x) {
 }

 define <2 x i64> @pmuludq_square(<2 x i64> %x) {
-; SSE2-LABEL: pmuludq_square:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pmuludq %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: pmuludq_square:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    pmuludq %xmm1, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: pmuludq_square:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmuludq %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: pmuludq_square:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; AVX-NEXT:    vpmuludq %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 72a4bf34676..7fd3dc59cf1 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2758,23 +2758,13 @@ define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone

 define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
-; X86-SSE-LABEL: test_mm_mul_epu32:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X86-SSE-NEXT:    # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; X86-SSE-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
-; X86-SSE-NEXT:    pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
-; X86-SSE-NEXT:    pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
-; X86-SSE-NEXT:    retl # encoding: [0xc3]
+; SSE-LABEL: test_mm_mul_epu32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
+; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX1-LABEL: test_mm_mul_epu32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xef,0xd2]
-; AVX1-NEXT:    vpblendw $204, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xcc]
-; AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpblendw $204, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xcc]
-; AVX1-NEXT:    # xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
@@ -2787,16 +2777,6 @@ define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; AVX512-NEXT:    # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
 ; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; X64-SSE-LABEL: test_mm_mul_epu32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; X64-SSE-NEXT:    # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
-; X64-SSE-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
-; X64-SSE-NEXT:    pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
-; X64-SSE-NEXT:    pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
-; X64-SSE-NEXT:    retq # encoding: [0xc3]
   %A = and <2 x i64> %a0, <i64 4294967295, i64 4294967295>
   %B = and <2 x i64> %a1, <i64 4294967295, i64 4294967295>
   %res = mul nuw <2 x i64> %A, %B
diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index dd82bef1d11..9990ac00eb0 100644
--- a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -832,26 +832,11 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
 define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_mm_mul_epi32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psllq $32, %xmm2
-; SSE-NEXT:    psrad $31, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psllq $32, %xmm0
-; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; SSE-NEXT:    pmuldq %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    pmuldq %xmm1, %xmm0
 ; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test_mm_mul_epi32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    ret{{[l|q]}}
 ;
-- 
2.50.1
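
Illustrative sketch of the multi-use case the commit message describes (a hypothetical IR function, not taken from the patch or its test files): the store gives %s a second user, so an in-place SimplifyDemandedBits rewrite of %s is not possible, but GetDemandedBits lets the PMULDQ combine bypass the shl/ashr sign-extension pair for the multiply's operands, since PMULDQ/PMULUDQ only read the low 32 bits of each 64-bit element.

define <2 x i64> @pmuldq_with_extra_use(<2 x i64> %x, <2 x i64>* %p) {
  ; shl+ashr by 32 is the sext_inreg pattern on each 64-bit element.
  %shl = shl <2 x i64> %x, <i64 32, i64 32>
  %s = ashr exact <2 x i64> %shl, <i64 32, i64 32>
  ; Extra use of %s: any rewrite of %s itself would have to preserve this.
  store <2 x i64> %s, <2 x i64>* %p
  ; Lowered with PMULDQ, which only demands the low 32 bits of each element,
  ; so the sign-extension can be bypassed for this one use.
  %mul = mul nsw <2 x i64> %s, %s
  ret <2 x i64> %mul
}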