From a2b71f96a9d64aaccd23687d424d54bceb093160 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 8 Apr 2019 20:59:38 +0000
Subject: [PATCH] [TargetLowering] SimplifyDemandedBits - use DemandedElts in bitcast handling

Be more selective in the SimplifyDemandedBits -> SimplifyDemandedVectorElts
bitcast call based on the demanded elts.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@357942 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 25 +++++++++++----------
 test/CodeGen/X86/extract-insert.ll          | 11 +--------
 test/CodeGen/X86/vector-reduce-mul-widen.ll | 15 +++++--------
 test/CodeGen/X86/widen_load-2.ll            | 12 ++++------
 4 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5b89aceb9d4..89d9a55c3f7 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1394,6 +1394,7 @@ bool TargetLowering::SimplifyDemandedBits(
                              TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
       }
     }
+
     // If bitcast from a vector, see if we can use SimplifyDemandedVectorElts by
     // demanding the element if any bits from it are demanded.
     // TODO - bigendian once we have test coverage.
@@ -1401,26 +1402,26 @@ bool TargetLowering::SimplifyDemandedBits(
     if (SrcVT.isVector() && NumSrcEltBits > 1 &&
         (BitWidth % NumSrcEltBits) == 0 &&
         TLO.DAG.getDataLayout().isLittleEndian()) {
-      unsigned Scale = BitWidth / NumSrcEltBits;
-      auto GetDemandedSubMask = [&](APInt &DemandedSubElts) -> bool {
-        DemandedSubElts = APInt::getNullValue(Scale);
+      auto GetDemandedSrcMask = [&](APInt &DemandedSrcElts) -> bool {
+        unsigned Scale = BitWidth / NumSrcEltBits;
+        unsigned NumSrcElts = SrcVT.getVectorNumElements();
+        DemandedSrcElts = APInt::getNullValue(NumSrcElts);
         for (unsigned i = 0; i != Scale; ++i) {
           unsigned Offset = i * NumSrcEltBits;
           APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
           if (!Sub.isNullValue())
-            DemandedSubElts.setBit(i);
+            for (unsigned j = 0; j != NumElts; ++j)
+              if (DemandedElts[j])
+                DemandedSrcElts.setBit((j * Scale) + i);
         }
         return true;
       };
 
-      APInt DemandedSubElts;
-      if (GetDemandedSubMask(DemandedSubElts)) {
-        unsigned NumSrcElts = SrcVT.getVectorNumElements();
-        APInt DemandedElts = APInt::getSplat(NumSrcElts, DemandedSubElts);
-
-        APInt KnownUndef, KnownZero;
-        if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
-                                       TLO, Depth + 1))
+      APInt DemandedSrcElts;
+      if (GetDemandedSrcMask(DemandedSrcElts)) {
+        APInt KnownSrcUndef, KnownSrcZero;
+        if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
+                                       KnownSrcZero, TLO, Depth + 1))
           return true;
       }
     }
diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index be5f9ed24fb..97ca884a54b 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -28,30 +28,21 @@ define i8 @extractelt_bitcast(i32 %x) nounwind {
   ret i8 %ext
 }
 
-; TODO: This should have folded to avoid vector ops, but the transform
-; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU
-; codegen better.
-
 define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
 ; X86-LABEL: extractelt_bitcast_extra_use:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
-; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %ecx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractelt_bitcast_extra_use:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %edi, %xmm0
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %edi, (%rsi)
-; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %bc = bitcast i32 %x to <4 x i8>
diff --git a/test/CodeGen/X86/vector-reduce-mul-widen.ll b/test/CodeGen/X86/vector-reduce-mul-widen.ll
index e30da07e635..413336f06b5 100644
--- a/test/CodeGen/X86/vector-reduce-mul-widen.ll
+++ b/test/CodeGen/X86/vector-reduce-mul-widen.ll
@@ -1558,9 +1558,8 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pmullw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1606,9 +1605,8 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pmullw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pmullw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1676,9 +1674,8 @@ define i8 @test_v8i8(<8 x i8> %a0) {
 ; SSE2-NEXT:    packuswb %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pmullw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index ea8f4ff0528..e4ee28a8f88 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -362,11 +362,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
 ; X86-NEXT:    movb $1, 2(%ecx)
 ; X86-NEXT:    movw $257, (%ecx) # imm = 0x101
 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT:    movdqa %xmm0, %xmm1
-; X86-NEXT:    psrld $1, %xmm1
-; X86-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; X86-NEXT:    psrld $1, %xmm0
+; X86-NEXT:    pextrb $8, %xmm0, 2(%eax)
 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
 ; X86-NEXT:    pextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    retl $4
@@ -379,11 +377,9 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
 ; X64-NEXT:    movb $1, 2(%rdx)
 ; X64-NEXT:    movw $257, (%rdx) # imm = 0x101
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT:    movdqa %xmm0, %xmm1
-; X64-NEXT:    psrld $1, %xmm1
-; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; X64-NEXT:    psrld $1, %xmm0
+; X64-NEXT:    pextrb $8, %xmm0, 2(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
-- 
2.50.1
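
Illustrative note (not part of the patch, placed below the version trailer): the
rewritten GetDemandedSrcMask lambda only marks source element (j * Scale) + i as
demanded when result element j is demanded and the i-th NumSrcEltBits-wide chunk
of DemandedBits is non-zero, instead of splatting the per-element chunk mask
across every source element. The sketch below is a minimal standalone C++ model
of that mapping under little-endian layout; it uses no LLVM types, the helper
name computeDemandedSrcElts and the example values are invented for
illustration, and it assumes BitWidth <= 64.

#include <cstdint>
#include <cstdio>
#include <vector>

// Model of the demanded-elements mapping across a little-endian bitcast from
// NumSrcElts narrow elements of NumSrcEltBits bits each to wider elements of
// BitWidth bits, so each wide element covers Scale = BitWidth / NumSrcEltBits
// narrow elements.
std::vector<bool> computeDemandedSrcElts(uint64_t DemandedBits,
                                         const std::vector<bool> &DemandedElts,
                                         unsigned BitWidth,
                                         unsigned NumSrcEltBits,
                                         unsigned NumSrcElts) {
  unsigned Scale = BitWidth / NumSrcEltBits;
  std::vector<bool> DemandedSrcElts(NumSrcElts, false);
  for (unsigned i = 0; i != Scale; ++i) {
    // Mask for bits [i * NumSrcEltBits, (i + 1) * NumSrcEltBits) of a wide
    // element (the i-th narrow chunk).
    uint64_t EltMask =
        NumSrcEltBits == 64 ? ~0ULL : ((1ULL << NumSrcEltBits) - 1);
    uint64_t SubMask = EltMask << (i * NumSrcEltBits);
    if ((DemandedBits & SubMask) == 0)
      continue; // No bits of this chunk are demanded anywhere.
    // Only demanded wide elements contribute demanded narrow source elements.
    for (unsigned j = 0; j != DemandedElts.size(); ++j)
      if (DemandedElts[j])
        DemandedSrcElts[(j * Scale) + i] = true;
  }
  return DemandedSrcElts;
}

int main() {
  // Hypothetical query: <8 x i8> source bitcast to <2 x i32>, with only the
  // low byte of wide element 0 demanded. Only narrow element 0 gets marked.
  std::vector<bool> DemandedElts = {true, false};
  std::vector<bool> Src = computeDemandedSrcElts(/*DemandedBits=*/0xFF,
                                                 DemandedElts,
                                                 /*BitWidth=*/32,
                                                 /*NumSrcEltBits=*/8,
                                                 /*NumSrcElts=*/8);
  for (unsigned e = 0; e != Src.size(); ++e)
    std::printf("src elt %u demanded: %d\n", e, Src[e] ? 1 : 0);
  return 0;
}

For the same query, the old splat-based mask would also have marked source
element 4 (the low byte of the undemanded second i32); keeping it clear gives
SimplifyDemandedVectorElts more freedom, which matches the punpcklbw, pblendw
and movd instructions that disappear from the updated tests.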