From: Simon Pilgrim
Date: Fri, 4 Jan 2019 15:43:43 +0000 (+0000)
Subject: [X86] Add VPSLLI/VPSRLI ((X >>u C1) << C2) SimplifyDemandedBits combine
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=98472a7053e44540e9af1e3976081db10a2448fa;p=llvm

[X86] Add VPSLLI/VPSRLI ((X >>u C1) << C2) SimplifyDemandedBits combine

Repeat of the generic SimplifyDemandedBits shift combine

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350399 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 13fb8358859..32f79184a03 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32403,15 +32403,38 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     break;
   }
   case X86ISD::VSHLI: {
-    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
       if (ShiftImm->getAPIntValue().uge(BitWidth))
         break;
 
       unsigned ShAmt = ShiftImm->getZExtValue();
       APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
 
-      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
-                               OriginalDemandedElts, Known, TLO, Depth + 1))
+      // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+      // single shift. We can do this if the bottom bits (which are shifted
+      // out) are never demanded.
+      if (Op0.getOpcode() == X86ISD::VSRLI &&
+          OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
+        if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
+          if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
+            int Diff = ShAmt - Shift2Imm->getZExtValue();
+            if (Diff == 0)
+              return TLO.CombineTo(Op, Op0.getOperand(0));
+
+            unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
+            SDValue NewShift = TLO.DAG.getNode(
+                NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
+                TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+            return TLO.CombineTo(Op, NewShift);
+          }
+        }
+      }
+
+      if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+                               TLO, Depth + 1))
         return true;
 
       assert(!Known.hasConflict() && "Bits known to be one AND zero?");
diff --git a/test/CodeGen/X86/vector-trunc-widen.ll b/test/CodeGen/X86/vector-trunc-widen.ll
index 497393786aa..be44f4618a1 100644
--- a/test/CodeGen/X86/vector-trunc-widen.ll
+++ b/test/CodeGen/X86/vector-trunc-widen.ll
@@ -468,11 +468,7 @@ entry:
 define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
 ; SSE2-LABEL: trunc8i32_8i16_lshr:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
 ; SSE2-NEXT: psrad $16, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: retq
@@ -767,18 +763,10 @@ entry:
 define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
 ; SSE2-LABEL: trunc16i32_16i16_lshr:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrld $16, %xmm2
-; SSE2-NEXT: psrld $16, %xmm3
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
 ; SSE2-NEXT: psrad $16, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pslld $16, %xmm3
 ; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm2
 ; SSE2-NEXT: psrad $16, %xmm2
 ; SSE2-NEXT: packssdw %xmm3, %xmm2
 ; SSE2-NEXT: movdqu %xmm2, (%rax)
@@ -787,18 +775,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
 ;
 ; SSSE3-LABEL: trunc16i32_16i16_lshr:
 ; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrld $16, %xmm2
-; SSSE3-NEXT: psrld $16, %xmm3
-; SSSE3-NEXT: psrld $16, %xmm0
-; SSSE3-NEXT: psrld $16, %xmm1
-; SSSE3-NEXT: pslld $16, %xmm1
 ; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: pslld $16, %xmm0
 ; SSSE3-NEXT: psrad $16, %xmm0
 ; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: pslld $16, %xmm3
 ; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: pslld $16, %xmm2
 ; SSSE3-NEXT: psrad $16, %xmm2
 ; SSSE3-NEXT: packssdw %xmm3, %xmm2
 ; SSSE3-NEXT: movdqu %xmm2, (%rax)
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index c17a618f28b..ba353fe60a1 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -478,11 +478,7 @@ entry:
 define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
 ; SSE2-LABEL: trunc8i32_8i16_lshr:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
 ; SSE2-NEXT: psrad $16, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: retq
@@ -777,18 +773,10 @@ entry:
 define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
 ; SSE2-LABEL: trunc16i32_16i16_lshr:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrld $16, %xmm2
-; SSE2-NEXT: psrld $16, %xmm3
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
 ; SSE2-NEXT: psrad $16, %xmm0
 ; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pslld $16, %xmm3
 ; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm2
 ; SSE2-NEXT: psrad $16, %xmm2
 ; SSE2-NEXT: packssdw %xmm3, %xmm2
 ; SSE2-NEXT: movdqu %xmm2, (%rax)
@@ -797,18 +785,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
 ;
 ; SSSE3-LABEL: trunc16i32_16i16_lshr:
 ; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrld $16, %xmm2
-; SSSE3-NEXT: psrld $16, %xmm3
-; SSSE3-NEXT: psrld $16, %xmm0
-; SSSE3-NEXT: psrld $16, %xmm1
-; SSSE3-NEXT: pslld $16, %xmm1
 ; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: pslld $16, %xmm0
 ; SSSE3-NEXT: psrad $16, %xmm0
 ; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: pslld $16, %xmm3
 ; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: pslld $16, %xmm2
 ; SSSE3-NEXT: psrad $16, %xmm2
 ; SSSE3-NEXT: packssdw %xmm3, %xmm2
 ; SSSE3-NEXT: movdqu %xmm2, (%rax)
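
For reference, the fold can be modelled on a single 32-bit lane, matching the psrld/pslld $16 patterns in the tests above. The following is a minimal illustrative sketch only, not code from this commit: foldShiftPair and its DemandedBits parameter are hypothetical stand-ins for the VSHLI node's operands and for the OriginalDemandedBits mask used by SimplifyDemandedBitsForTargetNode.

#include <cassert>
#include <cstdint>

// Scalar model (one 32-bit lane) of the VSHLI((X >>u C1), C2) combine:
// fold the pair into a single shift (or into X itself) whenever the
// demanded mask has at least C2 trailing zeros, i.e. the low bits cleared
// by the left shift are never demanded.
static uint32_t foldShiftPair(uint32_t X, unsigned C1, unsigned C2,
                              uint32_t DemandedBits) {
  assert(C1 < 32 && C2 < 32 && "shift amounts must be in range");

  // Reference result: the original two-shift sequence.
  uint32_t Reference = (X >> C1) << C2;

  // Count trailing zeros of the demanded mask; if fewer than C2, some of
  // the bits zeroed by the left shift are demanded and the fold is off.
  unsigned TrailingZeros = 0;
  while (TrailingZeros < 32 && !(DemandedBits & (1u << TrailingZeros)))
    ++TrailingZeros;
  if (TrailingZeros < C2)
    return Reference;

  // Equal shift amounts cancel entirely; otherwise a single shift by
  // |C2 - C1| remains (left if the left shift was larger, right otherwise).
  int Diff = (int)C2 - (int)C1;
  uint32_t Folded = (Diff == 0) ? X
                  : (Diff > 0)  ? (X << Diff)
                                : (X >> -Diff);

  // The folded value agrees with the original on every demanded bit.
  assert(((Folded ^ Reference) & DemandedBits) == 0 && "fold is not sound");
  return Folded;
}

For example, with C1 == C2 == 16 and DemandedBits == 0xFFFF0000 (only the high half demanded, as when the result feeds psrad $16 followed by packssdw), the helper returns X unchanged, which is essentially why the psrld/pslld pairs disappear from the SSE2/SSSE3 sequences in the updated tests.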