From: Simon Pilgrim
Date: Tue, 18 Dec 2018 08:55:47 +0000 (+0000)
Subject: [X86][SSE] Replace (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31) fold.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ce5a6119833914748df042b0880f0d609ad0b3be;p=llvm

[X86][SSE] Replace (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31) fold.

This fold was incredibly specific - replace it with a SimplifyDemandedBits fold
that removes a VSRAI if only the original sign bit is demanded (it's guaranteed
to stay the same).

Test change is merely a rescheduling.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@349459 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cdd4e1dd9a3..0988fa9dfe3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32447,6 +32447,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
       if (ShiftImm->getAPIntValue().uge(BitWidth))
         break;
 
+      // If we just want the sign bit then we don't need to shift it.
+      if (OriginalDemandedBits.isSignMask())
+        return TLO.CombineTo(Op, Op0);
+
       unsigned ShAmt = ShiftImm->getZExtValue();
       APInt DemandedMask = OriginalDemandedBits << ShAmt;
 
@@ -32507,7 +32511,7 @@
     else if (KnownSrc.Zero[SrcBits - 1])
       Known.Zero.setLowBits(NumElts);
     return false;
-  }  
+  }
   }
 
   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -35562,13 +35566,6 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(N0.getNode()))
     return DAG.getConstant(0, SDLoc(N), VT);
 
-  // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
-  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
-  // TODO - support other sra opcodes as needed.
-  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
-      N0.getOpcode() == X86ISD::VSRAI)
-    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
-
   // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
   if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
       N1 == N0.getOperand(1)) {
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index 0ed2f63282a..4878d708e48 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -252,35 +252,35 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_srem_by_pow2b:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $31, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrad $31, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    psrld $29, %xmm3
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; SSE-NEXT:    psrld $30, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE-NEXT:    paddd %xmm0, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    psrad $3, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $29, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    psrld $31, %xmm3
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    psrld $30, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrad $3, %xmm2
+; SSE-NEXT:    movdqa %xmm1, %xmm3
 ; SSE-NEXT:    psrad $1, %xmm3
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrad $2, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm2
-; SSE-NEXT:    psubd %xmm2, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
+; SSE-NEXT:    psubd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_srem_by_pow2b:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm3
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
@@ -317,10 +317,10 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
 ; SSE-NEXT:    movdqa %xmm1, %xmm3
 ; SSE-NEXT:    psrld $30, %xmm3
 ; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $31, %xmm2
-; SSE-NEXT:    psrld $29, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $29, %xmm2
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
 ; SSE-NEXT:    paddd %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm2
@@ -344,9 +344,9 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
 ; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm3
-; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
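Background sketch (not part of the patch): the new SimplifyDemandedBits fold relies on the fact that an arithmetic shift right never changes the sign bit, so a VSRAI can be dropped whenever only the original sign bit is demanded; the old (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31) fold is just the special case where a logical shift by 31 demands exactly that bit. The standalone C++ snippet below models a single 32-bit lane to illustrate this. The test values and names are illustrative only, and it assumes '>>' on a negative int32_t is an arithmetic shift (what x86 compilers do, and guaranteed from C++20 onwards).

// Standalone sketch, not LLVM code: one 32-bit vector lane modeled as int32_t.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Tests[] = {INT32_MIN, -42, -1, 0, 1, 42, INT32_MAX};
  for (int32_t X : Tests) {
    for (unsigned ShAmt = 0; ShAmt < 32; ++ShAmt) {
      // VSRAI X, ShAmt on one lane (assumes arithmetic right shift, see above).
      uint32_t Sra = static_cast<uint32_t>(X >> ShAmt);
      // New fold: when only the sign bit is demanded, the VSRAI is a no-op.
      assert((Sra & 0x80000000u) == (static_cast<uint32_t>(X) & 0x80000000u));
      // Old fold as a corollary: (VSRLI (VSRAI X, Y), 31) == (VSRLI X, 31).
      assert((Sra >> 31) == (static_cast<uint32_t>(X) >> 31));
    }
  }
  return 0;
}

Because the new check looks at the demanded-bits mask rather than at a literal VSRLI-by-31 user, it fires for any user that only reads the sign bit, which is why the hand-written pattern match above could be removed.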