From: Craig Topper
Date: Wed, 20 Dec 2017 07:36:59 +0000 (+0000)
Subject: [X86] Optimize sign extends on index operand to gather/scatter to not sign extend...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=15dc4d1db93cd30d784b160947dc7ff2e2f36181;p=llvm

[X86] Optimize sign extends on index operand to gather/scatter to not sign extend past i32.

The gather instruction will implicitly sign extend to the pointer width, we don't
need to further extend it. This can prevent unnecessary splitting in some cases.

There's still an issue that lowering on non-VLX can introduce another sign extend
that doesn't get combined with shifts from a lowered sign_extend_inreg.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@321152 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 987d77b551b..73685e07000 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -36214,37 +36214,35 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
   SDLoc DL(N);
 
-  // Pre-shrink oversized index elements to avoid triggering scalarization.
-  if (DCI.isBeforeLegalize()) {
+  if (DCI.isBeforeLegalizeOps()) {
     SDValue Index = N->getOperand(4);
-    if (Index.getScalarValueSizeInBits() > 64) {
-      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
-                                     Index.getValueType().getVectorNumElements());
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
-      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
-      NewOps[4] = Trunc;
-      DAG.UpdateNodeOperands(N, NewOps);
-      DCI.AddToWorklist(N);
-      return SDValue(N, 0);
+    // Remove any sign extends from 32 or smaller to larger than 32.
+    // Only do this before LegalizeOps in case we need the sign extend for
+    // legalization.
+    if (Index.getOpcode() == ISD::SIGN_EXTEND) {
+      if (Index.getScalarValueSizeInBits() > 32 &&
+          Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
+        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+        NewOps[4] = Index.getOperand(0);
+        DAG.UpdateNodeOperands(N, NewOps);
+        // The original sign extend has less users, add back to worklist in case
+        // it needs to be removed
+        DCI.AddToWorklist(Index.getNode());
+        DCI.AddToWorklist(N);
+        return SDValue(N, 0);
+      }
     }
-  }
 
-  // Try to remove sign extends from i32 to i64 on the index.
-  // Only do this before legalize in case we are relying on it for
-  // legalization.
-  // TODO: We should maybe remove any sign extend once we learn how to sign
-  // extend narrow index during lowering.
-  if (DCI.isBeforeLegalizeOps()) {
-    SDValue Index = N->getOperand(4);
-    if (Index.getScalarValueSizeInBits() == 64 &&
-        Index.getOpcode() == ISD::SIGN_EXTEND &&
-        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
+    // Make sure the index is either i32 or i64
+    unsigned ScalarSize = Index.getScalarValueSizeInBits();
+    if (ScalarSize != 32 && ScalarSize != 64) {
+      MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+                                     Index.getValueType().getVectorNumElements());
+      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
       SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
-      NewOps[4] = Index.getOperand(0);
+      NewOps[4] = Index;
       DAG.UpdateNodeOperands(N, NewOps);
-      // The original sign extend has less users, add back to worklist in case
-      // it needs to be removed.
-      DCI.AddToWorklist(Index.getNode());
       DCI.AddToWorklist(N);
       return SDValue(N, 0);
     }
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 1eb2631e26e..f70b6024241 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2606,56 +2606,32 @@ define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <
 define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
 ; KNL_64-LABEL: sext_i8_index:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovsxbw %xmm0, %ymm0
-; KNL_64-NEXT:    vpmovsxwq %xmm0, %zmm1
-; KNL_64-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: sext_i8_index:
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxbw %xmm0, %ymm0
-; KNL_32-NEXT:    vpmovsxwq %xmm0, %zmm1
-; KNL_32-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_32-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: sext_i8_index:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsxbw %xmm0, %ymm0
-; SKX-NEXT:    vpmovsxwq %xmm0, %zmm1
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; SKX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    kxnorw %k0, %k0, %k2
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: sext_i8_index:
 ; SKX_32:       # %bb.0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovsxbw %xmm0, %ymm0
-; SKX_32-NEXT:    vpmovsxwq %xmm0, %zmm1
-; SKX_32-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX_32-NEXT:    vpmovsxwq %xmm0, %zmm0
+; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <16 x i8> %ind to <16 x i64>
@@ -2669,40 +2645,42 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
 define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
 ; KNL_64-LABEL: sext_v8i8_index:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_64-NEXT:    vpsllq $56, %zmm0, %zmm0
-; KNL_64-NEXT:    vpsraq $56, %zmm0, %zmm1
+; KNL_64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
+; KNL_64-NEXT:    vpslld $24, %ymm0, %ymm0
+; KNL_64-NEXT:    vpsrad $24, %ymm0, %ymm0
+; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: sext_v8i8_index:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpsllq $56, %zmm0, %zmm0
-; KNL_32-NEXT:    vpsraq $56, %zmm0, %zmm1
+; KNL_32-NEXT:    vpslld $24, %ymm0, %ymm0
+; KNL_32-NEXT:    vpsrad $24, %ymm0, %ymm0
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
+; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: sext_v8i8_index:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX-NEXT:    vpsllq $56, %zmm0, %zmm0
-; SKX-NEXT:    vpsraq $56, %zmm0, %zmm1
+; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT:    vpslld $24, %ymm0, %ymm0
+; SKX-NEXT:    vpsrad $24, %ymm0, %ymm1
+; SKX-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: sext_v8i8_index:
 ; SKX_32:       # %bb.0:
-; SKX_32-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpsllq $56, %zmm0, %zmm0
-; SKX_32-NEXT:    vpsraq $56, %zmm0, %zmm1
+; SKX_32-NEXT:    vpslld $24, %ymm0, %ymm0
+; SKX_32-NEXT:    vpsrad $24, %ymm0, %ymm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <8 x i8> %ind to <8 x i64>
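
For illustration only (not part of this commit): the IR pattern the new combine targets looks like the existing tests above. In the hypothetical test below, the index is sign extended from i16 all the way to i64, but the extend only feeds the gather's address computation, so extending past i32 is unnecessary; the expectation, mirroring the sext_v8i8_index output above, is that SKX can now use a dword-index vgatherdps instead of a qword-index vgatherqps. The function name and the codegen claim are assumptions for illustration, not checked output from this patch.

declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)

define <8 x float> @sext_i16_index_example(float* %base, <8 x i16> %ind) {
  ; The gather implicitly sign extends the index to pointer width, so the
  ; combine keeps only an i16 -> i32 extend instead of the full i16 -> i64 one.
  %sext_ind = sext <8 x i16> %ind to <8 x i64>
  %gep.random = getelementptr float, float* %base, <8 x i64> %sext_ind
  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %res
}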