From: Simon Pilgrim
Date: Mon, 6 Feb 2017 13:44:45 +0000 (+0000)
Subject: [X86][SSE] Combine shuffle nodes with multiple uses if all the users are being combined.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0aca40051b03f86a952cdec20db7d6af714a6d6f;p=llvm

[X86][SSE] Combine shuffle nodes with multiple uses if all the users are being
combined.

Currently we only combine shuffle nodes if they have a single user, to prevent
code bloat from splitting the shuffles into several different combines.

This doesn't take into account that in some cases we will already have combined
all the users while recursing up the shuffle tree.

This patch keeps a list of all the shuffle nodes that have been combined so
far, and permits combining a further shuffle node if all of its users are in
that list.

Differential Revision: https://reviews.llvm.org/D29399

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294183 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index a512c44e4ba..4630541e972 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -737,6 +737,10 @@ public:
     return false;
   }
 
+  /// Return true if all the users of N are contained in Nodes.
+  /// NOTE: Requires at least one match, but doesn't require them all.
+  static bool areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N);
+
   /// Return the number of values used by this operation.
   unsigned getNumOperands() const { return NumOperands; }
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 46dde01970b..dc5c8baaabc 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7125,6 +7125,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {
   return Seen;
 }
 
+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+  bool Seen = false;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    if (llvm::any_of(Nodes,
+                     [&User](const SDNode *Node) { return User == Node; }))
+      Seen = true;
+    else
+      return false;
+  }
+
+  return Seen;
+}
+
 /// isOperand - Return true if this node is an operand of N.
 ///
 bool SDValue::isOperandOf(const SDNode *N) const {
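As a quick illustration of the new helper's contract (at least one user, and every user must be in the supplied list), here is a hypothetical caller written against the API added above. This is an editor's sketch, not part of the patch; Shuf0, Shuf1 and N are made-up SDNode pointers.

// Editor's sketch (not part of this patch): hypothetical use of the new
// SDNode::areOnlyUsersOf inside a DAG combine. Shuf0, Shuf1 and N are
// illustrative SDNode pointers already gathered by the caller.
const SDNode *Combined[] = {Shuf0, Shuf1};
if (N->hasOneUse() || SDNode::areOnlyUsersOf(Combined, N)) {
  // Either N has a single use, or every user of N is already in Combined,
  // so folding N cannot leave a duplicated shuffle behind for another user.
}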
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index af06cd3a719..83b340a1b5c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27333,6 +27333,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                           int SrcOpIndex, SDValue Root,
                                           ArrayRef<int> RootMask,
+                                          ArrayRef<const SDNode*> SrcNodes,
                                           int Depth, bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
@@ -27469,11 +27470,20 @@
 
   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
 
-  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  // Update the list of shuffle nodes that have been combined so far.
+  SmallVector<const SDNode *, 2> CombinedNodes(SrcNodes.begin(),
+                                               SrcNodes.end());
+  CombinedNodes.push_back(Op.getNode());
+
+  // See if we can recurse into each shuffle source op (if it's a target
+  // shuffle). The source op should only be combined if it either has a
+  // single use (i.e. current Op) or all its users have already been combined.
   for (int i = 0, e = Ops.size(); i < e; ++i)
-    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
-      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
-                                        HasVariableMask, DAG, DCI, Subtarget))
+    if (Ops[i].getNode()->hasOneUse() ||
+        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+                                        Depth + 1, HasVariableMask, DAG, DCI,
+                                        Subtarget))
         return true;
 
   // Attempt to constant fold all of the constant source ops.
@@ -28270,7 +28280,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   // a particular chain.
   SmallVector<int, 1> NonceMask; // Just a placeholder.
   NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
     return SDValue(); // This routine will use CombineTo to replace N.
@@ -30541,7 +30551,7 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
   SDValue Op(N, 0);
   SmallVector<int, 1> NonceMask; // Just a placeholder.
   NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
     return SDValue(); // This routine will use CombineTo to replace N.
@@ -30582,7 +30592,7 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
   SDValue Op(N, 0);
   SmallVector<int, 1> NonceMask; // Just a placeholder.
   NonceMask.push_back(0);
-  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                 DCI, Subtarget);
   return SDValue();
@@ -30895,7 +30905,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   SDValue Op(N, 0);
   SmallVector<int, 1> NonceMask; // Just a placeholder.
   NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
     return SDValue(); // This routine will use CombineTo to replace N.
@@ -33053,7 +33063,7 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   SDValue Op(N, 0);
   SmallVector<int, 1> NonceMask; // Just a placeholder.
   NonceMask.push_back(0);
-  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                     /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                     DCI, Subtarget))
     return SDValue(); // This routine will use CombineTo to replace N.
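To make the combining rule easier to follow outside the SelectionDAG machinery, here is a minimal, self-contained C++ sketch of the same idea, written by the editor with a toy Node type rather than LLVM's classes: a source op may be folded either when it has a single user or when every one of its users has already been combined.

// Editor's sketch, not LLVM code: a toy analogue of the rule added by this
// patch, using an illustrative Node type instead of SDNode.
#include <algorithm>
#include <vector>

struct Node {
  std::vector<const Node *> Users; // nodes that consume this node's result
};

// Analogue of SDNode::areOnlyUsersOf: true if N has at least one user and
// every user appears in Combined.
static bool areOnlyUsersOf(const std::vector<const Node *> &Combined,
                           const Node *N) {
  if (N->Users.empty())
    return false;
  return std::all_of(N->Users.begin(), N->Users.end(), [&](const Node *U) {
    return std::find(Combined.begin(), Combined.end(), U) != Combined.end();
  });
}

// Analogue of the guard in combineX86ShufflesRecursively: recurse into a
// source op if it has a single user or if all of its users were already
// combined earlier in the recursion.
static bool canRecurseInto(const Node *Op,
                           const std::vector<const Node *> &Combined) {
  return Op->Users.size() == 1 || areOnlyUsersOf(Combined, Op);
}

This is the situation exercised by the test changes below, where shuffle chains whose intermediate shuffles have multiple (but all already-combined) users now collapse into a single instruction.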
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 6d50048ed7c..e9fad72752d 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -260,13 +260,7 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
 define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
 ; SSE-LABEL: _clearupper2xi64b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper2xi64b:
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index e6cce778b21..ae49a2c0da4 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -639,9 +639,9 @@ CF244:
 
 define i16 @select_xor_1(i16 %A, i8 %cond) {
 ; CHECK-LABEL: select_xor_1:
-; MCU: andl $1, %edx
-; MCU-NEXT: negl %edx
-; MCU-NEXT: andl $43, %edx
+; MCU: andl $1, %edx
+; MCU-NEXT: negl %edx
+; MCU-NEXT: andl $43, %edx
 ; MCU-NEXT: xorl %edx, %eax
 entry:
   %and = and i8 %cond, 1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 4b23ba4f69f..2f6267edfce 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -715,22 +715,12 @@ define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
 define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
 ; X32-LABEL: combine_unpack_unpack_pshufb:
 ; X32:       # BB#0:
-; X32-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; X32-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[1,1,2,3,5,5,6,7]
-; X32-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; X32-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_unpack_unpack_pshufb:
 ; X64:       # BB#0:
-; X64-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; X64-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[1,1,2,3,5,5,6,7]
-; X64-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; X64-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
 ; X64-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32>
   %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 33b22b3fe86..792496e83f0 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -555,31 +555,12 @@ define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
 define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
 ; SSE-LABEL: shuffle_combine_unpack_insert:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    pextrw $4, %xmm0, %ecx
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pinsrw $4, %eax, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_combine_unpack_insert:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm1
-; AVX-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm2
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
 ; AVX-NEXT:    retq
   %1 = extractelement <8 x i16> %a0, i32 2
   %2 = extractelement <8 x i16> %a0, i32 4