[X86][SSE] Combine shuffle nodes with multiple uses if all the users are being combined.

author Simon Pilgrim <llvm-dev@redking.me.uk>

Mon, 6 Feb 2017 13:44:45 +0000 (13:44 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Mon, 6 Feb 2017 13:44:45 +0000 (13:44 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Mon, 6 Feb 2017 13:44:45 +0000 (13:44 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Mon, 6 Feb 2017 13:44:45 +0000 (13:44 +0000)
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h

index a512c44e4bafaa505cef19ac9f4f58a58cd1ee8e..4630541e972ea4b27329952c60915d8ece27d502 100644 (file)
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -737,6 +737,10 @@ public:
      return false;
    }
  
+  /// Return true if all the users of N are contained in Nodes.
+  /// NOTE: N must have at least one user; Nodes may also hold non-users of N.
+  static bool areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N);
+
    /// Return the number of values used by this operation.
    unsigned getNumOperands() const { return NumOperands; }
  
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

index 46dde01970b9bd30171f22268fd9c92150da41ab..dc5c8baaabc0c34854facb29bf3cf8841c6d808b 100644 (file)
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7125,6 +7125,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {
    return Seen;
  }
  
+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+  bool Seen = false;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    if (llvm::any_of(Nodes,
+                     [&User](const SDNode *Node) { return User == Node; }))
+      Seen = true;
+    else
+      return false;
+  }
+
+  return Seen;
+}
+
  /// isOperand - Return true if this node is an operand of N.
  ///
  bool SDValue::isOperandOf(const SDNode *N) const {
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index af06cd3a7192b906b666fbc4d5b1345cd60eb3b9..83b340a1b5c4210832e243fe9beef6b36a43ea74 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27333,6 +27333,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
  static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                            int SrcOpIndex, SDValue Root,
                                            ArrayRef<int> RootMask,
+                                          ArrayRef<const SDNode*> SrcNodes,
                                            int Depth, bool HasVariableMask,
                                            SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
@@ -27469,11 +27470,20 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
  
    HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
  
-  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  // Update the list of shuffle nodes that have been combined so far.
+  SmallVector<const SDNode *, 8> CombinedNodes(SrcNodes.begin(),
+                                               SrcNodes.end());
+  CombinedNodes.push_back(Op.getNode());
+
+  // See if we can recurse into each shuffle source op (if it's a target
+  // shuffle). The source op should only be combined if it either has a
+  // single use (i.e. current Op) or all its users have already been combined.
    for (int i = 0, e = Ops.size(); i < e; ++i)
-    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
-      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
-                                        HasVariableMask, DAG, DCI, Subtarget))
+    if (Ops[i].getNode()->hasOneUse() ||
+        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+                                        Depth + 1, HasVariableMask, DAG, DCI,
+                                        Subtarget))
          return true;
  
    // Attempt to constant fold all of the constant source ops.
@@ -28270,7 +28280,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
      // a particular chain.
      SmallVector<int, 1> NonceMask; // Just a placeholder.
      NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                        /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                        DCI, Subtarget))
        return SDValue(); // This routine will use CombineTo to replace N.
@@ -30541,7 +30551,7 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
      SDValue Op(N, 0);
      SmallVector<int, 1> NonceMask; // Just a placeholder.
      NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                        /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                        DCI, Subtarget))
        return SDValue(); // This routine will use CombineTo to replace N.
@@ -30582,7 +30592,7 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
-  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                  /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                  DCI, Subtarget);
    return SDValue();
@@ -30895,7 +30905,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
      SDValue Op(N, 0);
      SmallVector<int, 1> NonceMask; // Just a placeholder.
      NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                        /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                        DCI, Subtarget))
        return SDValue(); // This routine will use CombineTo to replace N.
@@ -33053,7 +33063,7 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
      SDValue Op(N, 0);
      SmallVector<int, 1> NonceMask; // Just a placeholder.
      NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                        /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                        DCI, Subtarget))
        return SDValue(); // This routine will use CombineTo to replace N.
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll

index 6d50048ed7c7eb611adc33270db7dfbdf294e1ba..e9fad72752da1eb4a0ba6b04cf82e548ad3e71e2 100644 (file)
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -260,13 +260,7 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
  define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
  ; SSE-LABEL: _clearupper2xi64b:
  ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
  ; SSE-NEXT:    retq
  ;
  ; AVX1-LABEL: _clearupper2xi64b:
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll

index e6cce778b218341824ddaae3e73f37a4e317676e..ae49a2c0da46ae8872f478a96991356b8f30c343 100644 (file)
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -639,9 +639,9 @@ CF244:
  
  define i16 @select_xor_1(i16 %A, i8 %cond) {
  ; CHECK-LABEL: select_xor_1:
-; MCU:    andl    $1, %edx\r
-; MCU-NEXT:    negl    %edx\r
-; MCU-NEXT:    andl    $43, %edx\r
+; MCU:    andl    $1, %edx
+; MCU-NEXT:    negl    %edx
+; MCU-NEXT:    andl    $43, %edx
  ; MCU-NEXT:    xorl    %edx, %eax
  entry:
   %and = and i8 %cond, 1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

index 4b23ba4f69f67222c286afaf980bb37afb7514a5..2f6267edfceb12858031244932c2a33a2414436a 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -715,22 +715,12 @@ define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
  define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
  ; X32-LABEL: combine_unpack_unpack_pshufb:
  ; X32:       # BB#0:
-; X32-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; X32-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[1,1,2,3,5,5,6,7]
-; X32-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; X32-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: combine_unpack_unpack_pshufb:
  ; X64:       # BB#0:
-; X64-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; X64-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[1,1,2,3,5,5,6,7]
-; X64-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; X64-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
  ; X64-NEXT:    retq
    %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
    %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll

index 33b22b3fe86816b58aade89c603d7f76fc7f2f52..792496e83f038953b13b50ada2e62fad77dfa275 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -555,31 +555,12 @@ define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
  define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
  ; SSE-LABEL: shuffle_combine_unpack_insert:
  ; SSE:       # BB#0:
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    pextrw $4, %xmm0, %ecx
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pinsrw $4, %eax, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm3
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
  ; SSE-NEXT:    retq
  ;
  ; AVX-LABEL: shuffle_combine_unpack_insert:
  ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm1
-; AVX-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm2
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
  ; AVX-NEXT:    retq
    %1 = extractelement <8 x i16> %a0, i32 2
    %2 = extractelement <8 x i16> %a0, i32 4
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Mon, 6 Feb 2017 13:44:45 +0000 (13:44 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Mon, 6 Feb 2017 13:44:45 +0000 (13:44 +0000)
include/llvm/CodeGen/SelectionDAGNodes.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/SelectionDAG.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/clear_upper_vector_element_bits.ll		patch \| blob \| history
test/CodeGen/X86/select.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-avx2.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-combining-ssse3.ll		patch \| blob \| history