From: Sanjay Patel
Date: Fri, 1 Sep 2017 21:09:04 +0000 (+0000)
Subject: [x86] eliminate redundant shuffle of horizontal math ops when both inputs are the...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3cac265b271a7ca43069c0ce39e85fa49812e8f0;p=llvm

[x86] eliminate redundant shuffle of horizontal math ops when both inputs are the same

This is limited to a set of patterns based on the example in PR34111:
https://bugs.llvm.org/show_bug.cgi?id=34111
...but as I was investigating this, I saw that horizontal patterns can go wrong in
many other ways that would not be handled by this patch. Each data type may even
diverge in the DAG after starting with the same basic IR pattern, so even proper IR
canonicalization won't fix it all.

Differential Revision: https://reviews.llvm.org/D37357

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312379 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c7b383a8be4..e0c17a2c27f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -29053,6 +29053,40 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
 }
 
+/// Eliminate a redundant shuffle of a horizontal math op.
+static SDValue foldShuffleOfHorizOp(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+    return SDValue();
+
+  SDValue HOp = N->getOperand(0);
+  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
+      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
+    return SDValue();
+
+  // 128-bit horizontal math instructions are defined to operate on adjacent
+  // lanes of each operand as:
+  //  v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
+  // ...similarly for v2f64 and v8i16.
+  // TODO: 256-bit is not the same because...x86.
+  if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+    return SDValue();
+
+  // When the operands of a horizontal math op are identical, the low half of
+  // the result is the same as the high half. If the shuffle is also replicating
+  // low and high halves, we don't need the shuffle.
+  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
+  // but this should be tied to whatever horizontal op matching and shuffle
+  // canonicalization are producing.
+  if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+    return HOp;
+
+  return SDValue();
+}
+
 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
@@ -29061,10 +29095,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
-  if (TLI.isTypeLegal(VT))
+  if (TLI.isTypeLegal(VT)) {
     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
       return AddSub;
 
+    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+      return HAddSub;
+  }
+
   // During Type Legalization, when promoting illegal vector types,
   // the backend might introduce new shuffle dag nodes and bitcasts.
   //
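The fold relies on the property stated in the new comment: when both operands of a
128-bit horizontal add/sub are the same value, the low 64-bit half of the result
already equals the high half, so a shuffle that merely replicates the low half over
the high half changes nothing. The standalone sketch below is not part of the patch;
it checks that property for the f32 case with SSE3 intrinsics, assuming an
SSE3-capable host and a compiler invocation such as clang++ -msse3.

// Standalone sanity check (not part of this commit): haddps with identical
// operands produces a value whose low and high 64-bit halves match, so the
// movddup that used to follow it is a no-op.
#include <immintrin.h>
#include <cassert>

int main() {
  __m128 X = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  // haddps X, X -> { X[0]+X[1], X[2]+X[3], X[0]+X[1], X[2]+X[3] }
  __m128 H = _mm_hadd_ps(X, X);
  // Replicate the low 64-bit half over the high half, as the old movddup did.
  __m128 Dup = _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(H)));
  float HArr[4], DupArr[4];
  _mm_storeu_ps(HArr, H);
  _mm_storeu_ps(DupArr, Dup);
  for (int i = 0; i != 4; ++i)
    assert(HArr[i] == DupArr[i]); // the duplication changed nothing
  return 0;
}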
diff --git a/test/CodeGen/X86/haddsub-shuf.ll b/test/CodeGen/X86/haddsub-shuf.ll
index c22453ba4f5..37597c415d6 100644
--- a/test/CodeGen/X86/haddsub-shuf.ll
+++ b/test/CodeGen/X86/haddsub-shuf.ll
@@ -9,13 +9,11 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) {
 ; SSSE3-LABEL: hadd_v4f32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v4f32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -28,13 +26,11 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) {
 ; SSSE3-LABEL: hsub_v4f32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v4f32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -47,13 +43,11 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
 ; SSSE3-LABEL: hadd_v2f64:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v2f64:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -66,13 +60,11 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
 ; SSSE3-LABEL: hsub_v2f64:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v2f64:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -85,13 +77,11 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
 ; SSSE3-LABEL: hadd_v4i32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v4i32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -104,13 +94,11 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
 ; SSSE3-LABEL: hsub_v4i32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v4i32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -123,13 +111,11 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
 ; SSSE3-LABEL: hadd_v8i16:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v8i16:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -142,13 +128,11 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
 ; SSSE3-LABEL: hsub_v8i16:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v8i16:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
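The integer tests above depend on the same property. As a rough cross-check that is
likewise not part of the patch, the following sketch (assuming an SSSE3-capable host
and a compiler invocation such as clang++ -mssse3) verifies that phaddd with identical
operands already matches its own xmm0[0,1,0,1] replication, so the dropped pshufd
could not have changed the result.

// Standalone sanity check (not part of this commit): phaddd with identical
// operands already has matching 64-bit halves, so the pshufd removed from the
// integer tests was a no-op.
#include <immintrin.h>
#include <cassert>

int main() {
  __m128i X = _mm_setr_epi32(10, 20, 30, 40);
  // phaddd X, X -> { X[0]+X[1], X[2]+X[3], X[0]+X[1], X[2]+X[3] }
  __m128i H = _mm_hadd_epi32(X, X);
  // Replicate dwords [0,1] over dwords [2,3], as the removed pshufd did.
  __m128i Shuf = _mm_shuffle_epi32(H, _MM_SHUFFLE(1, 0, 1, 0));
  // All 16 byte comparisons must succeed if the shuffle was redundant.
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(H, Shuf)) == 0xFFFF);
  return 0;
}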