From 53c102ac20116062f637a701b3ab8ffb116b5efe Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 6 Mar 2019 16:11:03 +0000
Subject: [PATCH] [DAGCombiner] Enable UADDO/USUBO vector combine support

Differential Revision: https://reviews.llvm.org/D58965

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355517 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 ++++++-------
 test/CodeGen/X86/combine-addo.ll         | 33 ++++++++--------------
 test/CodeGen/X86/combine-subo.ll         | 36 ++++++--------------------
 3 files changed, 26 insertions(+), 62 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a839073e63d..2e4c5933b9e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2443,8 +2443,6 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
-  if (VT.isVector())
-    return SDValue();
 
   EVT CarryVT = N->getValueType(1);
   SDLoc DL(N);
@@ -2455,13 +2453,12 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) {
                        DAG.getUNDEF(CarryVT));
 
   // canonicalize constant to RHS.
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  if (N0C && !N1C)
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0);
 
   // fold (uaddo x, 0) -> x + no carry out
-  if (isNullConstant(N1))
+  if (isNullOrNullSplat(N1))
     return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
 
   // If it cannot overflow, transform into an add.
@@ -2488,7 +2485,9 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) {
 }
 
 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
-  auto VT = N0.getValueType();
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
 
   // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
   // If Y + 1 cannot overflow.
@@ -2952,8 +2951,6 @@ SDValue DAGCombiner::visitUSUBO(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
-  if (VT.isVector())
-    return SDValue();
 
   EVT CarryVT = N->getValueType(1);
   SDLoc DL(N);
@@ -2969,11 +2966,11 @@ SDValue DAGCombiner::visitUSUBO(SDNode *N) {
                      DAG.getConstant(0, DL, CarryVT));
 
   // fold (usubo x, 0) -> x + no borrow
-  if (isNullConstant(N1))
+  if (isNullOrNullSplat(N1))
     return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
 
   // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
-  if (isAllOnesConstant(N0))
+  if (isAllOnesOrAllOnesSplat(N0))
     return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
                      DAG.getConstant(0, DL, CarryVT));
 
diff --git a/test/CodeGen/X86/combine-addo.ll b/test/CodeGen/X86/combine-addo.ll
index 23e5366f5e9..e93254e052a 100644
--- a/test/CodeGen/X86/combine-addo.ll
+++ b/test/CodeGen/X86/combine-addo.ll
@@ -62,18 +62,10 @@ define i32 @combine_uadd_zero(i32 %a0, i32 %a1) {
 define <4 x i32> @combine_vec_uadd_zero(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_vec_uadd_zero:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pmaxud %xmm0, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_uadd_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmaxud %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %1 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> zeroinitializer)
   %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
@@ -108,24 +100,23 @@ define i32 @combine_uadd_not(i32 %a0, i32 %a1) {
 define <4 x i32> @combine_vec_uadd_not(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_vec_uadd_not:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psubd %xmm2, %xmm3
-; SSE-NEXT:    pmaxud %xmm3, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    psubd %xmm0, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    pmaxud %xmm2, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_uadd_not:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vpmaxud %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-NEXT:    vpmaxud %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %1 = xor <4 x i32> %a0, <i32 -1, i32 -1, i32 -1, i32 -1>
   %2 = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
diff --git a/test/CodeGen/X86/combine-subo.ll b/test/CodeGen/X86/combine-subo.ll
index c162515c257..5113c95f920 100644
--- a/test/CodeGen/X86/combine-subo.ll
+++ b/test/CodeGen/X86/combine-subo.ll
@@ -62,18 +62,10 @@ define i32 @combine_usub_zero(i32 %a0, i32 %a1) {
 define <4 x i32> @combine_vec_usub_zero(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_vec_usub_zero:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pminud %xmm0, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_usub_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpminud %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> zeroinitializer)
   %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
@@ -138,20 +130,12 @@ define i32 @combine_usub_self(i32 %a0, i32 %a1) {
 define <4 x i32> @combine_vec_usub_self(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_vec_usub_self:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psubd %xmm0, %xmm2
-; SSE-NEXT:    pminud %xmm2, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_usub_self:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsubd %xmm0, %xmm0, %xmm2
-; AVX-NEXT:    vpminud %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a0)
   %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
@@ -183,22 +167,14 @@ define i32 @combine_usub_negone(i32 %a0, i32 %a1) {
 define <4 x i32> @combine_vec_usub_negone(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_vec_usub_negone:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    pminud %xmm2, %xmm0
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_usub_negone:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpminud %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a0)
   %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0
-- 
2.40.0
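
For readers skimming the diff without the full test files, here is a minimal standalone sketch of the vector fold this change enables; the function name below is illustrative and not part of the patch. With the vector bail-out removed and isNullOrNullSplat() used for the zero check, the uadd.with.overflow call folds to its first operand with a zero carry mask, so x86 codegen for this function reduces to a plain return of %x (compare combine_vec_uadd_zero above):

define <4 x i32> @uaddo_vec_zero_example(<4 x i32> %x) {
  ; (uaddo x, 0) -> x + no carry out, now applied to zero splats as well
  %r = call {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32> %x, <4 x i32> zeroinitializer)
  %v = extractvalue {<4 x i32>, <4 x i1>} %r, 0
  ret <4 x i32> %v
}
declare {<4 x i32>, <4 x i1>} @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)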