From 25df0ed1c502d2044da0f796187e183f15514815 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 3 Feb 2019 17:53:09 +0000 Subject: [PATCH] [CGP] adjust target constraints for forming uaddo There are 2 changes visible here: 1. There's no reason to limit this transform based on number of condition registers. That diff allows PPC to produce slightly better (dot-instructions should be generally good) code. Note: someone that cares about PPC codegen might want to look closer at that output because it seems like we could still improve this. 2. We (probably?) should not bother trying to form uaddo (or other overflow ops) when there's no target support for such an op. This goes beyond checking whether the op is expanded because both PPC and AArch64 show better codegen for standard types regardless of whether the op is legal/custom. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353001 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 19 ++++--- test/CodeGen/PowerPC/sat-add.ll | 50 +++++++++---------- test/CodeGen/X86/codegen-prepare-uaddo.ll | 13 +++-- .../CodeGenPrepare/X86/overflow-intrinsics.ll | 9 ++-- 4 files changed, 45 insertions(+), 46 deletions(-) diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 792e4a537ea..bcb899a9e02 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1149,20 +1149,22 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, /// Try to combine the compare into a call to the llvm.uadd.with.overflow /// intrinsic. Return true if any changes were made. -static bool combineToUAddWithOverflow(CmpInst *Cmp, const TargetLowering &TLI) { - // TODO: Why is this transform limited by this condition? 
- if (TLI.hasMultipleConditionRegisters()) - return false; - +static bool combineToUAddWithOverflow(CmpInst *Cmp, const TargetLowering &TLI, + const DataLayout &DL) { Value *A, *B; Instruction *AddI; if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI)))) return false; + // Allow the transform as long as we have an integer type that is not + // obviously illegal and unsupported. Type *Ty = AddI->getType(); if (!isa<IntegerType>(Ty)) return false; + EVT CodegenVT = TLI.getValueType(DL, Ty); + if (!CodegenVT.isSimple() && TLI.isOperationExpand(ISD::UADDO, CodegenVT)) + return false; // We don't want to move around uses of condition values this late, so we we // check if it is legal to create the call to the intrinsic in the basic @@ -1263,11 +1265,12 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { return MadeChange; } -static bool optimizeCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { +static bool optimizeCmpExpression(CmpInst *Cmp, const TargetLowering &TLI, + const DataLayout &DL) { if (sinkCmpExpression(Cmp, TLI)) return true; - if (combineToUAddWithOverflow(Cmp, TLI)) + if (combineToUAddWithOverflow(Cmp, TLI, DL)) return true; return false; @@ -6714,7 +6717,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { } if (CmpInst *CI = dyn_cast<CmpInst>(I)) - if (TLI && optimizeCmpExpression(CI, *TLI)) + if (TLI && optimizeCmpExpression(CI, *TLI, *DL)) return true; if (LoadInst *LI = dyn_cast<LoadInst>(I)) { diff --git a/test/CodeGen/PowerPC/sat-add.ll b/test/CodeGen/PowerPC/sat-add.ll index 515ddfe976c..7f4e1b8cc2a 100644 --- a/test/CodeGen/PowerPC/sat-add.ll +++ b/test/CodeGen/PowerPC/sat-add.ll @@ -24,12 +24,11 @@ define i8 @unsigned_sat_constant_i8_using_min(i8 %x) { define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) { ; CHECK-LABEL: unsigned_sat_constant_i8_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: addi 5, 3, 42 ; CHECK-NEXT: rlwinm 3, 3, 0, 24, 31 +; CHECK-NEXT: addi 3, 3, 42 +; CHECK-NEXT: andi. 
4, 3, 256 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: clrlwi 6, 5, 24 -; CHECK-NEXT: cmplw 3, 6 -; CHECK-NEXT: isel 3, 4, 5, 1 +; CHECK-NEXT: isel 3, 3, 4, 2 ; CHECK-NEXT: blr %a = add i8 %x, 42 %c = icmp ugt i8 %x, %a @@ -70,12 +69,11 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) { define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: addi 5, 3, 42 ; CHECK-NEXT: rlwinm 3, 3, 0, 16, 31 +; CHECK-NEXT: addi 3, 3, 42 +; CHECK-NEXT: andis. 4, 3, 1 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: clrlwi 6, 5, 16 -; CHECK-NEXT: cmplw 3, 6 -; CHECK-NEXT: isel 3, 4, 5, 1 +; CHECK-NEXT: isel 3, 3, 4, 2 ; CHECK-NEXT: blr %a = add i16 %x, 42 %c = icmp ugt i16 %x, %a @@ -117,8 +115,8 @@ define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: addi 5, 3, 42 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: cmplw 0, 3, 5 -; CHECK-NEXT: isel 3, 4, 5, 1 +; CHECK-NEXT: cmplw 0, 5, 3 +; CHECK-NEXT: isel 3, 4, 5, 0 ; CHECK-NEXT: blr %a = add i32 %x, 42 %c = icmp ugt i32 %x, %a @@ -160,8 +158,8 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: addi 5, 3, 42 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: cmpld 3, 5 -; CHECK-NEXT: isel 3, 4, 5, 1 +; CHECK-NEXT: cmpld 5, 3 +; CHECK-NEXT: isel 3, 4, 5, 0 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, %a @@ -204,12 +202,12 @@ define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) { define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) { ; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: rlwinm 4, 4, 0, 24, 31 ; CHECK-NEXT: rlwinm 3, 3, 0, 24, 31 -; CHECK-NEXT: li 5, -1 -; CHECK-NEXT: clrlwi 6, 4, 24 -; CHECK-NEXT: cmplw 3, 6 -; CHECK-NEXT: isel 3, 5, 4, 1 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: andi. 
4, 3, 256 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: isel 3, 3, 4, 2 ; CHECK-NEXT: blr %a = add i8 %x, %y %c = icmp ugt i8 %x, %a @@ -255,12 +253,12 @@ define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) { define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) { ; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: rlwinm 4, 4, 0, 16, 31 ; CHECK-NEXT: rlwinm 3, 3, 0, 16, 31 -; CHECK-NEXT: li 5, -1 -; CHECK-NEXT: clrlwi 6, 4, 16 -; CHECK-NEXT: cmplw 3, 6 -; CHECK-NEXT: isel 3, 5, 4, 1 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: andis. 4, 3, 1 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: isel 3, 3, 4, 2 ; CHECK-NEXT: blr %a = add i16 %x, %y %c = icmp ugt i16 %x, %a @@ -306,8 +304,8 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: add 4, 3, 4 ; CHECK-NEXT: li 5, -1 -; CHECK-NEXT: cmplw 0, 3, 4 -; CHECK-NEXT: isel 3, 5, 4, 1 +; CHECK-NEXT: cmplw 0, 4, 3 +; CHECK-NEXT: isel 3, 5, 4, 0 ; CHECK-NEXT: blr %a = add i32 %x, %y %c = icmp ugt i32 %x, %a @@ -351,8 +349,8 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: add 4, 3, 4 ; CHECK-NEXT: li 5, -1 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: isel 3, 5, 4, 1 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: isel 3, 5, 4, 0 ; CHECK-NEXT: blr %a = add i64 %x, %y %c = icmp ugt i64 %x, %a diff --git a/test/CodeGen/X86/codegen-prepare-uaddo.ll b/test/CodeGen/X86/codegen-prepare-uaddo.ll index dbf32f0782f..2bc13cc57d2 100644 --- a/test/CodeGen/X86/codegen-prepare-uaddo.ll +++ b/test/CodeGen/X86/codegen-prepare-uaddo.ll @@ -252,15 +252,14 @@ define void @test_18446744073709551615(i64*, i64*) { define i1 @illegal_type(i17 %x, i17* %p) { ; CHECK-LABEL: illegal_type: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $131071, %edi # imm = 0x1FFFF ; CHECK-NEXT: addl $29, %edi -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: andl $131071, %ecx # imm = 0x1FFFF -; CHECK-NEXT: cmpl 
%edi, %ecx -; CHECK-NEXT: setne %al ; CHECK-NEXT: movw %di, (%rsi) -; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: movb %cl, 2(%rsi) +; CHECK-NEXT: andl $131071, %edi # imm = 0x1FFFF +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: movb %al, 2(%rsi) +; CHECK-NEXT: cmpl $29, %edi +; CHECK-NEXT: setb %al ; CHECK-NEXT: retq %a = add i17 %x, 29 store i17 %a, i17* %p diff --git a/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll b/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll index 6b91a3b3c18..6be9661cc63 100644 --- a/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll +++ b/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll @@ -163,11 +163,10 @@ define i1 @uaddo_i16_increment_noncanonical_3(i16 %x, i16* %p) { define i1 @uaddo_i42_increment_illegal_type(i42 %x, i42* %p) { ; CHECK-LABEL: @uaddo_i42_increment_illegal_type( -; CHECK-NEXT: [[UADD_OVERFLOW:%.*]] = call { i42, i1 } @llvm.uadd.with.overflow.i42(i42 [[X:%.*]], i42 1) -; CHECK-NEXT: [[UADD:%.*]] = extractvalue { i42, i1 } [[UADD_OVERFLOW]], 0 -; CHECK-NEXT: [[OVERFLOW:%.*]] = extractvalue { i42, i1 } [[UADD_OVERFLOW]], 1 -; CHECK-NEXT: store i42 [[UADD]], i42* [[P:%.*]] -; CHECK-NEXT: ret i1 [[OVERFLOW]] +; CHECK-NEXT: [[A:%.*]] = add i42 [[X:%.*]], 1 +; CHECK-NEXT: [[OV:%.*]] = icmp eq i42 [[A]], 0 +; CHECK-NEXT: store i42 [[A]], i42* [[P:%.*]] +; CHECK-NEXT: ret i1 [[OV]] ; %a = add i42 %x, 1 %ov = icmp eq i42 %a, 0 -- 2.40.0