From 46a61cfdb62c4250ba7d1bcf083fcf0d7aa3aa8d Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 7 Oct 2019 06:27:55 +0000
Subject: [PATCH] [X86] Support LEA64_32r in processInstrForSlow3OpLEA and use
 INC/DEC when possible.

Move the erasing and iterator updating inside the function to match the
other slow LEA function.

I've adapted code from optTwoAddrLEA and basically rebuilt the
implementation here. We do lose the kill flags now, just like
optTwoAddrLEA. This runs late enough in the pipeline that it shouldn't
really be a problem.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373877 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FixupLEAs.cpp      | 190 ++++++++++++++++-----------
 test/CodeGen/X86/leaFixup32.mir      |   6 +-
 test/CodeGen/X86/leaFixup64.mir      |  20 +--
 test/CodeGen/X86/select-1-or-neg1.ll |   4 +-
 4 files changed, 125 insertions(+), 95 deletions(-)

diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index f66c6eb4ec1..543dc8b00fa 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass {
   /// - LEA that uses RIP relative addressing mode
   /// - LEA that uses 16-bit addressing mode "
   /// This function currently handles the first 2 cases only.
-  MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
-                                          MachineBasicBlock &MBB);
+  void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+                                 MachineBasicBlock &MBB, bool OptIncDec);

   /// Look for LEAs that are really two address LEAs that we might be able to
   /// turn into regular ADD instructions.
@@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
       if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
         continue;

-      if (IsSlowLEA) {
+      if (IsSlowLEA)
         processInstructionForSlowLEA(I, MBB);
-      } else if (IsSlow3OpsLEA) {
-        if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) {
-          MBB.erase(I);
-          I = NewMI;
-        }
-      }
+      else if (IsSlow3OpsLEA)
+        processInstrForSlow3OpLEA(I, MBB, OptIncDec);
     }

     // Second pass for creating LEAs. This may reverse some of the
@@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) {
          Reg == X86::R13D || Reg == X86::R13;
 }

-static inline bool isRegOperand(const MachineOperand &Op) {
-  return Op.isReg() && Op.getReg() != X86::NoRegister;
-}
-
 /// Returns true if this LEA uses base and index registers, and the base register
 /// is known to be inefficient for the subtarget.
 // TODO: use a variant scheduling class to model the latency profile
 // of LEA instructions, and implement this logic as a scheduling predicate.
 static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
                                             const MachineOperand &Index) {
-  return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
-         isRegOperand(Index);
+  return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() &&
+         Index.getReg() != X86::NoRegister;
 }

 static inline bool hasLEAOffset(const MachineOperand &Offset) {
@@ -534,112 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
   }
 }

-MachineInstr *
-FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
-                                        MachineBasicBlock &MBB) {
+void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+                                             MachineBasicBlock &MBB,
+                                             bool OptIncDec) {
+  MachineInstr &MI = *I;
   const unsigned LEAOpcode = MI.getOpcode();

-  const MachineOperand &Dst = MI.getOperand(0);
+  const MachineOperand &Dest = MI.getOperand(0);
   const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
   const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
   const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
   const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp);
   const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);

-  if (!(TII->isThreeOperandsLEA(MI) ||
-        hasInefficientLEABaseReg(Base, Index)) ||
+  if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
       !TII->isSafeToClobberEFLAGS(MBB, MI) ||
       Segment.getReg() != X86::NoRegister)
-    return nullptr;
+    return;
+
+  Register DestReg = Dest.getReg();
+  Register BaseReg = Base.getReg();
+  Register IndexReg = Index.getReg();
+
+  if (MI.getOpcode() == X86::LEA64_32r) {
+    if (BaseReg != 0)
+      BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+    if (IndexReg != 0)
+      IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+  }

-  Register DstR = Dst.getReg();
-  Register BaseR = Base.getReg();
-  Register IndexR = Index.getReg();
-  Register SSDstR =
-      (LEAOpcode == X86::LEA64_32r) ? Register(getX86SubSuperRegister(DstR, 64))
-                                    : DstR;
   bool IsScale1 = Scale.getImm() == 1;
-  bool IsInefficientBase = isInefficientLEAReg(BaseR);
-  bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+  bool IsInefficientBase = isInefficientLEAReg(BaseReg);
+  bool IsInefficientIndex = isInefficientLEAReg(IndexReg);

   // Skip these cases since it takes more than 2 instructions
   // to replace the LEA instruction.
-  if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
-    return nullptr;
-  if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
-      (IsInefficientIndex || !IsScale1))
-    return nullptr;
-
-  const DebugLoc DL = MI.getDebugLoc();
-  const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
-  const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+  if (IsInefficientBase && DestReg == BaseReg && !IsScale1)
+    return;

   LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
   LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);

+  MachineInstr *NewMI = nullptr;
+
   // First try to replace LEA with one or two (for the 3-op LEA case)
   // add instructions:
   // 1.lea (%base,%index,1), %base => add %index,%base
   // 2.lea (%base,%index,1), %index => add %base,%index
-  if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
-    const MachineOperand &Src = DstR == BaseR ? Index : Base;
-    MachineInstr *NewMI =
-        BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
-    LLVM_DEBUG(NewMI->dump(););
-    // Create ADD instruction for the Offset in case of 3-Ops LEA.
-    if (hasLEAOffset(Offset)) {
-      NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
-      LLVM_DEBUG(NewMI->dump(););
+  if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) {
+    unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+    if (DestReg != BaseReg)
+      std::swap(BaseReg, IndexReg);
+
+    if (MI.getOpcode() == X86::LEA64_32r) {
+      // TODO: Do we need the super register implicit use?
+      NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                  .addReg(BaseReg)
+                  .addReg(IndexReg)
+                  .addReg(Base.getReg(), RegState::Implicit)
+                  .addReg(Index.getReg(), RegState::Implicit);
+    } else {
+      NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                  .addReg(BaseReg)
+                  .addReg(IndexReg);
     }
-    return NewMI;
-  }
-  // If the base is inefficient try switching the index and base operands,
-  // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
-  // lea offset(%base,%index,scale),%dst =>
-  // lea (%base,%index,scale); add offset,%dst
-  if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
-    MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
-                              .add(Dst)
-                              .add(IsInefficientBase ? Index : Base)
-                              .add(Scale)
-                              .add(IsInefficientBase ? Base : Index)
-                              .addImm(0)
-                              .add(Segment);
+  } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+    // If the base is inefficient try switching the index and base operands,
+    // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+    // lea offset(%base,%index,scale),%dst =>
+    // lea (%base,%index,scale); add offset,%dst
+    NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+                .add(Dest)
+                .add(IsInefficientBase ? Index : Base)
+                .add(Scale)
+                .add(IsInefficientBase ? Base : Index)
+                .addImm(0)
+                .add(Segment);
     LLVM_DEBUG(NewMI->dump(););
+  }
+
+  // If either replacement succeeded above, add the offset if needed, then
+  // replace the instruction.
+  if (NewMI) {
     // Create ADD instruction for the Offset in case of 3-Ops LEA.
     if (hasLEAOffset(Offset)) {
-      NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
-      LLVM_DEBUG(NewMI->dump(););
+      if (OptIncDec && Offset.isImm() &&
+          (Offset.getImm() == 1 || Offset.getImm() == -1)) {
+        unsigned NewOpc =
+            getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1);
+        NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                    .addReg(DestReg);
+        LLVM_DEBUG(NewMI->dump(););
+      } else {
+        unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset);
+        NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                    .addReg(DestReg)
+                    .add(Offset);
+        LLVM_DEBUG(NewMI->dump(););
+      }
     }
-    return NewMI;
+
+    MBB.erase(I);
+    I = NewMI;
+    return;
   }
+
   // Handle the rest of the cases with inefficient base register:
-  assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+  assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!");
   assert(IsInefficientBase && "efficient base should be handled already!");

+  // FIXME: Handle LEA64_32r.
+  if (LEAOpcode == X86::LEA64_32r)
+    return;
+
   // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
   if (IsScale1 && !hasLEAOffset(Offset)) {
-    bool BIK = Base.isKill() && BaseR != IndexR;
-    TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK);
+    bool BIK = Base.isKill() && BaseReg != IndexReg;
+    TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK);
     LLVM_DEBUG(MI.getPrevNode()->dump(););

-    MachineInstr *NewMI =
-        BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+    unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+    NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                .addReg(DestReg)
+                .add(Index);
     LLVM_DEBUG(NewMI->dump(););
-    return NewMI;
+    return;
   }
+
   // lea offset(%base,%index,scale), %dst =>
   // lea offset( ,%index,scale), %dst; add %base,%dst
-  MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
-                            .add(Dst)
-                            .addReg(0)
-                            .add(Scale)
-                            .add(Index)
-                            .add(Offset)
-                            .add(Segment);
+  NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+              .add(Dest)
+              .addReg(0)
+              .add(Scale)
+              .add(Index)
+              .add(Offset)
+              .add(Segment);
   LLVM_DEBUG(NewMI->dump(););

-  NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+  unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+  NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+              .addReg(DestReg)
+              .add(Base);
   LLVM_DEBUG(NewMI->dump(););
-  return NewMI;
+
+  MBB.erase(I);
+  I = NewMI;
 }
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
index f614a4ad975..ede0df7c77d 100644
--- a/test/CodeGen/X86/leaFixup32.mir
+++ b/test/CodeGen/X86/leaFixup32.mir
@@ -104,7 +104,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $eax, $ebp
-    ; CHECK: $eax = ADD32rr $eax, killed $ebp
+    ; CHECK: $eax = ADD32rr $eax, $ebp
     ; CHECK: $eax = ADD32ri8 $eax, -5
     $eax = LEA32r killed $eax, 1, killed $ebp, -5, $noreg

@@ -139,7 +139,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $eax, $ebp
-    ; CHECK: $ebp = ADD32rr $ebp, killed $eax
+    ; CHECK: $ebp = ADD32rr $ebp, $eax
     ; CHECK: $ebp = ADD32ri8 $ebp, -5
     $ebp = LEA32r killed $ebp, 1, killed $eax, -5, $noreg

@@ -315,7 +315,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $eax, $ebp
-    ; CHECK: $eax = ADD32rr $eax, killed $ebp
+    ; CHECK: $eax = ADD32rr $eax, $ebp
     ; CHECK: $eax = ADD32ri $eax, 129
     $eax = LEA32r killed $eax, 1, killed $ebp, 129, $noreg

diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
index 317c219992c..4e9c47b11fc 100644
--- a/test/CodeGen/X86/leaFixup64.mir
+++ b/test/CodeGen/X86/leaFixup64.mir
@@ -177,8 +177,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $eax = LEA64_32r killed $rax, 1, killed $rbp, 0
-    ; CHECK: $eax = ADD32ri8 $eax, -5
+    ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags, implicit $rax, implicit $rbp
+    ; CHECK: $eax = ADD32ri8 $eax, -5, implicit-def $eflags
     $eax = LEA64_32r killed $rax, 1, killed $rbp, -5, $noreg
     RETQ $eax

@@ -212,8 +212,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $ebp = LEA64_32r killed $rax, 1, killed $rbp, 0
-    ; CHECK: $ebp = ADD32ri8 $ebp, -5
+    ; CHECK: $ebp = ADD32rr $ebp, $eax, implicit-def $eflags, implicit $rbp, implicit $rax
+    ; CHECK: $ebp = ADD32ri8 $ebp, -5, implicit-def $eflags
     $ebp = LEA64_32r killed $rbp, 1, killed $rax, -5, $noreg
     RETQ $ebp

@@ -281,7 +281,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $rax = ADD64rr $rax, killed $rbp
+    ; CHECK: $rax = ADD64rr $rax, $rbp
     ; CHECK: $rax = ADD64ri8 $rax, -5
     $rax = LEA64r killed $rax, 1, killed $rbp, -5, $noreg

@@ -316,7 +316,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $rbp = ADD64rr $rbp, killed $rax
+    ; CHECK: $rbp = ADD64rr $rbp, $rax
     ; CHECK: $rbp = ADD64ri8 $rbp, -5
     $rbp = LEA64r killed $rbp, 1, killed $rax, -5, $noreg

@@ -635,8 +635,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $eax = LEA64_32r killed $rax, 1, killed $rbp, 0
-    ; CHECK: $eax = ADD32ri $eax, 129
+    ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags
+    ; CHECK: $eax = ADD32ri $eax, 129, implicit-def $eflags
     $eax = LEA64_32r killed $rax, 1, killed $rbp, 129, $noreg
     RETQ $eax

@@ -772,8 +772,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $rax = ADD64rr $rax, killed $rbp
-    ; CHECK: $rax = ADD64ri32 $rax, 129
+    ; CHECK: $rax = ADD64rr $rax, $rbp, implicit-def $eflags
+    ; CHECK: $rax = ADD64ri32 $rax, 129, implicit-def $eflags
     $rax = LEA64r killed $rax, 1, killed $rbp, 129, $noreg
     RETQ $eax

diff --git a/test/CodeGen/X86/select-1-or-neg1.ll b/test/CodeGen/X86/select-1-or-neg1.ll
index b0244fe7d99..c85cc08f886 100644
--- a/test/CodeGen/X86/select-1-or-neg1.ll
+++ b/test/CodeGen/X86/select-1-or-neg1.ll
@@ -19,8 +19,8 @@ define i32 @PR28968(i32 %x) {
 ; SLOWLEA3-NEXT:    xorl %eax, %eax
 ; SLOWLEA3-NEXT:    cmpl $1, %edi
 ; SLOWLEA3-NEXT:    sete %al
-; SLOWLEA3-NEXT:    leal (%rax,%rax), %eax
-; SLOWLEA3-NEXT:    addl $-1, %eax
+; SLOWLEA3-NEXT:    addl %eax, %eax
+; SLOWLEA3-NEXT:    decl %eax
 ; SLOWLEA3-NEXT:    retq
   %cmp = icmp eq i32 %x, 1
   %sel = select i1 %cmp, i32 1, i32 -1
-- 
2.40.0
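
For illustration, here is a minimal sketch of the rewrites this patch enables,
in x86 AT&T assembly. The register choices are taken from the leaFixup64.mir
and select-1-or-neg1.ll checks above; the rewrite only fires on subtargets
with the slow-3ops-LEA tuning, and the INC/DEC form additionally requires
OptIncDec:

    # A three-operand LEA64_32r is now split into two ADDs:
    #   before:  leal -5(%rax,%rbp), %eax
    #   after:   addl %ebp, %eax
    #            addl $-5, %eax
    #
    # A displacement of +1/-1 uses INC/DEC instead of an ADD-immediate:
    #   before:  leal -1(%rax,%rax), %eax
    #   after:   addl %eax, %eax
    #            decl %eax

In every case the LEA is replaced by at most two cheap ALU instructions; as
the "Skip these cases" comment in processInstrForSlow3OpLEA notes, candidates
that would need more than two replacement instructions are left alone.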