From 46a61cfdb62c4250ba7d1bcf083fcf0d7aa3aa8d Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 7 Oct 2019 06:27:55 +0000
Subject: [PATCH] [X86] Support LEA64_32r in processInstrForSlow3OpLEA and use
 INC/DEC when possible.

Move the erasing and iterator updating inside the function to match the
other slow LEA function.

I've adapted code from optTwoAddrLEA and basically rebuilt the
implementation here. We do lose the kill flags now, just like
optTwoAddrLEA. This runs late enough in the pipeline that it shouldn't
really be a problem.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373877 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FixupLEAs.cpp      | 190 ++++++++++++++++-----------
 test/CodeGen/X86/leaFixup32.mir      |   6 +-
 test/CodeGen/X86/leaFixup64.mir      |  20 +--
 test/CodeGen/X86/select-1-or-neg1.ll |   4 +-
 4 files changed, 125 insertions(+), 95 deletions(-)

diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index f66c6eb4ec1..543dc8b00fa 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass {
   /// - LEA that uses RIP relative addressing mode
   /// - LEA that uses 16-bit addressing mode "
   /// This function currently handles the first 2 cases only.
-  MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
-                                          MachineBasicBlock &MBB);
+  void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+                                 MachineBasicBlock &MBB, bool OptIncDec);

   /// Look for LEAs that are really two address LEAs that we might be able to
   /// turn into regular ADD instructions.
@@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
       if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
         continue;

-      if (IsSlowLEA) {
+      if (IsSlowLEA)
         processInstructionForSlowLEA(I, MBB);
-      } else if (IsSlow3OpsLEA) {
-        if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) {
-          MBB.erase(I);
-          I = NewMI;
-        }
-      }
+      else if (IsSlow3OpsLEA)
+        processInstrForSlow3OpLEA(I, MBB, OptIncDec);
     }

     // Second pass for creating LEAs. This may reverse some of the
@@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) {
          Reg == X86::R13D || Reg == X86::R13;
 }

-static inline bool isRegOperand(const MachineOperand &Op) {
-  return Op.isReg() && Op.getReg() != X86::NoRegister;
-}
-
 /// Returns true if this LEA uses base and index registers, and the base register
 /// is known to be inefficient for the subtarget.
 // TODO: use a variant scheduling class to model the latency profile
 // of LEA instructions, and implement this logic as a scheduling predicate.
 static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
                                             const MachineOperand &Index) {
-  return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
-         isRegOperand(Index);
+  return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() &&
+         Index.getReg() != X86::NoRegister;
 }

 static inline bool hasLEAOffset(const MachineOperand &Offset) {
@@ -534,112 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
   }
 }

-MachineInstr *
-FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
-                                        MachineBasicBlock &MBB) {
+void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+                                             MachineBasicBlock &MBB,
+                                             bool OptIncDec) {
+  MachineInstr &MI = *I;
   const unsigned LEAOpcode = MI.getOpcode();

-  const MachineOperand &Dst = MI.getOperand(0);
+  const MachineOperand &Dest = MI.getOperand(0);
   const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
   const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
   const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
   const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp);
   const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);

-  if (!(TII->isThreeOperandsLEA(MI) ||
-        hasInefficientLEABaseReg(Base, Index)) ||
+  if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
       !TII->isSafeToClobberEFLAGS(MBB, MI) ||
       Segment.getReg() != X86::NoRegister)
-    return nullptr;
+    return;
+
+  Register DestReg = Dest.getReg();
+  Register BaseReg = Base.getReg();
+  Register IndexReg = Index.getReg();
+
+  if (MI.getOpcode() == X86::LEA64_32r) {
+    if (BaseReg != 0)
+      BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+    if (IndexReg != 0)
+      IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+  }

-  Register DstR = Dst.getReg();
-  Register BaseR = Base.getReg();
-  Register IndexR = Index.getReg();
-  Register SSDstR =
-      (LEAOpcode == X86::LEA64_32r) ? Register(getX86SubSuperRegister(DstR, 64))
-                                    : DstR;
   bool IsScale1 = Scale.getImm() == 1;
-  bool IsInefficientBase = isInefficientLEAReg(BaseR);
-  bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+  bool IsInefficientBase = isInefficientLEAReg(BaseReg);
+  bool IsInefficientIndex = isInefficientLEAReg(IndexReg);

   // Skip these cases since it takes more than 2 instructions
   // to replace the LEA instruction.
-  if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
-    return nullptr;
-  if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
-      (IsInefficientIndex || !IsScale1))
-    return nullptr;
-
-  const DebugLoc DL = MI.getDebugLoc();
-  const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
-  const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+  if (IsInefficientBase && DestReg == BaseReg && !IsScale1)
+    return;

   LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
   LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);

+  MachineInstr *NewMI = nullptr;
+
   // First try to replace LEA with one or two (for the 3-op LEA case)
   // add instructions:
   // 1.lea (%base,%index,1), %base => add %index,%base
   // 2.lea (%base,%index,1), %index => add %base,%index
-  if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
-    const MachineOperand &Src = DstR == BaseR ? Index : Base;
-    MachineInstr *NewMI =
-        BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
-    LLVM_DEBUG(NewMI->dump(););
-    // Create ADD instruction for the Offset in case of 3-Ops LEA.
-    if (hasLEAOffset(Offset)) {
-      NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
-      LLVM_DEBUG(NewMI->dump(););
+  if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) {
+    unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+    if (DestReg != BaseReg)
+      std::swap(BaseReg, IndexReg);
+
+    if (MI.getOpcode() == X86::LEA64_32r) {
+      // TODO: Do we need the super register implicit use?
+      NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                  .addReg(BaseReg)
+                  .addReg(IndexReg)
+                  .addReg(Base.getReg(), RegState::Implicit)
+                  .addReg(Index.getReg(), RegState::Implicit);
+    } else {
+      NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                  .addReg(BaseReg)
+                  .addReg(IndexReg);
     }
-    return NewMI;
-  }
-  // If the base is inefficient try switching the index and base operands,
-  // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
-  // lea offset(%base,%index,scale),%dst =>
-  // lea (%base,%index,scale); add offset,%dst
-  if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
-    MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
-                              .add(Dst)
-                              .add(IsInefficientBase ? Index : Base)
-                              .add(Scale)
-                              .add(IsInefficientBase ? Base : Index)
-                              .addImm(0)
-                              .add(Segment);
+  } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+    // If the base is inefficient try switching the index and base operands,
+    // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+    // lea offset(%base,%index,scale),%dst =>
+    // lea (%base,%index,scale); add offset,%dst
+    NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+                .add(Dest)
+                .add(IsInefficientBase ? Index : Base)
+                .add(Scale)
+                .add(IsInefficientBase ? Base : Index)
+                .addImm(0)
+                .add(Segment);
     LLVM_DEBUG(NewMI->dump(););
+  }
+
+  // If either replacement succeeded above, add the offset if needed, then
+  // replace the instruction.
+  if (NewMI) {
     // Create ADD instruction for the Offset in case of 3-Ops LEA.
     if (hasLEAOffset(Offset)) {
-      NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
-      LLVM_DEBUG(NewMI->dump(););
+      if (OptIncDec && Offset.isImm() &&
+          (Offset.getImm() == 1 || Offset.getImm() == -1)) {
+        unsigned NewOpc =
+            getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1);
+        NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                    .addReg(DestReg);
+        LLVM_DEBUG(NewMI->dump(););
+      } else {
+        unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset);
+        NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                    .addReg(DestReg)
+                    .add(Offset);
+        LLVM_DEBUG(NewMI->dump(););
+      }
     }
-    return NewMI;
+
+    MBB.erase(I);
+    I = NewMI;
+    return;
   }
+
   // Handle the rest of the cases with inefficient base register:
-  assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+  assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!");
   assert(IsInefficientBase && "efficient base should be handled already!");

+  // FIXME: Handle LEA64_32r.
+  if (LEAOpcode == X86::LEA64_32r)
+    return;
+
   // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
   if (IsScale1 && !hasLEAOffset(Offset)) {
-    bool BIK = Base.isKill() && BaseR != IndexR;
-    TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK);
+    bool BIK = Base.isKill() && BaseReg != IndexReg;
+    TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK);
     LLVM_DEBUG(MI.getPrevNode()->dump(););

-    MachineInstr *NewMI =
-        BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+    unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+    NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+                .addReg(DestReg)
+                .add(Index);
     LLVM_DEBUG(NewMI->dump(););
-    return NewMI;
+    return;
   }
+
   // lea offset(%base,%index,scale), %dst =>
   // lea offset( ,%index,scale), %dst; add %base,%dst
-  MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
-                            .add(Dst)
-                            .addReg(0)
-                            .add(Scale)
-                            .add(Index)
-                            .add(Offset)
-                            .add(Segment);
+  NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+              .add(Dest)
+              .addReg(0)
+              .add(Scale)
+              .add(Index)
+              .add(Offset)
+              .add(Segment);
   LLVM_DEBUG(NewMI->dump(););

-  NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+  unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+  NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+              .addReg(DestReg)
+              .add(Base);
   LLVM_DEBUG(NewMI->dump(););
-  return NewMI;
+
+  MBB.erase(I);
+  I = NewMI;
 }
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
index f614a4ad975..ede0df7c77d 100644
--- a/test/CodeGen/X86/leaFixup32.mir
+++ b/test/CodeGen/X86/leaFixup32.mir
@@ -104,7 +104,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $eax, $ebp
-    ; CHECK: $eax = ADD32rr $eax, killed $ebp
+    ; CHECK: $eax = ADD32rr $eax, $ebp
     ; CHECK: $eax = ADD32ri8 $eax, -5
     $eax = LEA32r killed $eax, 1, killed $ebp, -5, $noreg

@@ -139,7 +139,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $eax, $ebp
-    ; CHECK: $ebp = ADD32rr $ebp, killed $eax
+    ; CHECK: $ebp = ADD32rr $ebp, $eax
     ; CHECK: $ebp = ADD32ri8 $ebp, -5
     $ebp = LEA32r killed $ebp, 1, killed $eax, -5, $noreg

@@ -315,7 +315,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $eax, $ebp
-    ; CHECK: $eax = ADD32rr $eax, killed $ebp
+    ; CHECK: $eax = ADD32rr $eax, $ebp
     ; CHECK: $eax = ADD32ri $eax, 129
     $eax = LEA32r killed $eax, 1, killed $ebp, 129, $noreg

diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
index 317c219992c..4e9c47b11fc 100644
--- a/test/CodeGen/X86/leaFixup64.mir
+++ b/test/CodeGen/X86/leaFixup64.mir
@@ -177,8 +177,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $eax = LEA64_32r killed $rax, 1, killed $rbp, 0
-    ; CHECK: $eax = ADD32ri8 $eax, -5
+    ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags, implicit $rax, implicit $rbp
+    ; CHECK: $eax = ADD32ri8 $eax, -5, implicit-def $eflags
     $eax = LEA64_32r killed $rax, 1, killed $rbp, -5, $noreg
     RETQ $eax

@@ -212,8 +212,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $ebp = LEA64_32r killed $rax, 1, killed $rbp, 0
-    ; CHECK: $ebp = ADD32ri8 $ebp, -5
+    ; CHECK: $ebp = ADD32rr $ebp, $eax, implicit-def $eflags, implicit $rbp, implicit $rax
+    ; CHECK: $ebp = ADD32ri8 $ebp, -5, implicit-def $eflags
     $ebp = LEA64_32r killed $rbp, 1, killed $rax, -5, $noreg
     RETQ $ebp

@@ -281,7 +281,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $rax = ADD64rr $rax, killed $rbp
+    ; CHECK: $rax = ADD64rr $rax, $rbp
     ; CHECK: $rax = ADD64ri8 $rax, -5
     $rax = LEA64r killed $rax, 1, killed $rbp, -5, $noreg

@@ -316,7 +316,7 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $rbp = ADD64rr $rbp, killed $rax
+    ; CHECK: $rbp = ADD64rr $rbp, $rax
     ; CHECK: $rbp = ADD64ri8 $rbp, -5
     $rbp = LEA64r killed $rbp, 1, killed $rax, -5, $noreg

@@ -635,8 +635,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $eax = LEA64_32r killed $rax, 1, killed $rbp, 0
-    ; CHECK: $eax = ADD32ri $eax, 129
+    ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags
+    ; CHECK: $eax = ADD32ri $eax, 129, implicit-def $eflags
     $eax = LEA64_32r killed $rax, 1, killed $rbp, 129, $noreg
     RETQ $eax

@@ -772,8 +772,8 @@ frameInfo:
 body:             |
   bb.0 (%ir-block.0):
     liveins: $rax, $rbp
-    ; CHECK: $rax = ADD64rr $rax, killed $rbp
-    ; CHECK: $rax = ADD64ri32 $rax, 129
+    ; CHECK: $rax = ADD64rr $rax, $rbp, implicit-def $eflags
+    ; CHECK: $rax = ADD64ri32 $rax, 129, implicit-def $eflags
     $rax = LEA64r killed $rax, 1, killed $rbp, 129, $noreg
     RETQ $eax

diff --git a/test/CodeGen/X86/select-1-or-neg1.ll b/test/CodeGen/X86/select-1-or-neg1.ll
index b0244fe7d99..c85cc08f886 100644
--- a/test/CodeGen/X86/select-1-or-neg1.ll
+++ b/test/CodeGen/X86/select-1-or-neg1.ll
@@ -19,8 +19,8 @@ define i32 @PR28968(i32 %x) {
 ; SLOWLEA3-NEXT:    xorl %eax, %eax
 ; SLOWLEA3-NEXT:    cmpl $1, %edi
 ; SLOWLEA3-NEXT:    sete %al
-; SLOWLEA3-NEXT:    leal (%rax,%rax), %eax
-; SLOWLEA3-NEXT:    addl $-1, %eax
+; SLOWLEA3-NEXT:    addl %eax, %eax
+; SLOWLEA3-NEXT:    decl %eax
 ; SLOWLEA3-NEXT:    retq
   %cmp = icmp eq i32 %x, 1
   %sel = select i1 %cmp, i32 1, i32 -1
-- 
2.40.0
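
For illustration, here is a minimal sketch of the rewrites this patch enables,
in x86 AT&T assembly. The register choices are taken from the leaFixup64.mir
and select-1-or-neg1.ll checks above; the rewrite only fires on subtargets
with the slow-3ops-LEA tuning, and the INC/DEC form additionally requires
OptIncDec:

    # A three-operand LEA64_32r is now split into two ADDs:
    #   before:  leal -5(%rax,%rbp), %eax
    #   after:   addl %ebp, %eax
    #            addl $-5, %eax
    #
    # A displacement of +1/-1 uses INC/DEC instead of an ADD-immediate:
    #   before:  leal -1(%rax,%rax), %eax
    #   after:   addl %eax, %eax
    #            decl %eax

In every case the LEA is replaced by at most two cheap ALU instructions; as
the "Skip these cases" comment in processInstrForSlow3OpLEA notes, candidates
that would need more than two replacement instructions are left alone.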