#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallPtrSet.h"
MachineDominatorTree *MDT;
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
- MachineInstr &CopyDefI);
+ MachineBasicBlock::iterator CopyDefI);
unsigned promoteCondToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator TestPos,
// Nothing to do for a degenerate empty function...
return false;
+ // Collect the copies in RPO so that when there are chains where a copy is in
+ // turn copied again we visit the first one first. This ensures we can find
+ // viable locations for testing the original EFLAGS that dominate all the
+ // uses across complex CFGs.
SmallVector<MachineInstr *, 4> Copies;
- for (MachineBasicBlock &MBB : MF)
- for (MachineInstr &MI : MBB)
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT)
+ for (MachineInstr &MI : *MBB)
if (MI.getOpcode() == TargetOpcode::COPY &&
MI.getOperand(0).getReg() == X86::EFLAGS)
Copies.push_back(&MI);
if (DOp.isDead())
continue;
- MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+ MachineBasicBlock *TestMBB = CopyDefI.getParent();
auto TestPos = CopyDefI.getIterator();
DebugLoc TestLoc = CopyDefI.getDebugLoc();
LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+ // Walk up across live-in EFLAGS to find where they were actually def'ed.
+ //
+ // This copy's def may just be part of a region of blocks covered by
+ // a single def of EFLAGS and we want to find the top of that region where
+ // possible.
+ //
+ // This is essentially a search for a *candidate* reaching definition
+ // location. We don't need to ever find the actual reaching definition here,
+ // but we want to walk up the dominator tree to find the highest point which
+ // would be viable for such a definition.
+ auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ // Scan backwards as we expect these to be relatively short and often find
+ // a clobber near the end.
+ return llvm::any_of(
+ llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+ // Flag any instruction (other than the copy we are
+ // currently rewriting) that defs EFLAGS.
+ return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ };
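+ // Check whether EFLAGS is clobbered in any block that can lie on a path from
+ // BeginMBB down to EndMBB. The two endpoint blocks themselves are not
+ // scanned here; the caller checks their relevant instruction ranges
+ // separately.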
+ auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+ MachineBasicBlock *EndMBB) {
+ assert(MDT->dominates(BeginMBB, EndMBB) &&
+ "Only support paths down the dominator tree!");
+ SmallPtrSet<MachineBasicBlock *, 4> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist;
+ // We terminate at the beginning. No need to scan it.
+ Visited.insert(BeginMBB);
+ Worklist.push_back(EndMBB);
+ do {
+ auto *MBB = Worklist.pop_back_val();
+ for (auto *PredMBB : MBB->predecessors()) {
+ if (!Visited.insert(PredMBB).second)
+ continue;
+ if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+ return true;
+ // Enqueue this block to walk its predecessors.
+ Worklist.push_back(PredMBB);
+ }
+ } while (!Worklist.empty());
+ // No clobber found along any path from begin to end.
+ return false;
+ };
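+ // Walk up the dominator tree: while the current test block has the flags
+ // live-in and nothing before the test position clobbers them, try to hoist
+ // the test position into the nearest common dominator of its predecessors.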
+ while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+ !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+ // Find the nearest common dominator of the predecessors, as
+ // that will be the best candidate to hoist into.
+ MachineBasicBlock *HoistMBB =
+ std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+ *TestMBB->pred_begin(),
+ [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+ return MDT->findNearestCommonDominator(LHS, RHS);
+ });
+
+ // Now we need to scan all predecessors that may be reached along paths to
+ // the hoist block. A clobber anywhere in any of these blocks prevents the hoist.
+ // Note that this even handles loops because we require *no* clobbers.
+ if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+ break;
+
+ // We also need the terminators to not sneakily clobber flags.
+ if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+ HoistMBB->instr_end()))
+ break;
+
+ // We found a viable location, hoist our test position to it.
+ TestMBB = HoistMBB;
+ TestPos = TestMBB->getFirstTerminator()->getIterator();
+ // Clear the debug location as it would just be confusing after hoisting.
+ TestLoc = DebugLoc();
+ }
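+ // In debug builds, report which definition of EFLAGS (or which block's
+ // live-in flags) the test position ended up anchored to.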
+ LLVM_DEBUG({
+ auto DefIt = llvm::find_if(
+ llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+ [&](MachineInstr &MI) {
+ return MI.findRegisterDefOperand(X86::EFLAGS);
+ });
+ if (DefIt.base() != TestMBB->instr_begin()) {
+ dbgs() << " Using EFLAGS defined by: ";
+ DefIt->dump();
+ } else {
+ dbgs() << " Using live-in flags for BB:\n";
+ TestMBB->dump();
+ }
+ });
+
// While rewriting uses, we buffer jumps and rewrite them in a second pass
// because doing so will perturb the CFG that we are walking to find the
// uses in the first place.
// very few of them and we expect to not revisit the same copy definition
// many times. If either of those change sufficiently we could build a map
// of these up front instead.
- CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+ CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
// Collect the basic blocks we need to scan. Typically this will just be
// a single basic block but we may have to scan multiple blocks if the
SmallVector<MachineBasicBlock *, 2> Blocks;
SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
Blocks.push_back(&MBB);
- VisitedBlocks.insert(&MBB);
do {
MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
// Track if/when we find a kill of the flags in this block.
bool FlagsKilled = false;
- // We currently don't do any PHI insertion and so we require that the
- // test basic block dominates all of the use basic blocks.
- //
- // We could in theory do PHI insertion here if it becomes useful by just
- // taking undef values in along every edge that we don't trace this
- // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
- // but still seems like a great deal of complexity.
- //
- // Because it is theoretically possible that some earlier MI pass or
- // other lowering transformation could induce this to happen, we do
- // a hard check even in non-debug builds here.
- if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
- LLVM_DEBUG({
- dbgs() << "ERROR: Encountered use that is not dominated by our test "
- "basic block! Rewriting this would require inserting PHI "
- "nodes to track the flag state across the CFG.\n\nTest "
- "block:\n";
- TestMBB.dump();
- dbgs() << "Use block:\n";
- UseMBB.dump();
- });
- report_fatal_error("Cannot lower EFLAGS copy when original copy def "
- "does not dominate all uses.");
- }
-
- for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
- : UseMBB.instr_begin(),
+ // In most cases, we walk from the beginning to the end of the block. But
+ // when the block is the same block as the copy is from, we will visit it
+ // twice. The first time we start from the copy and go to the end. The
+ // second time we start from the beginning and go to the copy. This lets
+ // us handle copies inside of cycles.
+ // FIXME: This loop is *super* confusing. This is at least in part
+ // a symptom of all of this routine needing to be refactored into
+ // documentable components. Once done, there may be a better way to write
+ // this loop.
+ for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+ ? std::next(CopyI->getIterator())
+ : UseMBB.instr_begin(),
MIE = UseMBB.instr_end();
MII != MIE;) {
MachineInstr &MI = *MII++;
+ // If we are in the original copy block and encounter either the copy
+ // def or the copy itself, break so that we don't re-process any part of
+ // the block or process the instructions in the range that was copied
+ // over.
+ if (&MI == CopyI || &MI == &CopyDefI) {
+ assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+ "Should only encounter these on the second pass over the "
+ "original block.");
+ break;
+ }
+
MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
if (!FlagUse) {
if (MI.findRegisterDefOperand(X86::EFLAGS)) {
// Otherwise we can just rewrite in-place.
if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
X86::COND_INVALID) {
- rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (MI.getOpcode() == TargetOpcode::COPY) {
rewriteCopy(MI, *FlagUse, CopyDefI);
} else {
case X86::SETB_C64r:
// Use custom lowering for arithmetic that is merely extending the
// carry flag. We model this as the SETB_C* pseudo instructions.
- rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
CondRegs);
break;
default:
// Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
CondRegs);
break;
}
// and queue those up for processing.
for (MachineBasicBlock *SuccMBB : UseMBB.successors())
if (SuccMBB->isLiveIn(X86::EFLAGS) &&
- VisitedBlocks.insert(SuccMBB).second)
+ VisitedBlocks.insert(SuccMBB).second) {
+ // We currently don't do any PHI insertion and so we require that the
+ // test basic block dominates all of the use basic blocks. Further, we
+ // can't have a cycle from the test block back to itself, as that would
+ // require a PHI to break it.
+ //
+ // We could in theory do PHI insertion here if it becomes useful by
+ // just taking undef values in along every edge that we don't trace
+ // this EFLAGS copy along. This isn't as bad as fully general PHI
+ // insertion, but still seems like a great deal of complexity.
+ //
+ // Because it is theoretically possible that some earlier MI pass or
+ // other lowering transformation could induce this to happen, we do
+ // a hard check even in non-debug builds here.
+ if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+ LLVM_DEBUG({
+ dbgs()
+ << "ERROR: Encountered use that is not dominated by our test "
+ "basic block! Rewriting this would require inserting PHI "
+ "nodes to track the flag state across the CFG.\n\nTest "
+ "block:\n";
+ TestMBB->dump();
+ dbgs() << "Use block:\n";
+ SuccMBB->dump();
+ });
+ report_fatal_error(
+ "Cannot lower EFLAGS copy when original copy def "
+ "does not dominate all uses.");
+ }
+
Blocks.push_back(SuccMBB);
+ }
} while (!Blocks.empty());
// Now rewrite the jumps that use the flags. These we handle specially
else
LastJmpMBB = JmpI->getParent();
- rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
}
// FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
/// Collect any conditions that have already been set in registers so that we
/// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
- MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
CondRegArray CondRegs = {};
// Scan backwards across the range of instructions with live EFLAGS.
- for (MachineInstr &MI : llvm::reverse(
- llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+ for (MachineInstr &MI :
+ llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
TRI->isVirtualRegister(MI.getOperand(0).getReg()))
call void @foo()
ret i64 0
}
+
+ define i64 @test_mid_cycle_copies(i64 %a, i64 %b) {
+ entry:
+ call void @foo()
+ ret i64 0
+ }
...
---
name: test_branch
RET 0, $rax
...
+---
+# This test case is designed to exercise a particularly challenging situation:
+ # when the flags are copied and restored *inside* of a complex and cyclic CFG,
+ # all of whose blocks have live-in flags. To correctly handle this case we have to walk
+# up the dominator tree and locate a viable reaching definition location,
+# checking for clobbers along any path. The CFG for this function looks like the
+# following diagram, control flowing out the bottom of blocks and in the top:
+#
+# bb.0
+# | __________________
+# |/ \
+# bb.1 |
+# |\_________ |
+# | __ \ ____ |
+# |/ \ |/ \ |
+# bb.2 | bb.4 | |
+# |\__/ / \ | |
+# | / \ | |
+# bb.3 bb.5 bb.6 | |
+# | \ / | |
+# | \ / | |
+# | bb.7 | |
+# | ________/ \____/ |
+# |/ |
+# bb.8 |
+# |\__________________/
+# |
+# bb.9
+#
+# We set EFLAGS in bb.0, clobber them in bb.3, and copy them in bb.2 and bb.6.
+ # Because of the cycles, this requires hoisting the `SETcc` instructions that
+ # capture the flags for the bb.6 copy up into bb.1, and using them for the
+ # copy in `bb.2` as well despite the clobber in `bb.3`. The clobber in `bb.3` also
+# prevents hoisting the `SETcc`s up to `bb.0`.
+#
+# Throughout the test we use branch instructions that are totally bogus (as the
+# flags are obviously not changing!) but this is just to allow us to send
+# a small but complex CFG structure through the backend and force it to choose
+# plausible lowering decisions based on the core CFG presented, regardless of
+# the futility of the actual branches.
+name: test_mid_cycle_copies
+# CHECK-LABEL: name: test_mid_cycle_copies
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $rdi, $rsi
+
+ %0:gr64 = COPY $rdi
+ %1:gr64 = COPY $rsi
+ CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK: bb.0:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ JMP_1 %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.4
+ liveins: $eflags
+
+ ; Outer loop header, target for one set of hoisting.
+ JE_1 %bb.2, implicit $eflags
+ JMP_1 %bb.4
+ ; CHECK: bb.1:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags
+ ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags
+ ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.2:
+ successors: %bb.2, %bb.3
+ liveins: $eflags
+
+ ; Inner loop with a local copy. We should eliminate this but can't hoist.
+ %2:gr64 = COPY $eflags
+ $eflags = COPY %2
+ JE_1 %bb.2, implicit $eflags
+ JMP_1 %bb.3
+ ; CHECK: bb.2:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.3:
+ successors: %bb.8
+ liveins: $eflags
+
+ ; Use and then clobber $eflags. Then hop to the outer loop latch.
+ %3:gr64 = ADC64ri32 %0, 42, implicit-def dead $eflags, implicit $eflags
+ ; CHECK: bb.3:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[B_REG]], 255, implicit-def $eflags
+ ; CHECK-NEXT: %3:gr64 = ADC64ri32 %0, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+ JMP_1 %bb.8
+
+ bb.4:
+ successors: %bb.5, %bb.6
+ liveins: $eflags
+
+ ; Another inner loop, this one with a diamond.
+ JE_1 %bb.5, implicit $eflags
+ JMP_1 %bb.6
+ ; CHECK: bb.4:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.5, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.5:
+ successors: %bb.7
+ liveins: $eflags
+
+ ; Just use $eflags on this side of the diamond.
+ %4:gr64 = CMOVA64rr %0, %1, implicit $eflags
+ ; CHECK: bb.5:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+ JMP_1 %bb.7
+
+ bb.6:
+ successors: %bb.7
+ liveins: $eflags
+
+ ; Use, copy, and then use $eflags again.
+ %5:gr64 = CMOVA64rr %0, %1, implicit $eflags
+ ; CHECK: bb.6:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+
+ %6:gr64 = COPY $eflags
+ $eflags = COPY %6:gr64
+
+ %7:gr64 = CMOVA64rr %0, %1, implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+ ; CHECK-NEXT: %7:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+ JMP_1 %bb.7
+
+ bb.7:
+ successors: %bb.4, %bb.8
+ liveins: $eflags
+
+ ; Inner loop latch.
+ JE_1 %bb.4, implicit $eflags
+ JMP_1 %bb.8
+ ; CHECK: bb.7:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+ ; CHECK-NEXT: JNE_1 %bb.4, implicit killed $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.8:
+ successors: %bb.1, %bb.9
+
+ ; Outer loop latch. Note that we cannot have EFLAGS live-in here as that
+ ; would immediately require PHIs.
+ CMP64rr %0, %1, implicit-def $eflags
+ JE_1 %bb.1, implicit $eflags
+ JMP_1 %bb.9
+ ; CHECK: bb.8:
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+ ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+ ; CHECK-NEXT: JE_1 %bb.1, implicit $eflags
+ ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+ bb.9:
+ liveins: $eflags
+
+ ; And we're done.
+ %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+ $rax = COPY %8
+ RET 0, $rax
+ ; CHECK: bb.9:
+ ; CHECK-NOT: $eflags
+ ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+
+...