cl::desc("Minimum gain per loop (in cycles) threshold."),
cl::init(4), cl::Hidden);
+static cl::opt<bool> ForceMemOperand(
+ "x86-cmov-converter-force-mem-operand",
+ cl::desc("Convert cmovs to branches whenever they have memory operands."),
+ cl::init(true), cl::Hidden);
+
/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
/// Pass identification, replacement for typeid.
static char ID;
- const MachineRegisterInfo *MRI;
+ MachineRegisterInfo *MRI;
const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
TargetSchedModel TSchedModel;
/// List of consecutive CMOV instructions.
/// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
/// \returns true iff it found any CMOV-group-candidate.
bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
- CmovGroups &CmovInstGroups);
+ CmovGroups &CmovInstGroups,
+ bool IncludeLoads = false);
/// Check if it is profitable to transform each CMOV-group-candidates into
/// branch. Remove all groups that are not profitable from \p CmovInstGroups.
const TargetSubtargetInfo &STI = MF.getSubtarget();
MRI = &MF.getRegInfo();
TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
TSchedModel.init(STI.getSchedModel(), &STI, TII);
+ // Before we handle the more subtle cases of register-register CMOVs inside
+ // of potentially hot loops, we want to quickly remove all CMOVs with
+ // a memory operand. Such a CMOV risks stalling while it waits for the load
+ // to complete, a situation that speculative execution behind a branch is
+ // better suited to handle on modern x86 chips.
+ if (ForceMemOperand) {
+ CmovGroups AllCmovGroups;
+ SmallVector<MachineBasicBlock *, 4> Blocks;
+ for (auto &MBB : MF)
+ Blocks.push_back(&MBB);
+ if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
+ for (auto &Group : AllCmovGroups) {
+ // Skip any group that doesn't contain at least one memory operand cmov.
+ if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+ continue;
+
+ // For CMOV groups which we can rewrite and which contain a memory load,
+ // always rewrite them. On x86, a CMOV will dramatically amplify any
+ // memory latency by blocking speculative execution.
+ Changed = true;
+ convertCmovInstsToBranches(Group);
+ }
+ }
+ }
+
//===--------------------------------------------------------------------===//
- // Algorithm
+ // Register-operand Conversion Algorithm
// ---------
// For each inner most loop
// collectCmovCandidates() {
for (auto &Group : CmovInstGroups)
convertCmovInstsToBranches(Group);
}
+
return Changed;
}
bool X86CmovConverterPass::collectCmovCandidates(
- ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
+ ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
+ bool IncludeLoads) {
//===--------------------------------------------------------------------===//
// Collect all CMOV-group-candidates and add them into CmovInstGroups.
//
Group.clear();
// Condition code of first CMOV instruction current processed range and its
// opposite condition code.
- X86::CondCode FirstCC, FirstOppCC;
+ X86::CondCode FirstCC, FirstOppCC, MemOpCC;
// Indicator of a non CMOVrr instruction in the current processed range.
bool FoundNonCMOVInst = false;
// Indicator for current processed CMOV-group if it should be skipped.
for (auto &I : *MBB) {
X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
// Check if we found a X86::CMOVrr instruction.
- if (CC != X86::COND_INVALID && !I.mayLoad()) {
+ if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
if (Group.empty()) {
// We found first CMOV in the range, reset flags.
FirstCC = CC;
FirstOppCC = X86::GetOppositeBranchCondition(CC);
+ // Clear out the prior group's memory operand CC.
+ MemOpCC = X86::COND_INVALID;
FoundNonCMOVInst = false;
SkipGroup = false;
}
if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
// Mark the SKipGroup indicator to skip current processed CMOV-Group.
SkipGroup = true;
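+ // Track the condition code used by memory-operand CMOVs in this group. The
+ // conversion places every unfolded load on the single false path of the new
+ // branch, so loads guarded by opposite conditions cannot be handled and the
+ // group must be skipped.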
+ if (I.mayLoad()) {
+ if (MemOpCC == X86::COND_INVALID)
+ // The first memory operand CMOV.
+ MemOpCC = CC;
+ else if (CC != MemOpCC)
+ // Can't handle mixed conditions with memory operands.
+ SkipGroup = true;
+ }
continue;
}
// If Group is empty, keep looking for first CMOV in the range.
MachineInstr &MI = *Group.front();
MachineInstr *LastCMOV = Group.back();
DebugLoc DL = MI.getDebugLoc();
+
X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ // Potentially swap the condition codes so that any memory operand to a CMOV
+ // is in the *false* position instead of the *true* position. We can invert
+ // any non-memory operand CMOV instructions to cope with this and we ensure
+ // memory operand CMOVs are only included with a single condition code.
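+ // For example, if a loading CMOV in the group selects its loaded value when
+ // "above" holds:
+ //   %A = CMOVA %B (tied), (mem)
+ // then the branch to the sink block is taken on "below or equal", so the
+ // fall-through (false) path performs the load exactly when "above" holds.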
+ if (llvm::any_of(Group, [&](MachineInstr *I) {
+ return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
+ }))
+ std::swap(CC, OppCC);
+
MachineBasicBlock *MBB = MI.getParent();
MachineFunction::iterator It = ++MBB->getIterator();
MachineFunction *F = MBB->getParent();
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+ // First we need to insert an explicit load on the false path for any memory
+ // operand. We also need to potentially do register rewriting here, but it is
+ // simpler because the memory operands are always on the false path, so we
+ // can simply take that input, whatever it is.
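+ // FalseBBRegRewriteTable maps each register defined by a CMOV in this group
+ // to the value it takes on the false path, so that instructions sunk into
+ // FalseMBB (such as an unfolded load's address computation) can be rewritten
+ // to read the false-side value directly.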
+ DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
+ auto &MI = *MIIt++;
+ // Skip any CMOVs in this group which don't load from memory.
+ if (!MI.mayLoad()) {
+ // Remember the false-side register input.
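+ // A later unfolded load in this group may use this CMOV's result (e.g. as
+ // part of its address); in FalseMBB it must read the false-side value
+ // instead, so record that mapping here.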
+ FalseBBRegRewriteTable[MI.getOperand(0).getReg()] =
+ MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
+ .getReg();
+ continue;
+ }
+
+ // The condition must be the *opposite* of the one we've decided to branch
+ // on, as the branch will go *around* the load: the load should only happen
+ // when the branch is not taken.
+ assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
+ "Can only handle memory-operand cmov instructions with a condition "
+ "opposite to the selected branch direction.");
+
+ // The goal is to rewrite the cmov from:
+ //
+ // MBB:
+ // %A = CMOVcc %B (tied), (mem)
+ //
+ // to
+ //
+ // MBB:
+ // %A = CMOVcc %B (tied), %C
+ // FalseMBB:
+ // %C = MOV (mem)
+ //
+ // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
+ //
+ // MBB:
+ // JMP!cc SinkMBB
+ // FalseMBB:
+ // %C = MOV (mem)
+ // SinkMBB:
+ // %A = PHI [ %C, FalseMBB ], [ %B, MBB]
+
+ // Get a fresh register to use as the destination of the MOV.
+ const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
+ unsigned TmpReg = MRI->createVirtualRegister(RC);
+
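+ // unfoldMemoryOperand produces the load(s) that define TmpReg followed by a
+ // register-register CMOV that reads TmpReg; the CMOV is the last entry in
+ // NewMIs.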
+ SmallVector<MachineInstr *, 4> NewMIs;
+ bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
+ /*UnfoldLoad*/ true,
+ /*UnfoldStore*/ false, NewMIs);
+ (void)Unfolded;
+ assert(Unfolded && "Should never fail to unfold a loading cmov!");
+
+ // Move the new CMOV to just before the old one and reset any impacted
+ // iterator.
+ auto *NewCMOV = NewMIs.pop_back_val();
+ assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
+ "Last new instruction isn't the expected CMOV!");
+ DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+ MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
+ if (&*MIItBegin == &MI)
+ MIItBegin = MachineBasicBlock::iterator(NewCMOV);
+
+ // Sink whatever instructions were needed to produce the unfolded operand
+ // into the false block.
+ for (auto *NewMI : NewMIs) {
+ DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+ FalseMBB->insert(FalseInsertionPoint, NewMI);
+ // Re-map any operands that come from other cmovs in this group to the
+ // inputs for this block.
+ for (auto &MOp : NewMI->uses()) {
+ if (!MOp.isReg())
+ continue;
+ auto It = FalseBBRegRewriteTable.find(MOp.getReg());
+ if (It == FalseBBRegRewriteTable.end())
+ continue;
+
+ MOp.setReg(It->second);
+ // This might have been a kill when it referenced the cmov result, but
+ // it won't necessarily be once rewritten.
+ // FIXME: We could potentially improve this by tracking whether the
+ // operand to the cmov was also a kill, and then skipping the PHI node
+ // construction below.
+ MOp.setIsKill(false);
+ }
+ }
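+ // The original memory-form CMOV has been fully replaced by the new CMOV and
+ // the sunk load, so erase it.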
+ MBB->erase(MachineBasicBlock::iterator(MI),
+ std::next(MachineBasicBlock::iterator(MI)));
+
+ // Record the result of this CMOV (which will become a PHI) in the rewrite
+ // table: on the false path it takes the freshly loaded value in TmpReg.
+ FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
+ }
+
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
ret void
}
+; Test that we will always convert a cmov with a memory operand into a branch,
+; even outside of a loop.
+define i32 @test_cmov_memoperand(i32 %a, i32 %b, i32 %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %load = load i32, i32* %y
+ %z = select i1 %cond, i32 %x, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %
+ ret i32 %z
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand.
+define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %y = load i32, i32* %y.ptr
+ %z1 = select i1 %cond, i32 %x, i32 %a
+ %z2 = select i1 %cond, i32 %x, i32 %y
+ %z3 = select i1 %cond, i32 %x, i32 %b
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+ %s1 = add i32 %z1, %z2
+ %s2 = add i32 %s1, %z3
+ ret i32 %s2
+}
+
+; Same as before but with operands reversed in the select with a load.
+define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group2:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %y = load i32, i32* %y.ptr
+ %z2 = select i1 %cond, i32 %a, i32 %x
+ %z1 = select i1 %cond, i32 %y, i32 %x
+ %z3 = select i1 %cond, i32 %b, i32 %x
+; CHECK-NOT: cmov
+; CHECK: jbe [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+ %s1 = add i32 %z1, %z2
+ %s2 = add i32 %s1, %z3
+ ret i32 %s2
+}
+
+; Test that we don't convert a group of cmovs whose loads are selected under
+; conflicting conditions.
+define i32 @test_cmov_memoperand_conflicting_dir(i32 %a, i32 %b, i32 %x, i32* %y1.ptr, i32* %y2.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_conflicting_dir:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %y1 = load i32, i32* %y1.ptr
+ %y2 = load i32, i32* %y2.ptr
+ %z1 = select i1 %cond, i32 %x, i32 %y1
+ %z2 = select i1 %cond, i32 %y2, i32 %x
+; CHECK: cmoval
+; CHECK: cmoval
+ %s1 = add i32 %z1, %z2
+ ret i32 %s1
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand and where that memory operand's registers come from a prior cmov in the group.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr(i32 %a, i32 %b, i32* %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %p = select i1 %cond, i32* %x, i32* %y
+ %load = load i32, i32* %p
+ %z = select i1 %cond, i32 %a, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %eax
+; CHECK: retq
+ ret i32 %z
+}
+
+; Test that we can convert a group of two cmovs with memory operands where one
+; uses the result of the other as part of the address.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr2(i32 %a, i32 %b, i32* %x, i32** %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %load1 = load i32*, i32** %y
+ %p = select i1 %cond, i32* %x, i32* %load1
+ %load2 = load i32, i32* %p
+ %z = select i1 %cond, i32 %a, i32 %load2
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movq (%r{{..}}), %[[R1:.*]]
+; CHECK: movl (%[[R1]]), %[[R2:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R2]], %eax
+; CHECK: retq
+ ret i32 %z
+}
+
attributes #0 = {"target-cpu"="x86-64"}