From: Chandler Carruth <chandlerc@gmail.com>
Date: Tue, 24 Jul 2018 00:21:59 +0000 (+0000)
Subject: [x86/SLH] Remove complex SHRX-based post-load hardening.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3813cf4b746a4e60025b8a85934288b3d1e24e79;p=llvm

[x86/SLH] Remove complex SHRX-based post-load hardening.

This code was really nasty, had several bugs in it originally, and
wasn't carrying its weight. While on Zen we have all 4 ports available
for SHRX, on all of the Intel parts with Agner's tables, SHRX can only
execute on 2 ports, giving it 1/2 the throughput of OR.

Worse, all too often this pattern required two SHRX instructions in
a chain, hurting the critical path by a lot.

Even if we end up needing to safe/restore EFLAGS, that is no longer so
bad. We pay for a uop to save the flag, but we very likely get fusion
when it is used by forming a test/jCC pair or something similar. In
practice, I don't expect the SHRX to be a significant savings here, so
I'd like to avoid the complex code required. We can always resurrect
this if/when someone has a specific performance issue addressed by it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337781 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 3d2bbd3e55c..93dcf95bebd 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1907,81 +1907,18 @@ void X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
 
   auto InsertPt = std::next(MI.getIterator());
   unsigned FlagsReg = 0;
-  bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI);
-  if (EFLAGSLive && !Subtarget->hasBMI2()) {
+  if (isEFLAGSLive(MBB, InsertPt, *TRI))
     FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
-    EFLAGSLive = false;
-  }
 
-  if (!EFLAGSLive) {
-    unsigned StateReg = GetStateRegInRC(*DefRC);
-    unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
-    DefOp.setReg(NewDefReg);
-    auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
-                   .addReg(StateReg)
-                   .addReg(NewDefReg);
-    OrI->addRegisterDead(X86::EFLAGS, TRI);
-    ++NumInstsInserted;
-    LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
-  } else {
-    assert(Subtarget->hasBMI2() &&
-           "Cannot harden loads and preserve EFLAGS without BMI2!");
-
-    unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr;
-    auto &ShiftRC =
-        DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass;
-    int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8;
-    unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)];
-
-    unsigned StateReg = GetStateRegInRC(ShiftRC);
-
-    // First have the def instruction def a temporary register.
-    unsigned TmpReg = MRI->createVirtualRegister(DefRC);
-    DefOp.setReg(TmpReg);
-    // Now copy it into a register of the shift RC.
-    unsigned ShiftInputReg = TmpReg;
-    if (DefRegBytes != ShiftRegBytes) {
-      unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC);
-      BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg);
-      ShiftInputReg = MRI->createVirtualRegister(&ShiftRC);
-      BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg)
-          .addReg(UndefReg)
-          .addReg(TmpReg)
-          .addImm(DefSubRegImm);
-    }
-
-    // We shift this once if the shift is wider than the def and thus we can
-    // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts.
-
-    unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC);
-    auto Shift1I =
-        BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg)
-            .addReg(ShiftInputReg)
-            .addReg(StateReg);
-    (void)Shift1I;
-    ++NumInstsInserted;
-    LLVM_DEBUG(dbgs() << "  Inserting shrx: "; Shift1I->dump(); dbgs() << "\n");
-
-    // The only way we have a bit left is if all 8 bytes were defined. Do an
-    // extra shift to get the last bit in this case.
-    if (DefRegBytes == ShiftRegBytes) {
-      // We can just directly def the old def register as its the same size.
-      ShiftInputReg = ShiftedReg;
-      auto Shift2I =
-          BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg)
-              .addReg(ShiftInputReg)
-              .addReg(StateReg);
-      (void)Shift2I;
-      ++NumInstsInserted;
-      LLVM_DEBUG(dbgs() << "  Inserting shrx: "; Shift2I->dump();
-                 dbgs() << "\n");
-    } else {
-      // When we have different size shift register we need to fix up the
-      // class. We can do that as we copy into the old def register.
-      BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg)
-          .addReg(ShiftedReg, 0, DefSubRegImm);
-    }
-  }
+  unsigned StateReg = GetStateRegInRC(*DefRC);
+  unsigned NewDefReg = MRI->createVirtualRegister(DefRC);
+  DefOp.setReg(NewDefReg);
+  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg)
+                 .addReg(StateReg)
+                 .addReg(NewDefReg);
+  OrI->addRegisterDead(X86::EFLAGS, TRI);
+  ++NumInstsInserted;
+  LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
 
   if (FlagsReg)
     restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);