Merging r323155:

author Reid Kleckner <rnk@google.com>

Thu, 1 Feb 2018 21:28:26 +0000 (21:28 +0000)

committer Reid Kleckner <rnk@google.com>

Thu, 1 Feb 2018 21:28:26 +0000 (21:28 +0000)
author Reid Kleckner <rnk@google.com>
Thu, 1 Feb 2018 21:28:26 +0000 (21:28 +0000)
committer Reid Kleckner <rnk@google.com>
Thu, 1 Feb 2018 21:28:26 +0000 (21:28 +0000)
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h

index 96cfce5b84dfe613a087773a527d17ea8839ba63..7bfe30b96448a7e008aa6d3b3d27211c6a13b64a 100644 (file)
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -420,6 +420,9 @@ namespace llvm {
    /// shuffles.
    FunctionPass *createExpandReductionsPass();
  
+  /// This pass expands indirectbr instructions into switch instructions.
+  FunctionPass *createIndirectBrExpandPass();
+
  } // End llvm namespace
  
  #endif
diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h

index aaf0ab5d5481d262f6d719887b7c84fd2c1e6388..195ddfff68fcbbe7af0ea1d58c843a46f0dc05c0 100644 (file)
--- a/include/llvm/CodeGen/TargetPassConfig.h
+++ b/include/llvm/CodeGen/TargetPassConfig.h
@@ -406,6 +406,13 @@ protected:
    /// immediately before machine code is emitted.
    virtual void addPreEmitPass() { }
  
+  /// Targets may add passes immediately before machine code is emitted in this
+  /// callback. This is called even later than `addPreEmitPass`.
+  // FIXME: Rename `addPreEmitPass` to something more sensible given its actual
+  // position and remove the `2` suffix here as this callback is what
+  // `addPreEmitPass` *should* be but in reality isn't.
+  virtual void addPreEmitPass2() {}
+
    /// Utilities for targets to add passes to the pass manager.
    ///
  
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h

index 39ac4649b70d4e822967444ddd7fa0ba6b193a08..2718c528559bd7722b04158ba34fa9f2a35e42a4 100644 (file)
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -157,6 +157,7 @@ void initializeIVUsersWrapperPassPass(PassRegistry&);
  void initializeIfConverterPass(PassRegistry&);
  void initializeImplicitNullChecksPass(PassRegistry&);
  void initializeIndVarSimplifyLegacyPassPass(PassRegistry&);
+void initializeIndirectBrExpandPassPass(PassRegistry&);
  void initializeInductiveRangeCheckEliminationPass(PassRegistry&);
  void initializeInferAddressSpacesPass(PassRegistry&);
  void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&);
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h

index 23711d636c9a084e50b7a6f2a64f41e368108116..da6d1c47be193148600d2691735cb4c0d23d528a 100644 (file)
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -799,7 +799,7 @@ public:
    }
  
    /// Return true if lowering to a jump table is allowed.
-  bool areJTsAllowed(const Function *Fn) const {
+  virtual bool areJTsAllowed(const Function *Fn) const {
      if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
        return false;
  
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h

index 9440c56dcf17efcf8248cb651f5893b8348a9a25..c561a884f1a2fcfdffb60b5a2d1370ef9e8cf1aa 100644 (file)
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -172,6 +172,9 @@ public:
    /// \brief True if the subtarget should run the atomic expansion pass.
    virtual bool enableAtomicExpand() const;
  
+  /// True if the subtarget should run the indirectbr expansion pass.
+  virtual bool enableIndirectBrExpand() const;
+
    /// \brief Override generic scheduling policy within a region.
    ///
    /// This is a convenient way for targets that don't provide any custom
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt

index 7f3c6da912687422c79b03d0e0d909b94055f9eb..7c118a645d3e97fe1274ffd093fd51175907b7c0 100644 (file)
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -34,6 +34,7 @@ add_llvm_library(LLVMCodeGen
    GlobalMerge.cpp
    IfConversion.cpp
    ImplicitNullChecks.cpp
+  IndirectBrExpandPass.cpp
    InlineSpiller.cpp
    InterferenceCache.cpp
    InterleavedAccessPass.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp

index b7fd45a3f6a661dd33637038bd1b44779f57a3bc..8074f4751d6fe0394f7e0626e571671dd7544dee 100644 (file)
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -39,6 +39,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
    initializeGCModuleInfoPass(Registry);
    initializeIfConverterPass(Registry);
    initializeImplicitNullChecksPass(Registry);
+  initializeIndirectBrExpandPassPass(Registry);
    initializeInterleavedAccessPass(Registry);
    initializeLiveDebugValuesPass(Registry);
    initializeLiveDebugVariablesPass(Registry);
diff --git a/lib/CodeGen/IndirectBrExpandPass.cpp b/lib/CodeGen/IndirectBrExpandPass.cpp

new file mode 100644 (file)

index 0000000..3adcda9
--- /dev/null
+++ b/lib/CodeGen/IndirectBrExpandPass.cpp
@@ -0,0 +1,221 @@
+//===- IndirectBrExpandPass.cpp - Expand indirectbr to switch -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Implements an expansion pass to turn `indirectbr` instructions in the IR
+/// into `switch` instructions. This works by enumerating the basic blocks in
+/// a dense range of integers, replacing each `blockaddr` constant with the
+/// corresponding integer constant, and then building a switch that maps from
+/// the integers to the actual blocks. All of the indirectbr instructions in the
+/// function are redirected to this common switch.
+///
+/// While this is generically useful if a target is unable to codegen
+/// `indirectbr` natively, it is primarily useful when there is some desire to
+/// get the builtin non-jump-table lowering of a switch even when the input
+/// source contained an explicit indirect branch construct.
+///
+/// Note that it doesn't make any sense to enable this pass unless a target also
+/// disables jump-table lowering of switches. Doing that is likely to pessimize
+/// the code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indirectbr-expand"
+
+namespace {
+
+class IndirectBrExpandPass : public FunctionPass {
+  const TargetLowering *TLI = nullptr; // Set by runOnFunction, never read.
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  IndirectBrExpandPass() : FunctionPass(ID) {
+    initializeIndirectBrExpandPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char IndirectBrExpandPass::ID = 0;
+
+INITIALIZE_PASS(IndirectBrExpandPass, DEBUG_TYPE,
+                "Expand indirectbr instructions", false, false)
+
+FunctionPass *llvm::createIndirectBrExpandPass() {
+  return new IndirectBrExpandPass(); // Raw pointer is handed to the caller.
+}
+
+/// Expands all `indirectbr` instructions in \p F into one `switch` over small
+/// integer block indices and rewrites the blockaddress constants to match.
+/// \returns true if the IR of \p F was modified.
+bool IndirectBrExpandPass::runOnFunction(Function &F) {
+  auto &DL = F.getParent()->getDataLayout();
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  auto &TM = TPC->getTM<TargetMachine>();
+  auto &STI = *TM.getSubtargetImpl(F);
+  if (!STI.enableIndirectBrExpand())
+    return false;
+  TLI = STI.getTargetLowering();
+
+  // Whether the IR was modified; degenerate indirectbrs count too.
+  bool Changed = false;
+  SmallVector<IndirectBrInst *, 1> IndirectBrs;
+  // Set of all potential successors for indirectbr instructions.
+  SmallPtrSet<BasicBlock *, 4> IndirectBrSuccs;
+
+  // Build a list of indirectbrs that we want to rewrite.
+  for (BasicBlock &BB : F)
+    if (auto *IBr = dyn_cast<IndirectBrInst>(BB.getTerminator())) {
+      // No successors: the branch can never be taken, so make it unreachable.
+      if (IBr->getNumSuccessors() == 0) {
+        (void)new UnreachableInst(F.getContext(), IBr);
+        IBr->eraseFromParent();
+        Changed = true;
+        continue;
+      }
+
+      IndirectBrs.push_back(IBr);
+      for (BasicBlock *SuccBB : IBr->successors())
+        IndirectBrSuccs.insert(SuccBB);
+    }
+
+  if (IndirectBrs.empty())
+    return Changed;
+
+  // If we need to replace any indirectbrs we need to establish integer
+  // constants that will correspond to each of the basic blocks in the function
+  // whose address escapes. We do that here and rewrite all the blockaddress
+  // constants to just be those integer constants cast to a pointer type.
+  SmallVector<BasicBlock *, 4> BBs;
+
+  for (BasicBlock &BB : F) {
+    // Only successors of an indirectbr we are rewriting need an index.
+    if (!IndirectBrSuccs.count(&BB))
+      continue;
+
+    auto IsBlockAddressUse = [&](const Use &U) {
+      return isa<BlockAddress>(U.getUser());
+    };
+    auto BlockAddressUseIt = llvm::find_if(BB.uses(), IsBlockAddressUse);
+    if (BlockAddressUseIt == BB.use_end())
+      continue;
+
+    assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(),
+                        IsBlockAddressUse) == BB.use_end() &&
+           "There should only ever be a single blockaddress use because it is "
+           "a constant and should be uniqued.");
+
+    auto *BA = cast<BlockAddress>(BlockAddressUseIt->getUser());
+
+    // Skip blockaddresses that ended up unused (e.g. after DCE).
+    if (!BA->isConstantUsed())
+      continue;
+
+    // Compute the index we want to use for this basic block. We can't use zero
+    // because null can be compared with block addresses.
+    int BBIndex = BBs.size() + 1;
+    BBs.push_back(&BB);
+
+    auto *ITy = cast<IntegerType>(DL.getIntPtrType(BA->getType()));
+    ConstantInt *BBIndexC = ConstantInt::get(ITy, BBIndex);
+
+    // Now rewrite the blockaddress to an integer constant based on the index.
+    // FIXME: We could potentially preserve the uses as arguments to inline asm.
+    // This would allow some uses such as diagnostic information in crashes to
+    // have higher quality even when this transform is enabled, but would break
+    // users that round-trip blockaddresses through inline assembly and then
+    // back into an indirectbr.
+    BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(BBIndexC, BA->getType()));
+  }
+
+  if (BBs.empty()) {
+    // There are no blocks whose address is taken, so any indirectbr instruction
+    // cannot get a valid input and we can replace all of them with unreachable.
+    for (auto *IBr : IndirectBrs) {
+      (void)new UnreachableInst(F.getContext(), IBr);
+      IBr->eraseFromParent();
+    }
+    return true;
+  }
+
+  BasicBlock *SwitchBB;
+  Value *SwitchValue;
+
+  // Compute a common integer type across all the indirectbr instructions.
+  IntegerType *CommonITy = nullptr;
+  for (auto *IBr : IndirectBrs) {
+    auto *ITy =
+        cast<IntegerType>(DL.getIntPtrType(IBr->getAddress()->getType()));
+    if (!CommonITy || ITy->getBitWidth() > CommonITy->getBitWidth())
+      CommonITy = ITy;
+  }
+
+  auto GetSwitchValue = [CommonITy](IndirectBrInst *IBr) {
+    return CastInst::CreatePointerCast(
+        IBr->getAddress(), CommonITy,
+        Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr);
+  };
+
+  if (IndirectBrs.size() == 1) {
+    // A single indirectbr is replaced in place within its own block.
+    SwitchBB = IndirectBrs[0]->getParent();
+    SwitchValue = GetSwitchValue(IndirectBrs[0]);
+    IndirectBrs[0]->eraseFromParent();
+  } else {
+    // Otherwise we need to create a new block to hold the switch across BBs,
+    // jump to that block instead of each indirectbr, and phi together the
+    // values for the switch.
+    SwitchBB = BasicBlock::Create(F.getContext(), "switch_bb", &F);
+    auto *SwitchPN = PHINode::Create(CommonITy, IndirectBrs.size(),
+                                     "switch_value_phi", SwitchBB);
+    SwitchValue = SwitchPN;
+
+    // Now replace the indirectbr instructions with direct branches to the
+    // switch block and fill out the PHI operands.
+    for (auto *IBr : IndirectBrs) {
+      SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent());
+      BranchInst::Create(SwitchBB, IBr);
+      IBr->eraseFromParent();
+    }
+  }
+
+  // Build the switch in SwitchBB, which has no terminator at this point.
+  auto *SI = SwitchInst::Create(SwitchValue, BBs[0], BBs.size(), SwitchBB);
+
+  // Add a case for each block.
+  for (int i : llvm::seq<int>(1, BBs.size()))
+    SI->addCase(ConstantInt::get(CommonITy, i + 1), BBs[i]);
+
+  return true;
+}
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp

index 817e58ce59e1077212bf70d015d3432f94c9ac73..624520c610c61c10e386324b277275d8e4161e40 100644 (file)
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -790,6 +790,9 @@ void TargetPassConfig::addMachinePasses() {
    if (EnableMachineOutliner)
      PM->add(createMachineOutlinerPass());
  
+  // Add passes that directly emit MI after all other MI passes.
+  addPreEmitPass2();
+
    AddingMachinePasses = false;
  }
  
diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp

index f6d5bc80ddffb4d2396a56bbf9818fc3e7847c21..d02e39f34c5ddeffca075d17d66e435c05902149 100644 (file)
--- a/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -37,6 +37,10 @@ bool TargetSubtargetInfo::enableAtomicExpand() const {
    return true;
  }
  
+bool TargetSubtargetInfo::enableIndirectBrExpand() const {
+  return false; // Off by default; subtargets override to opt in.
+}
+
  bool TargetSubtargetInfo::enableMachineScheduler() const {
    return false;
  }
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt

index 6e08d4cff6eaf9853ebcf63bd666523a00d98688..ae58dbd4c42d82af8842e4bdb3a865150df35eb0 100644 (file)
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -57,6 +57,7 @@ set(sources
    X86OptimizeLEAs.cpp
    X86PadShortFunction.cpp
    X86RegisterInfo.cpp
+  X86RetpolineThunks.cpp
    X86SelectionDAGInfo.cpp
    X86ShuffleDecodeConstantPool.cpp
    X86Subtarget.cpp
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h

index 91201d1fec85ab37bcf5033627cd011aa010d5b1..25e4b893e6c2eb828451b00f3d44db4bb016322c 100644 (file)
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -22,6 +22,7 @@ namespace llvm {
  class FunctionPass;
  class ImmutablePass;
  class InstructionSelector;
+class ModulePass;
  class PassRegistry;
  class X86RegisterBankInfo;
  class X86Subtarget;
@@ -98,6 +99,9 @@ void initializeFixupBWInstPassPass(PassRegistry &);
  /// encoding when possible in order to reduce code size.
  FunctionPass *createX86EvexToVexInsts();
  
+/// This pass creates the thunks for the retpoline feature.
+ModulePass *createX86RetpolineThunksPass();
+
  InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                    X86Subtarget &,
                                                    X86RegisterBankInfo &);
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td

index 54eabeac5126479c7eb48cf8f4ea4902d3b8e436..62543b0ff37bf97eb57de314d6bfb1acdcee7123 100644 (file)
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -290,6 +290,27 @@ def FeatureERMSB
            "ermsb", "HasERMSB", "true",
            "REP MOVS/STOS are fast">;
  
+// Enable mitigation of some aspects of speculative execution related
+// vulnerabilities by removing speculatable indirect branches. This disables
+// jump-table formation, rewrites explicit `indirectbr` instructions into
+// `switch` instructions, and uses a special construct called a "retpoline" to
+// prevent speculation of the remaining indirect branches (indirect calls and
+// tail calls).
+def FeatureRetpoline
+    : SubtargetFeature<"retpoline", "UseRetpoline", "true",
+                       "Remove speculation of indirect branches from the "
+                       "generated code, either by avoiding them entirely or "
+                       "lowering them with a speculation blocking construct.">;
+
+// Rely on external thunks for the emitted retpoline calls. This allows users
+// to provide their own custom thunk definitions in highly specialized
+// environments such as a kernel that does boot-time hot patching.
+def FeatureRetpolineExternalThunk
+    : SubtargetFeature<
+          "retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
+          "Enable retpoline, but with an externally provided thunk.",
+          [FeatureRetpoline]>;
+
  //===----------------------------------------------------------------------===//
  // X86 processors supported.
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h

index d7c3b74d3efb2a37e839ad57ce7c14d158463b5d..3a31bfa46427336d0ec5cdf4fd7abdf43ec7ff9d 100644 (file)
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -30,6 +30,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
    StackMaps SM;
    FaultMaps FM;
    std::unique_ptr<MCCodeEmitter> CodeEmitter;
+  bool NeedsRetpoline = false;
  
    // This utility class tracks the length of a stackmap instruction's 'shadow'.
    // It is used by the X86AsmPrinter to ensure that the stackmap shadow
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp

index 527e5d568ac6f90187c7e2599f0bccf12242b889..71f30ba290bcff1795f09140e2254a661981a6ac 100644 (file)
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3161,6 +3161,10 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
        (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
      return false;
  
+  // Functions using retpoline should use SDISel for calls.
+  if (Subtarget->useRetpoline())
+    return false;
+
    // Handle only C, fastcc, and webkit_js calling conventions for now.
    switch (CC) {
    default: return false;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp

index f294e819090bc03d0faf940f2ca6f00c8592ebcc..710ffa9472282d0777a5741184eabe205d8428ce 100644 (file)
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -742,6 +742,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
                                            bool InProlog) const {
    bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
  
+  // FIXME: Add retpoline support and remove this.
+  if (Is64Bit && IsLargeCodeModel && STI.useRetpoline())
+    report_fatal_error("Emitting stack probe calls on 64-bit with the large "
+                       "code model and retpoline not yet implemented.");
+
    unsigned CallOp;
    if (Is64Bit)
      CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
@@ -2337,6 +2342,10 @@ void X86FrameLowering::adjustForSegmentedStacks(
      // This solution is not perfect, as it assumes that the .rodata section
      // is laid out within 2^31 bytes of each function body, but this seems
      // to be sufficient for JIT.
+    // FIXME: Add retpoline support and remove the error here.
+    if (STI.useRetpoline())
+      report_fatal_error("Emitting morestack calls on 64-bit with the large "
+                         "code model and retpoline not yet implemented.");
      BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
          .addReg(X86::RIP)
          .addImm(0)
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index 8f24f98be681fa4399586f103c3598af3580bf50..41d1a31cf74fb6efe4408ae0a63a3268ab1811ce 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -550,11 +550,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
      SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
  
      if (OptLevel != CodeGenOpt::None &&
-        // Only does this when target favors doesn't favor register indirect
-        // call.
+        // Only do this when the target can fold the load into the call or
+        // jmp.
+        !Subtarget->useRetpoline() &&
          ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
           (N->getOpcode() == X86ISD::TC_RETURN &&
-          // Only does this if load can be folded into TC_RETURN.
            (Subtarget->is64Bit() ||
             !getTargetMachine().isPositionIndependent())))) {
        /// Also try moving call address load from outside callseq_start to just
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 607bc4530abb2bcb6120b3f115aa8ab3685162b9..2c2294d6e032fd83c3e589afb48b15affcd30da6 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -24994,6 +24994,15 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
    return isShuffleMaskLegal(Mask, VT);
  }
  
+bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
+  // Jump tables are forbidden outright while retpolines are in use, no matter
+  // what the function itself requests.
+  const bool RetpolineActive = Subtarget.useRetpoline();
+
+  // The generic logic is only consulted when retpolines are not in use.
+  return !RetpolineActive && TargetLowering::areJTsAllowed(Fn);
+}
+
  //===----------------------------------------------------------------------===//
  //                           X86 Scheduler Hooks
  //===----------------------------------------------------------------------===//
@@ -26225,6 +26234,115 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
    return BB;
  }
  
+/// Maps a retpoline pseudo opcode to the direct call or tail-call opcode that
+/// is used to transfer control to the retpoline thunk.
+static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
+  // clang-format off
+  switch (RPOpc) {
+  case X86::RETPOLINE_CALL32:     return X86::CALLpcrel32;
+  case X86::RETPOLINE_CALL64:     return X86::CALL64pcrel32;
+  case X86::RETPOLINE_TCRETURN32: return X86::TCRETURNdi;
+  case X86::RETPOLINE_TCRETURN64: return X86::TCRETURNdi64;
+  }
+  // clang-format on
+  llvm_unreachable("not retpoline opcode");
+}
+
+/// Returns the name of the retpoline thunk that expects the callee in \p Reg.
+/// A \p Reg of 0 selects the 32-bit-only thunk taking the callee on the stack.
+static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
+                                      unsigned Reg) {
+  // Externally provided thunks use a distinct "__llvm_external_" prefix.
+  const bool External = Subtarget.useRetpolineExternalThunk();
+  switch (Reg) {
+  case 0:
+    // No scratch register was free; only legal outside of 64-bit mode.
+    assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
+    return External ? "__llvm_external_retpoline_push"
+                    : "__llvm_retpoline_push";
+  case X86::EAX:
+    return External ? "__llvm_external_retpoline_eax"
+                    : "__llvm_retpoline_eax";
+  case X86::ECX:
+    return External ? "__llvm_external_retpoline_ecx"
+                    : "__llvm_retpoline_ecx";
+  case X86::EDX:
+    return External ? "__llvm_external_retpoline_edx"
+                    : "__llvm_retpoline_edx";
+  case X86::R11:
+    return External ? "__llvm_external_retpoline_r11"
+                    : "__llvm_retpoline_r11";
+  }
+  llvm_unreachable("unexpected reg for retpoline");
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
+                                        MachineBasicBlock *BB) const {
+  // Copy the callee from its virtual register into a scratch physical register
+  // (or push it on 32-bit when none is free) and call the retpoline thunk.
+  DebugLoc DL = MI.getDebugLoc();
+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
+  unsigned CalleeVReg = MI.getOperand(0).getReg();
+  unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
+
+  // Find an available scratch register to hold the callee. On 64-bit, we can
+  // just use R11, but we scan for uses anyway to ensure we don't generate
+  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
+  // already a register use operand to the call to hold the callee. If none
+  // are available, push the callee instead. This is less efficient, but is
+  // necessary for functions using 3 regparms. Such function calls are
+  // (currently) not eligible for tail call optimization, because there is no
+  // scratch register available to hold the address of the callee.
+  SmallVector<unsigned, 3> AvailableRegs;
+  if (Subtarget.is64Bit())
+    AvailableRegs.push_back(X86::R11);
+  else
+    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
+
+  // Zero out any registers that are already used.
+  for (const auto &MO : MI.operands()) {
+    if (MO.isReg() && MO.isUse())
+      for (unsigned &Reg : AvailableRegs)
+        if (Reg == MO.getReg())
+          Reg = 0;
+  }
+
+  // Choose the first remaining non-zero available register.
+  unsigned AvailableReg = 0;
+  for (unsigned MaybeReg : AvailableRegs) {
+    if (MaybeReg) {
+      AvailableReg = MaybeReg;
+      break;
+    }
+  }
+
+  const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
+
+  if (AvailableReg == 0) {
+    // No scratch register: push the callee. Only valid for a 32-bit non-tail
+    // call. NOTE(review): on x64, assert builds fail in the lookup above.
+    if (Subtarget.is64Bit())
+      report_fatal_error(
+          "Cannot make an indirect call on x86-64 using both retpoline and a "
+          "calling convention that preservers r11");
+    if (Opc != X86::CALLpcrel32)
+      report_fatal_error("Cannot make an indirect tail call on x86 using "
+                         "retpoline without a preserved register");
+    BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
+    MI.getOperand(0).ChangeToES(Symbol);
+    MI.setDesc(TII->get(Opc));
+  } else {
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
+        .addReg(CalleeVReg);
+    MI.getOperand(0).ChangeToES(Symbol);
+    MI.setDesc(TII->get(Opc));
+    MachineInstrBuilder(*BB->getParent(), &MI)
+        .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
+  }
+  return BB;
+}
+
  MachineBasicBlock *
  X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                      MachineBasicBlock *MBB) const {
@@ -26689,6 +26807,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
    case X86::TLS_base_addr32:
    case X86::TLS_base_addr64:
      return EmitLoweredTLSAddr(MI, BB);
+  case X86::RETPOLINE_CALL32:
+  case X86::RETPOLINE_CALL64:
+  case X86::RETPOLINE_TCRETURN32:
+  case X86::RETPOLINE_TCRETURN64:
+    return EmitLoweredRetpoline(MI, BB);
    case X86::CATCHRET:
      return EmitLoweredCatchRet(MI, BB);
    case X86::CATCHPAD:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index dbbc2bbba6a4ab86cb404108da072d7777d62a54..7eeb153502afc1ac26d7afc26dca4913f3ff0a09 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -986,6 +986,9 @@ namespace llvm {
      bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                  EVT VT) const override;
  
+    /// Returns true if lowering to a jump table is allowed.
+    bool areJTsAllowed(const Function *Fn) const override;
+
      /// If true, then instruction selection should
      /// seek to shrink the FP constant of the specified type to a smaller type
      /// in order to save space and / or reduce runtime.
@@ -1289,6 +1292,9 @@ namespace llvm {
      MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;
  
+    MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
+                                            MachineBasicBlock *BB) const;
+
      MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
                                          MachineBasicBlock *MBB) const;
  
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td

index d003d027ddb9f5abc36058bc379f18e472808240..296ea6943abc0cccb1bae317cdb41cb7bbd9f0ea 100644 (file)
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1106,14 +1106,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
  
  def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
            (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
-          Requires<[Not64BitMode]>;
+          Requires<[Not64BitMode, NotUseRetpoline]>;
  
  // FIXME: This is disabled for 32-bit PIC mode because the global base
  // register which is part of the address mode may be assigned a
  // callee-saved register.
  def : Pat<(X86tcret (load addr:$dst), imm:$off),
            (TCRETURNmi addr:$dst, imm:$off)>,
-          Requires<[Not64BitMode, IsNotPIC]>;
+          Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>;
  
  def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
            (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
@@ -1125,13 +1125,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
  
  def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
            (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
-          Requires<[In64BitMode]>;
+          Requires<[In64BitMode, NotUseRetpoline]>;
  
  // Don't fold loads into X86tcret requiring more than 6 regs.
  // There wouldn't be enough scratch registers for base+index.
  def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
            (TCRETURNmi64 addr:$dst, imm:$off)>,
-          Requires<[In64BitMode]>;
+          Requires<[In64BitMode, NotUseRetpoline]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[In64BitMode, UseRetpoline]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[Not64BitMode, UseRetpoline]>;
  
  def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
            (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td

index 4ea223e82be9cc683acdc7446138aca6c0bbca4b..f13936495f6af1954ff33c10389e93bfcfa6490b 100644 (file)
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -211,11 +211,12 @@ let isCall = 1 in
                        Sched<[WriteJumpLd]>;
      def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
                          "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
-                      OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+                      OpSize32, Requires<[Not64BitMode,NotUseRetpoline]>,
+                      Sched<[WriteJump]>;
      def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
                          "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
                          IIC_CALL_MEM>, OpSize32,
-                      Requires<[Not64BitMode,FavorMemIndirectCall]>,
+                      Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>,
                        Sched<[WriteJumpLd]>;
  
      let Predicates = [Not64BitMode] in {
@@ -298,11 +299,12 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
    def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
                          "call{q}\t{*}$dst", [(X86call GR64:$dst)],
                          IIC_CALL_RI>,
-                      Requires<[In64BitMode]>;
+                      Requires<[In64BitMode,NotUseRetpoline]>;
    def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
                          "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
                          IIC_CALL_MEM>,
-                      Requires<[In64BitMode,FavorMemIndirectCall]>;
+                      Requires<[In64BitMode,FavorMemIndirectCall,
+                                NotUseRetpoline]>;
  
    def FARCALL64   : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
                         "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
@@ -341,6 +343,27 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
    }
  }
  
+let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
+    Uses = [RSP],
+    usesCustomInserter = 1,
+    SchedRW = [WriteJump] in {
+  def RETPOLINE_CALL32 :
+    PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
+            Requires<[Not64BitMode,UseRetpoline]>;
+
+  def RETPOLINE_CALL64 :
+    PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
+            Requires<[In64BitMode,UseRetpoline]>;
+
+  // Retpoline variant of indirect tail calls.
+  let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+    def RETPOLINE_TCRETURN64 :
+      PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
+    def RETPOLINE_TCRETURN32 :
+      PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
+  }
+}
+
  // Conditional tail calls are similar to the above, but they are branches
  // rather than barriers, and they use EFLAGS.
  let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td

index fab70e918b8adbb93208bc9379b70adb84b05959..0ba2d3abb039f2e7f45d202b3d8e7e6ded45d3ec 100644 (file)
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -917,6 +917,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
  def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
  def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
  def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+def UseRetpoline : Predicate<"Subtarget->useRetpoline()">;
+def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">;
  
  //===----------------------------------------------------------------------===//
  // X86 Instruction Format Definitions.
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp

index fd2837b79103eecec12bf0ec08f58cd057a85127..a1dc5f9ffefcf76078715ed445f3af4c39066de6 100644 (file)
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -874,6 +874,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
        // address is to far away. (TODO: support non-relative addressing)
        break;
      case MachineOperand::MO_Register:
+      // FIXME: Add retpoline support and remove this.
+      if (Subtarget->useRetpoline())
+        report_fatal_error("Lowering register statepoints with retpoline not "
+                           "yet implemented.");
        CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
        CallOpcode = X86::CALL64r;
        break;
@@ -1028,6 +1032,10 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
  
      EmitAndCountInstruction(
          MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
+    // FIXME: Add retpoline support and remove this.
+    if (Subtarget->useRetpoline())
+      report_fatal_error(
+          "Lowering patchpoint with retpoline not yet implemented.");
      EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
    }
  
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp

new file mode 100644 (file)

index 0000000..6b4bc8a
--- /dev/null
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -0,0 +1,276 @@
+//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86  --=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that injects an MI thunk implementing a "retpoline". This is
+/// a RET-implemented trampoline that is used to lower indirect calls in a way
+/// that prevents speculation on some x86 processors and can be used to mitigate
+/// security vulnerabilities due to targeted speculative execution and side
+/// channels such as CVE-2017-5715.
+///
+/// TODO(chandlerc): All of this code could use better comments and
+/// documentation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-retpoline-thunks"
+
+namespace {
+class X86RetpolineThunks : public ModulePass {
+public:
+  static char ID;
+
+  X86RetpolineThunks() : ModulePass(ID) {}
+
+  StringRef getPassName() const override { return "X86 Retpoline Thunks"; }
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfo>();
+    AU.addPreserved<MachineModuleInfo>();
+  }
+
+private:
+  MachineModuleInfo *MMI;  // Assigned at the start of runOnModule().
+  const TargetMachine *TM; // Obtained from the TargetPassConfig.
+  bool Is64Bit;            // True iff the target triple arch is x86_64.
+  const X86Subtarget *STI; // Last subtarget inspected by runOnModule().
+  const X86InstrInfo *TII; // STI's instruction info.
+
+  Function *createThunkFunction(Module &M, StringRef Name);
+  void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
+  void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
+  void createThunk(Module &M, StringRef NameSuffix,
+                   Optional<unsigned> Reg = None); // None: push-based thunk.
+};
+
+} // end anonymous namespace
+
+ModulePass *llvm::createX86RetpolineThunksPass() {
+  return new X86RetpolineThunks();
+}
+
+char X86RetpolineThunks::ID = 0;
+
+bool X86RetpolineThunks::runOnModule(Module &M) {
+  DEBUG(dbgs() << getPassName() << '\n');
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  assert(TPC && "X86-specific target pass should not be run without a target "
+                "pass config!");
+
+  MMI = &getAnalysis<MachineModuleInfo>();
+  TM = &TPC->getTM<TargetMachine>();
+  Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
+
+  // Only add a thunk if we have at least one function that has the retpoline
+  // feature enabled in its subtarget.
+  // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
+  // nothing will end up calling it.
+  // FIXME: It's a little silly to look at every function just to enumerate
+  // the subtargets, but eventually we'll want to look at them for indirect
+  // calls, so maybe this is OK.
+  if (!llvm::any_of(M, [&](const Function &F) {
+        // Save the subtarget we find for use in emitting the subsequent
+        // thunk.
+        STI = &TM->getSubtarget<X86Subtarget>(F);
+        return STI->useRetpoline() && !STI->useRetpolineExternalThunk();
+      }))
+    return false;
+
+  // If we have a relevant subtarget, get the instr info as well.
+  TII = STI->getInstrInfo();
+
+  if (Is64Bit) {
+    // __llvm_retpoline_r11:
+    //   callq .Lr11_call_target
+    // .Lr11_capture_spec:
+    //   pause
+    //   lfence
+    //   jmp .Lr11_capture_spec
+    // .align 16
+    // .Lr11_call_target:
+    //   movq %r11, (%rsp)
+    //   retq
+
+    createThunk(M, "r11", X86::R11);
+  } else {
+    // For 32-bit targets we emit one thunk per possible scratch register, plus
+    // a fallback that is used when there are no scratch registers and assumes
+    // the retpoline target has been pushed.
+    //   __llvm_retpoline_eax:
+    //         calll .Leax_call_target
+    //   .Leax_capture_spec:
+    //         pause
+    //         lfence
+    //         jmp .Leax_capture_spec
+    //   .align 16
+    //   .Leax_call_target:
+    //         movl %eax, (%esp)  # Clobber return addr
+    //         retl
+    //
+    //   __llvm_retpoline_ecx:
+    //   ... # Same setup
+    //         movl %ecx, (%esp)
+    //         retl
+    //
+    //   __llvm_retpoline_edx:
+    //   ... # Same setup
+    //         movl %edx, (%esp)
+    //         retl
+    //
+    // This last one is a bit more special and so needs a little extra
+    // handling.
+    // __llvm_retpoline_push:
+    //         calll .Lpush_call_target
+    // .Lpush_capture_spec:
+    //         pause
+    //         lfence
+    //         jmp .Lpush_capture_spec
+    // .align 16
+    // .Lpush_call_target:
+    //         # Clear capture_spec return address.
+    //         addl $4, %esp
+    //         # Top of stack words are: Callee, RA. Exchange Callee and RA.
+    //         pushl 4(%esp)  # Push callee
+    //         pushl 4(%esp)  # Push RA
+    //         popl 8(%esp)   # Pop RA to final RA
+    //         popl (%esp)    # Pop callee to next top of stack
+    //         retl           # Ret to callee
+    createThunk(M, "eax", X86::EAX);
+    createThunk(M, "ecx", X86::ECX);
+    createThunk(M, "edx", X86::EDX);
+    createThunk(M, "push");
+  }
+
+  return true;
+}
+
+Function *X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
+  LLVMContext &Ctx = M.getContext();
+  auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
+  Function *F =
+      Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
+  F->setVisibility(GlobalValue::HiddenVisibility);
+  F->setComdat(M.getOrInsertComdat(Name));
+
+  // Add attributes so that we don't create a frame (naked) or unwind
+  // information (nounwind). NOTE(review): nothing here forbids inlining.
+  AttrBuilder B;
+  B.addAttribute(llvm::Attribute::NoUnwind);
+  B.addAttribute(llvm::Attribute::Naked);
+  F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+
+  // Give the function a trivial `entry: ret void` body so that it verifies.
+  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
+  IRBuilder<> Builder(Entry);
+
+  Builder.CreateRetVoid();
+  return F;
+}
+
+void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
+                                                    unsigned Reg) {
+  const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
+  const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP;
+  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0)
+      .addReg(Reg); // Stores Reg to (SP), overwriting the return address.
+}
+void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
+    MachineBasicBlock &MBB) {
+  // The instruction sequence we use to replace the return address without
+  // a scratch register is somewhat complicated:
+  //   # Clear capture_spec from return address.
+  //   addl $4, %esp
+  //   # Top of stack words are: Callee, RA. Exchange Callee and RA.
+  //   pushl 4(%esp)  # Push callee
+  //   pushl 4(%esp)  # Push RA
+  //   popl 8(%esp)   # Pop RA to final RA
+  //   popl (%esp)    # Pop callee to next top of stack
+  //   retl           # Ret to callee (emitted by createThunk, not here)
+  BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
+      .addReg(X86::ESP)
+      .addImm(4);
+  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
+               false, 4);
+  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
+               false, 4);
+  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
+               false, 8);
+  addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
+               false, 0);
+}
+
+void X86RetpolineThunks::createThunk(Module &M, StringRef NameSuffix,
+                                     Optional<unsigned> Reg) {
+  Function &F =
+      *createThunkFunction(M, (Twine("__llvm_retpoline_") + NameSuffix).str());
+  MachineFunction &MF = MMI->getOrCreateMachineFunction(F);
+
+  // The thunk only ever uses physical registers, so mark the MF NoVRegs.
+  MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+
+  BasicBlock &OrigEntryBB = F.getEntryBlock();
+  MachineBasicBlock *Entry = MF.CreateMachineBasicBlock(&OrigEntryBB);
+  MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(&OrigEntryBB);
+  MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(&OrigEntryBB);
+
+  MF.push_back(Entry);
+  MF.push_back(CaptureSpec);
+  MF.push_back(CallTarget);
+
+  const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+  const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
+
+  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget);
+  Entry->addSuccessor(CallTarget);
+  Entry->addSuccessor(CaptureSpec); // Where the call's return address points.
+  CallTarget->setHasAddressTaken();
+
+  // In the capture loop for speculation, we want to stop the processor from
+  // speculating as fast as possible. On Intel processors, the PAUSE instruction
+  // will block speculation without consuming any execution resources. On AMD
+  // processors, the PAUSE instruction is (essentially) a nop, so we also use an
+  // LFENCE instruction which they have advised will stop speculation as well
+  // with minimal resource utilization. We still end the capture with a jump to
+  // form an infinite loop to fully guarantee that no matter what implementation
+  // of the x86 ISA, speculating this code path never escapes.
+  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
+  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
+  BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
+  CaptureSpec->setHasAddressTaken();
+  CaptureSpec->addSuccessor(CaptureSpec);
+
+  CallTarget->setAlignment(4); // NOTE(review): presumably log2, i.e. 16 bytes.
+  if (Reg) {
+    insertRegReturnAddrClobber(*CallTarget, *Reg);
+  } else {
+    assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
+    insert32BitPushReturnAddrClobber(*CallTarget);
+  }
+  BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
+}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp

index 24845beac22d1eb8be4262296fb5fb1628f4c521..0180090afcb5055f224246b1e6fb6b8e7c14f54c 100644 (file)
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -315,6 +315,8 @@ void X86Subtarget::initializeEnvironment() {
    HasCLFLUSHOPT = false;
    HasCLWB = false;
    IsBTMemSlow = false;
+  UseRetpoline = false;
+  UseRetpolineExternalThunk = false;
    IsPMULLDSlow = false;
    IsSHLDSlow = false;
    IsUAMem16Slow = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h

index 427a0001bef98a36a27241730616cc0e95888b9e..614f833be1be9f0cf6a10d40b629d525eee705b9 100644 (file)
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -297,6 +297,14 @@ protected:
    /// Processor supports Cache Line Write Back instruction
    bool HasCLWB;
  
+  /// Use a retpoline thunk rather than indirect calls to block speculative
+  /// execution.
+  bool UseRetpoline;
+
+  /// When using a retpoline thunk, call an externally provided thunk rather
+  /// than emitting one inside the compiler.
+  bool UseRetpolineExternalThunk;
+
    /// Use software floating point for code generation.
    bool UseSoftFloat;
  
@@ -506,6 +514,8 @@ public:
    bool hasPKU() const { return HasPKU; }
    bool hasMPX() const { return HasMPX; }
    bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
+  bool useRetpoline() const { return UseRetpoline; }
+  bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
  
    bool isXRaySupported() const override { return is64Bit(); }
  
@@ -639,6 +649,10 @@ public:
    /// compiler runtime or math libraries.
    bool hasSinCos() const;
  
+  /// If we are using retpolines, we need to expand indirectbr to avoid it
+  /// lowering to an actual indirect jump.
+  bool enableIndirectBrExpand() const override { return useRetpoline(); }
+
    /// Enable the MachineScheduler pass for all X86 subtargets.
    bool enableMachineScheduler() const override { return true; }
  
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp

index 08c2cdaefe71d30c5888e4953935ce9891e468dd..939e44749aeb059f0883f8e078d98e961d90571c 100644 (file)
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -305,6 +305,7 @@ public:
    void addPreRegAlloc() override;
    void addPostRegAlloc() override;
    void addPreEmitPass() override;
+  void addPreEmitPass2() override;
    void addPreSched2() override;
  };
  
@@ -334,6 +335,11 @@ void X86PassConfig::addIRPasses() {
  
    if (TM->getOptLevel() != CodeGenOpt::None)
      addPass(createInterleavedAccessPass());
+
+  // Expand indirectbr instructions so that they never lower to an indirect
+  // jump. This is a no-op unless a function's subtarget has the retpoline
+  // feature enabled (the thunks themselves are added in addPreEmitPass2).
+  addPass(createIndirectBrExpandPass());
  }
  
  bool X86PassConfig::addInstSelector() {
@@ -418,3 +424,7 @@ void X86PassConfig::addPreEmitPass() {
      addPass(createX86EvexToVexInsts());
    }
  }
+
+void X86PassConfig::addPreEmitPass2() {
+  addPass(createX86RetpolineThunksPass());
+}
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll

index 5e375cc42e018753b2a8bca9f8ab478de3e02216..f9bd66f850450c439c88199e8aa22ef61b9e2116 100644 (file)
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -25,6 +25,7 @@
  ; CHECK-NEXT:       Inserts calls to mcount-like functions
  ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
  ; CHECK-NEXT:       Expand reduction intrinsics
+; CHECK-NEXT:       Expand indirectbr instructions
  ; CHECK-NEXT:     Rewrite Symbols
  ; CHECK-NEXT:     FunctionPass Manager
  ; CHECK-NEXT:       Dominator Tree Construction
@@ -55,6 +56,8 @@
  ; CHECK-NEXT:       Machine Natural Loop Construction
  ; CHECK-NEXT:       Insert XRay ops
  ; CHECK-NEXT:       Implement the 'patchable-function' attribute
+; CHECK-NEXT:     X86 Retpoline Thunks
+; CHECK-NEXT:     FunctionPass Manager
  ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
  ; CHECK-NEXT:       Machine Optimization Remark Emitter
  ; CHECK-NEXT:       MachineDominator Tree Construction
diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll

new file mode 100644 (file)

index 0000000..66d32ba
--- /dev/null
+++ b/test/CodeGen/X86/retpoline-external.ll
@@ -0,0 +1,166 @@
+; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
+; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
+
+; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
+; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
+
+declare void @bar(i32)
+
+; Test a simple indirect call and tail call.
+define void @icall_reg(void (i32)* %fp, i32 %x) #0 {
+entry:
+  tail call void @bar(i32 %x)
+  tail call void %fp(i32 %x)
+  tail call void @bar(i32 %x)
+  tail call void %fp(i32 %x)
+  ret void
+}
+
+; X64-LABEL: icall_reg:
+; X64-DAG:   movq %rdi, %[[fp:[^ ]*]]
+; X64-DAG:   movl %esi, %[[x:[^ ]*]]
+; X64:       movl %[[x]], %edi
+; X64:       callq bar
+; X64-DAG:   movl %[[x]], %edi
+; X64-DAG:   movq %[[fp]], %r11
+; X64:       callq __llvm_external_retpoline_r11
+; X64:       movl %[[x]], %edi
+; X64:       callq bar
+; X64-DAG:   movl %[[x]], %edi
+; X64-DAG:   movq %[[fp]], %r11
+; X64:       jmp __llvm_external_retpoline_r11 # TAILCALL
+
+; X64FAST-LABEL: icall_reg:
+; X64FAST:       callq bar
+; X64FAST:       callq __llvm_external_retpoline_r11
+; X64FAST:       callq bar
+; X64FAST:       jmp __llvm_external_retpoline_r11 # TAILCALL
+
+; X86-LABEL: icall_reg:
+; X86-DAG:   movl 12(%esp), %[[fp:[^ ]*]]
+; X86-DAG:   movl 16(%esp), %[[x:[^ ]*]]
+; X86:       pushl %[[x]]
+; X86:       calll bar
+; X86:       movl %[[fp]], %eax
+; X86:       pushl %[[x]]
+; X86:       calll __llvm_external_retpoline_eax
+; X86:       pushl %[[x]]
+; X86:       calll bar
+; X86:       movl %[[fp]], %eax
+; X86:       pushl %[[x]]
+; X86:       calll __llvm_external_retpoline_eax
+; X86-NOT:   # TAILCALL
+
+; X86FAST-LABEL: icall_reg:
+; X86FAST:       calll bar
+; X86FAST:       calll __llvm_external_retpoline_eax
+; X86FAST:       calll bar
+; X86FAST:       calll __llvm_external_retpoline_eax
+
+
+@global_fp = external global void (i32)*
+
+; Test an indirect call through a global variable.
+define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
+  %fp1 = load void (i32)*, void (i32)** @global_fp
+  call void %fp1(i32 %x)
+  %fp2 = load void (i32)*, void (i32)** @global_fp
+  tail call void %fp2(i32 %x)
+  ret void
+}
+
+; X64-LABEL: icall_global_fp:
+; X64-DAG:   movl %edi, %[[x:[^ ]*]]
+; X64-DAG:   movq global_fp(%rip), %r11
+; X64:       callq __llvm_external_retpoline_r11
+; X64-DAG:   movl %[[x]], %edi
+; X64-DAG:   movq global_fp(%rip), %r11
+; X64:       jmp __llvm_external_retpoline_r11 # TAILCALL
+
+; X64FAST-LABEL: icall_global_fp:
+; X64FAST:       movq global_fp(%rip), %r11
+; X64FAST:       callq __llvm_external_retpoline_r11
+; X64FAST:       movq global_fp(%rip), %r11
+; X64FAST:       jmp __llvm_external_retpoline_r11 # TAILCALL
+
+; X86-LABEL: icall_global_fp:
+; X86:       movl global_fp, %eax
+; X86:       pushl 4(%esp)
+; X86:       calll __llvm_external_retpoline_eax
+; X86:       addl $4, %esp
+; X86:       movl global_fp, %eax
+; X86:       jmp __llvm_external_retpoline_eax # TAILCALL
+
+; X86FAST-LABEL: icall_global_fp:
+; X86FAST:       calll __llvm_external_retpoline_eax
+; X86FAST:       jmp __llvm_external_retpoline_eax # TAILCALL
+
+
+%struct.Foo = type { void (%struct.Foo*)** }
+
+; Test an indirect call through a vtable.
+define void @vcall(%struct.Foo* %obj) #0 {
+  %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0
+  %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field
+  %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1
+  %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot
+  tail call void %fp(%struct.Foo* %obj)
+  tail call void %fp(%struct.Foo* %obj)
+  ret void
+}
+
+; X64-LABEL: vcall:
+; X64:       movq %rdi, %[[obj:[^ ]*]]
+; X64:       movq (%[[obj]]), %[[vptr:[^ ]*]]
+; X64:       movq 8(%[[vptr]]), %[[fp:[^ ]*]]
+; X64:       movq %[[fp]], %r11
+; X64:       callq __llvm_external_retpoline_r11
+; X64-DAG:   movq %[[obj]], %rdi
+; X64-DAG:   movq %[[fp]], %r11
+; X64:       jmp __llvm_external_retpoline_r11 # TAILCALL
+
+; X64FAST-LABEL: vcall:
+; X64FAST:       callq __llvm_external_retpoline_r11
+; X64FAST:       jmp __llvm_external_retpoline_r11 # TAILCALL
+
+; X86-LABEL: vcall:
+; X86:       movl 8(%esp), %[[obj:[^ ]*]]
+; X86:       movl (%[[obj]]), %[[vptr:[^ ]*]]
+; X86:       movl 4(%[[vptr]]), %[[fp:[^ ]*]]
+; X86:       movl %[[fp]], %eax
+; X86:       pushl %[[obj]]
+; X86:       calll __llvm_external_retpoline_eax
+; X86:       addl $4, %esp
+; X86:       movl %[[fp]], %eax
+; X86:       jmp __llvm_external_retpoline_eax # TAILCALL
+
+; X86FAST-LABEL: vcall:
+; X86FAST:       calll __llvm_external_retpoline_eax
+; X86FAST:       jmp __llvm_external_retpoline_eax # TAILCALL
+
+
+declare void @direct_callee()
+
+define void @direct_tail() #0 {
+  tail call void @direct_callee()
+  ret void
+}
+
+; X64-LABEL: direct_tail:
+; X64:       jmp direct_callee # TAILCALL
+; X64FAST-LABEL: direct_tail:
+; X64FAST:   jmp direct_callee # TAILCALL
+; X86-LABEL: direct_tail:
+; X86:       jmp direct_callee # TAILCALL
+; X86FAST-LABEL: direct_tail:
+; X86FAST:   jmp direct_callee # TAILCALL
+
+
+; Lastly check that no thunks were emitted.
+; X64-NOT: __{{.*}}_retpoline_{{.*}}:
+; X64FAST-NOT: __{{.*}}_retpoline_{{.*}}:
+; X86-NOT: __{{.*}}_retpoline_{{.*}}:
+; X86FAST-NOT: __{{.*}}_retpoline_{{.*}}:
+
+
+attributes #0 = { "target-features"="+retpoline-external-thunk" }
diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll

new file mode 100644 (file)

index 0000000..b0d4c85
--- /dev/null
+++ b/test/CodeGen/X86/retpoline.ll
@@ -0,0 +1,363 @@
+; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
+; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
+
+; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
+; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
+
+declare void @bar(i32)
+
+; Test a simple indirect call and tail call.
+define void @icall_reg(void (i32)* %fp, i32 %x) #0 {
+entry:
+  tail call void @bar(i32 %x)
+  tail call void %fp(i32 %x)
+  tail call void @bar(i32 %x)
+  tail call void %fp(i32 %x)
+  ret void
+}
+
+; X64-LABEL: icall_reg:
+; X64-DAG:   movq %rdi, %[[fp:[^ ]*]]
+; X64-DAG:   movl %esi, %[[x:[^ ]*]]
+; X64:       movl %[[x]], %edi
+; X64:       callq bar
+; X64-DAG:   movl %[[x]], %edi
+; X64-DAG:   movq %[[fp]], %r11
+; X64:       callq __llvm_retpoline_r11
+; X64:       movl %[[x]], %edi
+; X64:       callq bar
+; X64-DAG:   movl %[[x]], %edi
+; X64-DAG:   movq %[[fp]], %r11
+; X64:       jmp __llvm_retpoline_r11 # TAILCALL
+
+; X64FAST-LABEL: icall_reg:
+; X64FAST:       callq bar
+; X64FAST:       callq __llvm_retpoline_r11
+; X64FAST:       callq bar
+; X64FAST:       jmp __llvm_retpoline_r11 # TAILCALL
+
+; X86-LABEL: icall_reg:
+; X86-DAG:   movl 12(%esp), %[[fp:[^ ]*]]
+; X86-DAG:   movl 16(%esp), %[[x:[^ ]*]]
+; X86:       pushl %[[x]]
+; X86:       calll bar
+; X86:       movl %[[fp]], %eax
+; X86:       pushl %[[x]]
+; X86:       calll __llvm_retpoline_eax
+; X86:       pushl %[[x]]
+; X86:       calll bar
+; X86:       movl %[[fp]], %eax
+; X86:       pushl %[[x]]
+; X86:       calll __llvm_retpoline_eax
+; X86-NOT:   # TAILCALL
+
+; X86FAST-LABEL: icall_reg:
+; X86FAST:       calll bar
+; X86FAST:       calll __llvm_retpoline_eax
+; X86FAST:       calll bar
+; X86FAST:       calll __llvm_retpoline_eax
+
+
+@global_fp = external global void (i32)*
+
+; Test an indirect call through a global variable.
+define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
+  %fp1 = load void (i32)*, void (i32)** @global_fp
+  call void %fp1(i32 %x)
+  %fp2 = load void (i32)*, void (i32)** @global_fp
+  tail call void %fp2(i32 %x)
+  ret void
+}
+
+; X64-LABEL: icall_global_fp:
+; X64-DAG:   movl %edi, %[[x:[^ ]*]]
+; X64-DAG:   movq global_fp(%rip), %r11
+; X64:       callq __llvm_retpoline_r11
+; X64-DAG:   movl %[[x]], %edi
+; X64-DAG:   movq global_fp(%rip), %r11
+; X64:       jmp __llvm_retpoline_r11 # TAILCALL
+
+; X64FAST-LABEL: icall_global_fp:
+; X64FAST:       movq global_fp(%rip), %r11
+; X64FAST:       callq __llvm_retpoline_r11
+; X64FAST:       movq global_fp(%rip), %r11
+; X64FAST:       jmp __llvm_retpoline_r11 # TAILCALL
+
+; X86-LABEL: icall_global_fp:
+; X86:       movl global_fp, %eax
+; X86:       pushl 4(%esp)
+; X86:       calll __llvm_retpoline_eax
+; X86:       addl $4, %esp
+; X86:       movl global_fp, %eax
+; X86:       jmp __llvm_retpoline_eax # TAILCALL
+
+; X86FAST-LABEL: icall_global_fp:
+; X86FAST:       calll __llvm_retpoline_eax
+; X86FAST:       jmp __llvm_retpoline_eax # TAILCALL
+
+
+%struct.Foo = type { void (%struct.Foo*)** }
+
+; Test an indirect call through a vtable.
+define void @vcall(%struct.Foo* %obj) #0 {
+  %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0
+  %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field
+  %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1
+  %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot
+  tail call void %fp(%struct.Foo* %obj)
+  tail call void %fp(%struct.Foo* %obj)
+  ret void
+}
+
+; X64-LABEL: vcall:
+; X64:       movq %rdi, %[[obj:[^ ]*]]
+; X64:       movq (%[[obj]]), %[[vptr:[^ ]*]]
+; X64:       movq 8(%[[vptr]]), %[[fp:[^ ]*]]
+; X64:       movq %[[fp]], %r11
+; X64:       callq __llvm_retpoline_r11
+; X64-DAG:   movq %[[obj]], %rdi
+; X64-DAG:   movq %[[fp]], %r11
+; X64:       jmp __llvm_retpoline_r11 # TAILCALL
+
+; X64FAST-LABEL: vcall:
+; X64FAST:       callq __llvm_retpoline_r11
+; X64FAST:       jmp __llvm_retpoline_r11 # TAILCALL
+
+; X86-LABEL: vcall:
+; X86:       movl 8(%esp), %[[obj:[^ ]*]]
+; X86:       movl (%[[obj]]), %[[vptr:[^ ]*]]
+; X86:       movl 4(%[[vptr]]), %[[fp:[^ ]*]]
+; X86:       movl %[[fp]], %eax
+; X86:       pushl %[[obj]]
+; X86:       calll __llvm_retpoline_eax
+; X86:       addl $4, %esp
+; X86:       movl %[[fp]], %eax
+; X86:       jmp __llvm_retpoline_eax # TAILCALL
+
+; X86FAST-LABEL: vcall:
+; X86FAST:       calll __llvm_retpoline_eax
+; X86FAST:       jmp __llvm_retpoline_eax # TAILCALL
+
+
+declare void @direct_callee()
+
+define void @direct_tail() #0 {
+  tail call void @direct_callee()
+  ret void
+}
+
+; X64-LABEL: direct_tail:
+; X64:       jmp direct_callee # TAILCALL
+; X64FAST-LABEL: direct_tail:
+; X64FAST:   jmp direct_callee # TAILCALL
+; X86-LABEL: direct_tail:
+; X86:       jmp direct_callee # TAILCALL
+; X86FAST-LABEL: direct_tail:
+; X86FAST:   jmp direct_callee # TAILCALL
+
+
+declare void @nonlazybind_callee() #1
+
+define void @nonlazybind_caller() #0 {
+  call void @nonlazybind_callee()
+  tail call void @nonlazybind_callee()
+  ret void
+}
+
+; nonlazybind wasn't implemented in LLVM 5.0, so this looks the same as direct.
+; X64-LABEL: nonlazybind_caller:
+; X64:       callq nonlazybind_callee
+; X64:       jmp nonlazybind_callee # TAILCALL
+; X64FAST-LABEL: nonlazybind_caller:
+; X64FAST:   callq nonlazybind_callee
+; X64FAST:   jmp nonlazybind_callee # TAILCALL
+; X86-LABEL: nonlazybind_caller:
+; X86:       calll nonlazybind_callee
+; X86:       jmp nonlazybind_callee # TAILCALL
+; X86FAST-LABEL: nonlazybind_caller:
+; X86FAST:   calll nonlazybind_callee
+; X86FAST:   jmp nonlazybind_callee # TAILCALL
+
+
+@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb1),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb2),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb3),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb4),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb5),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb6),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb7),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb8),
+                                                   i8* blockaddress(@indirectbr_rewrite, %bb9)]
+
+; Check that when retpolines are enabled a function with indirectbr gets
+; rewritten to use a switch, and that the switch in turn isn't lowered as a
+; jump table: the -NOT checks below forbid jmpq/jmpl after the function label.
+define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 {
+; X64-LABEL: indirectbr_rewrite:
+; X64-NOT:     jmpq
+; X86-LABEL: indirectbr_rewrite:
+; X86-NOT:     jmpl
+entry:
+  %i0 = load i64, i64* %p
+  %target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0
+  %target0 = load i8*, i8** %target.i0
+  indirectbr i8* %target0, [label %bb1, label %bb3]
+
+bb0:
+  store volatile i64 0, i64* %sink
+  br label %latch
+
+bb1:
+  store volatile i64 1, i64* %sink
+  br label %latch
+
+bb2:
+  store volatile i64 2, i64* %sink
+  br label %latch
+
+bb3:
+  store volatile i64 3, i64* %sink
+  br label %latch
+
+bb4:
+  store volatile i64 4, i64* %sink
+  br label %latch
+
+bb5:
+  store volatile i64 5, i64* %sink
+  br label %latch
+
+bb6:
+  store volatile i64 6, i64* %sink
+  br label %latch
+
+bb7:
+  store volatile i64 7, i64* %sink
+  br label %latch
+
+bb8:
+  store volatile i64 8, i64* %sink
+  br label %latch
+
+bb9:
+  store volatile i64 9, i64* %sink
+  br label %latch
+
+latch:
+  %i.next = load i64, i64* %p
+  %target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next
+  %target.next = load i8*, i8** %target.i.next
+  ; List all 10 blocks as possible successors here so that, once this is
+  ; rewritten as a switch, lowering would otherwise try to use a jump table.
+  indirectbr i8* %target.next, [label %bb0,
+                                label %bb1,
+                                label %bb2,
+                                label %bb3,
+                                label %bb4,
+                                label %bb5,
+                                label %bb6,
+                                label %bb7,
+                                label %bb8,
+                                label %bb9]
+}
+
+; Lastly, check that the necessary retpoline thunks were emitted: the r11 thunk for 64-bit, and the eax, ecx, edx, and push thunks for 32-bit.
+;
+; X64-LABEL:         .section        .text.__llvm_retpoline_r11,{{.*}},__llvm_retpoline_r11,comdat
+; X64-NEXT:          .hidden __llvm_retpoline_r11
+; X64-NEXT:          .weak   __llvm_retpoline_r11
+; X64:       __llvm_retpoline_r11:
+; X64-NEXT:  # {{.*}}                                # %entry
+; X64-NEXT:          callq   [[CALL_TARGET:.*]]
+; X64-NEXT:  [[CAPTURE_SPEC:.*]]:                    # Block address taken
+; X64-NEXT:                                          # %entry
+; X64-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; X64-NEXT:          pause
+; X64-NEXT:          lfence
+; X64-NEXT:          jmp     [[CAPTURE_SPEC]]
+; X64-NEXT:          .p2align        4, 0x90
+; X64-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X64-NEXT:                                          # %entry
+; X64-NEXT:          movq    %r11, (%rsp)
+; X64-NEXT:          retq
+;
+; X86-LABEL:         .section        .text.__llvm_retpoline_eax,{{.*}},__llvm_retpoline_eax,comdat
+; X86-NEXT:          .hidden __llvm_retpoline_eax
+; X86-NEXT:          .weak   __llvm_retpoline_eax
+; X86:       __llvm_retpoline_eax:
+; X86-NEXT:  # {{.*}}                                # %entry
+; X86-NEXT:          calll   [[CALL_TARGET:.*]]
+; X86-NEXT:  [[CAPTURE_SPEC:.*]]:                    # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; X86-NEXT:          pause
+; X86-NEXT:          lfence
+; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
+; X86-NEXT:          .p2align        4, 0x90
+; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:          movl    %eax, (%esp)
+; X86-NEXT:          retl
+;
+; X86-LABEL:         .section        .text.__llvm_retpoline_ecx,{{.*}},__llvm_retpoline_ecx,comdat
+; X86-NEXT:          .hidden __llvm_retpoline_ecx
+; X86-NEXT:          .weak   __llvm_retpoline_ecx
+; X86:       __llvm_retpoline_ecx:
+; X86-NEXT:  # {{.*}}                                # %entry
+; X86-NEXT:          calll   [[CALL_TARGET:.*]]
+; X86-NEXT:  [[CAPTURE_SPEC:.*]]:                    # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; X86-NEXT:          pause
+; X86-NEXT:          lfence
+; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
+; X86-NEXT:          .p2align        4, 0x90
+; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:          movl    %ecx, (%esp)
+; X86-NEXT:          retl
+;
+; X86-LABEL:         .section        .text.__llvm_retpoline_edx,{{.*}},__llvm_retpoline_edx,comdat
+; X86-NEXT:          .hidden __llvm_retpoline_edx
+; X86-NEXT:          .weak   __llvm_retpoline_edx
+; X86:       __llvm_retpoline_edx:
+; X86-NEXT:  # {{.*}}                                # %entry
+; X86-NEXT:          calll   [[CALL_TARGET:.*]]
+; X86-NEXT:  [[CAPTURE_SPEC:.*]]:                    # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; X86-NEXT:          pause
+; X86-NEXT:          lfence
+; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
+; X86-NEXT:          .p2align        4, 0x90
+; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:          movl    %edx, (%esp)
+; X86-NEXT:          retl
+;
+; X86-LABEL:         .section        .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat
+; X86-NEXT:          .hidden __llvm_retpoline_push
+; X86-NEXT:          .weak   __llvm_retpoline_push
+; X86:       __llvm_retpoline_push:
+; X86-NEXT:  # {{.*}}                                # %entry
+; X86-NEXT:          calll   [[CALL_TARGET:.*]]
+; X86-NEXT:  [[CAPTURE_SPEC:.*]]:                    # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; X86-NEXT:          pause
+; X86-NEXT:          lfence
+; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
+; X86-NEXT:          .p2align        4, 0x90
+; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:                                          # %entry
+; X86-NEXT:          addl    $4, %esp
+; X86-NEXT:          pushl   4(%esp)
+; X86-NEXT:          pushl   4(%esp)
+; X86-NEXT:          popl    8(%esp)
+; X86-NEXT:          popl    (%esp)
+; X86-NEXT:          retl
+
+
+attributes #0 = { "target-features"="+retpoline" } ; turns on the retpoline target feature for the #0 functions under test
+attributes #1 = { nonlazybind } ; used by the @nonlazybind_callee declaration
diff --git a/test/Transforms/IndirectBrExpand/basic.ll b/test/Transforms/IndirectBrExpand/basic.ll

new file mode 100644 (file)

index 0000000..d0319c6
--- /dev/null
+++ b/test/Transforms/IndirectBrExpand/basic.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -indirectbr-expand -S | FileCheck %s
+; Blocks listed as indirectbr successors get integer ids in the table; a block no indirectbr lists (bb3) keeps its blockaddress.
+; REQUIRES: x86-registered-target
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@test1.targets = constant [4 x i8*] [i8* blockaddress(@test1, %bb0),
+                                     i8* blockaddress(@test1, %bb1),
+                                     i8* blockaddress(@test1, %bb2),
+                                     i8* blockaddress(@test1, %bb3)]
+; CHECK-LABEL: @test1.targets = constant [4 x i8*]
+; CHECK:       [i8* inttoptr (i64 1 to i8*),
+; CHECK:        i8* inttoptr (i64 2 to i8*),
+; CHECK:        i8* inttoptr (i64 3 to i8*),
+; CHECK:        i8* blockaddress(@test1, %bb3)]
+
+define void @test1(i64* readonly %p, i64* %sink) #0 {
+; CHECK-LABEL: define void @test1(
+entry:
+  %i0 = load i64, i64* %p
+  %target.i0 = getelementptr [4 x i8*], [4 x i8*]* @test1.targets, i64 0, i64 %i0
+  %target0 = load i8*, i8** %target.i0
+  ; Only a subset of the blocks (bb0 and bb1) are viable successors here.
+  indirectbr i8* %target0, [label %bb0, label %bb1]
+; CHECK-NOT:     indirectbr
+; CHECK:         %[[ENTRY_V:.*]] = ptrtoint i8* %{{.*}} to i64
+; CHECK-NEXT:    br label %[[SWITCH_BB:.*]]
+
+bb0:
+  store volatile i64 0, i64* %sink
+  br label %latch
+
+bb1:
+  store volatile i64 1, i64* %sink
+  br label %latch
+
+bb2:
+  store volatile i64 2, i64* %sink
+  br label %latch
+
+bb3:
+  store volatile i64 3, i64* %sink
+  br label %latch
+
+latch:
+  %i.next = load i64, i64* %p
+  %target.i.next = getelementptr [4 x i8*], [4 x i8*]* @test1.targets, i64 0, i64 %i.next
+  %target.next = load i8*, i8** %target.i.next
+  ; A different subset of the blocks (bb1 and bb2) are viable successors here.
+  indirectbr i8* %target.next, [label %bb1, label %bb2]
+; CHECK-NOT:     indirectbr
+; CHECK:         %[[LATCH_V:.*]] = ptrtoint i8* %{{.*}} to i64
+; CHECK-NEXT:    br label %[[SWITCH_BB]]
+;
+; Both branches feed one phi; bb0 (id 1) is the switch default, so only ids 2 and 3 get explicit cases.
+; CHECK:       [[SWITCH_BB]]:
+; CHECK-NEXT:    %[[V:.*]] = phi i64 [ %[[ENTRY_V]], %entry ], [ %[[LATCH_V]], %latch ]
+; CHECK-NEXT:    switch i64 %[[V]], label %bb0 [
+; CHECK-NEXT:      i64 2, label %bb1
+; CHECK-NEXT:      i64 3, label %bb2
+; CHECK-NEXT:    ]
+}
+
+attributes #0 = { "target-features"="+retpoline" } ; applied to @test1
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp

index 24cce58047f1a0aa82c2cd5e4a80c0e513ae8477..1c4a599e00d0710c4bc174d72c8a079a6b2b2982 100644 (file)
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -401,6 +401,7 @@ int main(int argc, char **argv) {
    initializeSjLjEHPreparePass(Registry);
    initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
    initializeGlobalMergePass(Registry);
+  initializeIndirectBrExpandPassPass(Registry);
    initializeInterleavedAccessPass(Registry);
    initializeCountingFunctionInserterPass(Registry);
    initializeUnreachableBlockElimLegacyPassPass(Registry);
author	Reid Kleckner <rnk@google.com>
	Thu, 1 Feb 2018 21:28:26 +0000 (21:28 +0000)
committer	Reid Kleckner <rnk@google.com>
	Thu, 1 Feb 2018 21:28:26 +0000 (21:28 +0000)
include/llvm/CodeGen/Passes.h		patch \| blob \| history
include/llvm/CodeGen/TargetPassConfig.h		patch \| blob \| history
include/llvm/InitializePasses.h		patch \| blob \| history
include/llvm/Target/TargetLowering.h		patch \| blob \| history
include/llvm/Target/TargetSubtargetInfo.h		patch \| blob \| history
lib/CodeGen/CMakeLists.txt		patch \| blob \| history
lib/CodeGen/CodeGen.cpp		patch \| blob \| history
lib/CodeGen/IndirectBrExpandPass.cpp	[new file with mode: 0644]	patch \| blob
lib/CodeGen/TargetPassConfig.cpp		patch \| blob \| history
lib/CodeGen/TargetSubtargetInfo.cpp		patch \| blob \| history
lib/Target/X86/CMakeLists.txt		patch \| blob \| history
lib/Target/X86/X86.h		patch \| blob \| history
lib/Target/X86/X86.td		patch \| blob \| history
lib/Target/X86/X86AsmPrinter.h		patch \| blob \| history
lib/Target/X86/X86FastISel.cpp		patch \| blob \| history
lib/Target/X86/X86FrameLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
lib/Target/X86/X86InstrCompiler.td		patch \| blob \| history
lib/Target/X86/X86InstrControl.td		patch \| blob \| history
lib/Target/X86/X86InstrInfo.td		patch \| blob \| history
lib/Target/X86/X86MCInstLower.cpp		patch \| blob \| history
lib/Target/X86/X86RetpolineThunks.cpp	[new file with mode: 0644]	patch \| blob
lib/Target/X86/X86Subtarget.cpp		patch \| blob \| history
lib/Target/X86/X86Subtarget.h		patch \| blob \| history
lib/Target/X86/X86TargetMachine.cpp		patch \| blob \| history
test/CodeGen/X86/O0-pipeline.ll		patch \| blob \| history
test/CodeGen/X86/retpoline-external.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/retpoline.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/IndirectBrExpand/basic.ll	[new file with mode: 0644]	patch \| blob
tools/opt/opt.cpp		patch \| blob \| history