From: Dan Gohman <dan433584@gmail.com>
Date: Tue, 10 May 2016 04:24:02 +0000 (+0000)
Subject: [WebAssembly] Move register stackification and coloring to a late phase.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=01a542927d700791ef19fcaa840a8443164512b6;p=llvm

[WebAssembly] Move register stackification and coloring to a late phase.

Move the register stackification and coloring passes to run very late, after
PEI, tail duplication, and most other passes. This means that all code emitted
and expanded by those passes is now exposed to these passes. This also
eliminates the need for prologue/epilogue code to be manually stackified,
which significantly simplifies the code.

This does require running LiveIntervals a second time. It's useful to think
of these late passes not as late optimization passes, but as a domain-specific
compression algorithm based on knowledge of liveness information. It's used to
compress the code after all conventional optimizations are complete, which is
why it uses LiveIntervals at a phase when actual optimization passes don't
typically need it.

Differential Revision: http://reviews.llvm.org/D20075


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@269012 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index f1aa314d1ff..c4c060fba2e 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1845,7 +1845,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
       if (!MO.isReg() || !MO.isDef() || MO.isDead())
         continue;
       unsigned Reg = MO.getReg();
-      if (!Reg)
+      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
         continue;
       LocalDefs.push_back(Reg);
       addRegAndItsAliases(Reg, TRI, LocalDefsSet);
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index b4fcfa4fed5..ba107e949a6 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -22,13 +22,16 @@ add_llvm_target(WebAssemblyCodeGen
   WebAssemblyLowerBrUnless.cpp
   WebAssemblyMachineFunctionInfo.cpp
   WebAssemblyMCInstLower.cpp
+  WebAssemblyOptimizeLiveIntervals.cpp
   WebAssemblyOptimizeReturned.cpp
   WebAssemblyPeephole.cpp
   WebAssemblyPEI.cpp
+  WebAssemblyPrepareForLiveIntervals.cpp
   WebAssemblyRegisterInfo.cpp
   WebAssemblyRegColoring.cpp
   WebAssemblyRegNumbering.cpp
   WebAssemblyRegStackify.cpp
+  WebAssemblyReplacePhysRegs.cpp
   WebAssemblySelectionDAGInfo.cpp
   WebAssemblySetP2AlignOperands.cpp
   WebAssemblyStoreResults.cpp
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 1b9de43e4f8..d12825d5fb2 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -23,17 +23,25 @@ namespace llvm {
 class WebAssemblyTargetMachine;
 class FunctionPass;
 
+// LLVM IR passes.
 FunctionPass *createWebAssemblyOptimizeReturned();
 
+// ISel and immediate followup passes.
 FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
                                        CodeGenOpt::Level OptLevel);
 FunctionPass *createWebAssemblyArgumentMove();
 FunctionPass *createWebAssemblySetP2AlignOperands();
 
+// Regalloc-time passes.
+FunctionPass *createWebAssemblyPEI();
+
+// Late passes.
+FunctionPass *createWebAssemblyReplacePhysRegs();
+FunctionPass *createWebAssemblyPrepareForLiveIntervals();
+FunctionPass *createWebAssemblyOptimizeLiveIntervals();
 FunctionPass *createWebAssemblyStoreResults();
 FunctionPass *createWebAssemblyRegStackify();
 FunctionPass *createWebAssemblyRegColoring();
-FunctionPass *createWebAssemblyPEI();
 FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
 FunctionPass *createWebAssemblyCFGStackify();
 FunctionPass *createWebAssemblyLowerBrUnless();
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 509573e47c3..d7a753d978b 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -93,10 +93,7 @@ private:
 //===----------------------------------------------------------------------===//
 
 MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
-  const TargetRegisterClass *TRC =
-      TargetRegisterInfo::isVirtualRegister(RegNo)
-          ? MRI->getRegClass(RegNo)
-          : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo);
+  const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
   for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
     if (TRC->hasType(T))
       return T;
@@ -183,13 +180,6 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
     LocalTypes.push_back(getRegType(VReg));
     AnyWARegs = true;
   }
-  auto &PhysRegs = MFI->getPhysRegs();
-  for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) {
-    if (PhysRegs[PReg] == -1U)
-      continue;
-    LocalTypes.push_back(getRegType(PReg));
-    AnyWARegs = true;
-  }
   if (AnyWARegs)
     getTargetStreamer()->emitLocal(LocalTypes);
 
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 2e50c7e6c9f..7e2f4169e15 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -83,8 +83,11 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
                             MachineBasicBlock::iterator &InsertStore,
                             DebugLoc DL) {
   auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
-  unsigned SPAddr =
-      MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *PtrRC =
+      MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+  unsigned SPAddr = MRI.createVirtualRegister(PtrRC);
+  unsigned Discard = MRI.createVirtualRegister(PtrRC);
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
 
   BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), SPAddr)
@@ -92,13 +95,12 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
   auto *MMO = new MachineMemOperand(MachinePointerInfo(),
                                     MachineMemOperand::MOStore, 4, 4);
   BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32),
-          SrcReg)
+          Discard)
       .addImm(0)
       .addReg(SPAddr)
       .addImm(2)  // p2align
       .addReg(SrcReg)
       .addMemOperand(MMO);
-  MF.getInfo<WebAssemblyFunctionInfo>()->stackifyVReg(SPAddr);
 }
 
 MachineBasicBlock::iterator
@@ -122,7 +124,6 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
   auto *MFI = MF.getFrameInfo();
   assert(MFI->getCalleeSavedInfo().empty() &&
          "WebAssembly should not have callee-saved registers");
-  auto *WFI = MF.getInfo<WebAssemblyFunctionInfo>();
 
   if (!needsSP(MF, *MFI)) return;
   uint64_t StackSize = MFI->getStackSize();
@@ -133,8 +134,10 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
   auto InsertPt = MBB.begin();
   DebugLoc DL;
 
-  unsigned SPAddr = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
-  unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+  const TargetRegisterClass *PtrRC =
+      MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+  unsigned SPAddr = MRI.createVirtualRegister(PtrRC);
+  unsigned SPReg = MRI.createVirtualRegister(PtrRC);
   auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer");
   BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPAddr)
       .addExternalSymbol(SPSymbol);
@@ -151,25 +154,22 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
       .addReg(SPAddr)  // addr
       .addImm(2)       // p2align
       .addMemOperand(LoadMMO);
-  WFI->stackifyVReg(SPAddr);
 
   if (StackSize) {
     // Subtract the frame size
-    unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+    unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
     BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
         .addImm(StackSize);
     BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
             WebAssembly::SP32)
         .addReg(SPReg)
         .addReg(OffsetReg);
-    WFI->stackifyVReg(OffsetReg);
-    WFI->stackifyVReg(SPReg);
   }
   if (hasFP(MF)) {
     // Unlike most conventional targets (where FP points to the saved FP),
     // FP points to the bottom of the fixed-size locals, so we can use positive
     // offsets in load/store instructions.
-    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY_LOCAL_I32),
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY),
             WebAssembly::FP32)
         .addReg(WebAssembly::SP32);
   }
@@ -183,40 +183,31 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
   auto *MFI = MF.getFrameInfo();
   uint64_t StackSize = MFI->getStackSize();
   if (!needsSP(MF, *MFI) || !needsSPWriteback(MF, *MFI)) return;
-  auto *WFI = MF.getInfo<WebAssemblyFunctionInfo>();
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   auto &MRI = MF.getRegInfo();
   auto InsertPt = MBB.getFirstTerminator();
   DebugLoc DL;
 
-  if (InsertPt != MBB.end()) {
+  if (InsertPt != MBB.end())
     DL = InsertPt->getDebugLoc();
 
-    // If code has been stackified with the return, disconnect it so that we
-    // don't break the tree when we insert code just before the return.
-    if (InsertPt->isReturn() && InsertPt->getNumExplicitOperands() != 0) {
-      WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
-      MFI.unstackifyVReg(InsertPt->getOperand(0).getReg());
-    }
-  }
-
   // Restore the stack pointer. If we had fixed-size locals, add the offset
   // subtracted in the prolog.
   unsigned SPReg = 0;
   MachineBasicBlock::iterator InsertAddr = InsertPt;
   if (StackSize) {
-    unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+    const TargetRegisterClass *PtrRC =
+        MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+    unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
     InsertAddr =
         BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
             .addImm(StackSize);
     // In the epilog we don't need to write the result back to the SP32 physreg
     // because it won't be used again. We can use a stackified register instead.
-    SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+    SPReg = MRI.createVirtualRegister(PtrRC);
     BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), SPReg)
         .addReg(hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32)
         .addReg(OffsetReg);
-    WFI->stackifyVReg(OffsetReg);
-    WFI->stackifyVReg(SPReg);
   } else {
     SPReg = hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32;
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 28aa1114f16..79950a6b257 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -39,18 +39,13 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
   ///   - defined and used in LIFO order with other stack registers
   BitVector VRegStackified;
 
-  // One entry for each possible target reg. we expect it to be small.
-  std::vector<unsigned> PhysRegs;
-
   // A virtual register holding the pointer to the vararg buffer for vararg
   // functions. It is created and set in TLI::LowerFormalArguments and read by
   // TLI::LowerVASTART
   unsigned VarargVreg = -1U;
 
  public:
-  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {
-    PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U);
-  }
+  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
   ~WebAssemblyFunctionInfo() override;
 
   void addParam(MVT VT) { Params.push_back(VT); }
@@ -69,11 +64,6 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
       VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
     VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
   }
-  void unstackifyVReg(unsigned VReg) {
-    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
-      return;
-    VRegStackified.reset(TargetRegisterInfo::virtReg2Index(VReg));
-  }
   bool isVRegStackified(unsigned VReg) const {
     if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
       return false;
@@ -87,11 +77,8 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
     WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
   }
   unsigned getWAReg(unsigned Reg) const {
-    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-      assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
-      return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
-    }
-    return PhysRegs[Reg];
+    assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+    return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
   }
   // If new virtual registers are created after initWARegs has been called,
   // this function can be used to add WebAssembly register mappings for them.
@@ -99,13 +86,6 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
     assert(VReg = WARegs.size());
     WARegs.push_back(WAReg);
   }
-
-  void addPReg(unsigned PReg, unsigned WAReg) {
-    assert(PReg < WebAssembly::NUM_TARGET_REGS);
-    assert(WAReg < -1U);
-    PhysRegs[PReg] = WAReg;
-  }
-  const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
new file mode 100644
index 00000000000..473de7ddae7
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -0,0 +1,105 @@
+//===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize LiveIntervals for use in a post-RA context.
+//
+/// LiveIntervals normally runs before register allocation when the code is
+/// only recently lowered out of SSA form, so it's uncommon for registers to
+/// have multiple defs, and then they do, the defs are usually closely related.
+/// Later, after coalescing, tail duplication, and other optimizations, it's
+/// more common to see registers with multiple unrelated defs. This pass
+/// updates LiveIntervalAnalysis to distribute the value numbers across separate
+/// LiveIntervals.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-live-intervals"
+
+namespace {
+class WebAssemblyOptimizeLiveIntervals final : public MachineFunctionPass {
+  const char *getPassName() const override {
+    return "WebAssembly Optimize Live Intervals";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyOptimizeLiveIntervals() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyOptimizeLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() {
+  return new WebAssemblyOptimizeLiveIntervals();
+}
+
+bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  assert(MRI.tracksLiveness() &&
+         "OptimizeLiveIntervals expects liveness");
+
+  // Split multiple-VN LiveIntervals into multiple LiveIntervals.
+  SmallVector<LiveInterval*, 4> SplitLIs;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+
+    LIS.splitSeparateComponents(LIS.getInterval(Reg), SplitLIs);
+    SplitLIs.clear();
+  }
+
+  // In PrepareForLiveIntervals, we conservatively inserted IMPLICIT_DEF
+  // instructions to satisfy LiveIntervals' requirement that all uses be
+  // dominated by defs. Now that LiveIntervals has computed which of these
+  // defs are actually needed and which are dead, remove the dead ones.
+  for (auto MII = MF.begin()->begin(), MIE = MF.begin()->end(); MII != MIE; ) {
+    MachineInstr *MI = &*MII++;
+    if (MI->isImplicitDef() && MI->getOperand(0).isDead()) {
+      LiveInterval &LI = LIS.getInterval(MI->getOperand(0).getReg());
+      LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*MI).getRegSlot());
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+    }
+  }
+
+  return false;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/lib/Target/WebAssembly/WebAssemblyPEI.cpp
index e0308ce3c28..ec24ba51528 100644
--- a/lib/Target/WebAssembly/WebAssemblyPEI.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPEI.cpp
@@ -36,6 +36,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/InlineAsm.h"
diff --git a/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
new file mode 100644
index 00000000000..30444ac598a
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -0,0 +1,136 @@
+//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix up code to meet LiveInterval's requirements.
+///
+/// Some CodeGen passes don't preserve LiveInterval's requirements, because
+/// they run after register allocation and it isn't important. However,
+/// WebAssembly runs LiveIntervals in a late pass. This pass transforms code
+/// to meet LiveIntervals' requirements; primarily, it ensures that all
+/// virtual register uses have definitions (IMPLICIT_DEF definitions if
+/// nothing else).
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-prepare-for-live-intervals"
+
+namespace {
+class WebAssemblyPrepareForLiveIntervals final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyPrepareForLiveIntervals() : MachineFunctionPass(ID) {}
+
+private:
+  const char *getPassName() const override {
+    return "WebAssembly Prepare For LiveIntervals";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyPrepareForLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
+  return new WebAssemblyPrepareForLiveIntervals();
+}
+
+/// Test whether the given instruction is an ARGUMENT.
+static bool IsArgument(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case WebAssembly::ARGUMENT_I32:
+  case WebAssembly::ARGUMENT_I64:
+  case WebAssembly::ARGUMENT_F32:
+  case WebAssembly::ARGUMENT_F64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Test whether the given register has an ARGUMENT def.
+static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  for (auto &Def : MRI.def_instructions(Reg))
+    if (IsArgument(&Def))
+      return true;
+  return false;
+}
+
+bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Prepare For LiveIntervals **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  MachineBasicBlock &Entry = *MF.begin();
+
+  assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+         "LiveIntervals shouldn't be active yet!");
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  // BranchFolding and perhaps other passes don't preserve IMPLICIT_DEF
+  // instructions. LiveIntervals requires that all paths to virtual register
+  // uses provide a definition. Insert IMPLICIT_DEFs in the entry block to
+  // conservatively satisfy this.
+  //
+  // TODO: This is fairly heavy-handed; find a better approach.
+  //
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+    // Skip unused registers.
+    if (MRI.use_nodbg_empty(Reg))
+      continue;
+
+    // Skip registers that have an ARGUMENT definition.
+    if (HasArgumentDef(Reg, MRI))
+      continue;
+
+    BuildMI(Entry, Entry.begin(), DebugLoc(),
+            TII.get(WebAssembly::IMPLICIT_DEF), Reg);
+    Changed = true;
+  }
+
+  // Move ARGUMENT_* instructions to the top of the entry block, so that their
+  // liveness reflects the fact that these really are live-in values.
+  for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE; ) {
+    MachineInstr *MI = &*MII++;
+    if (IsArgument(MI)) {
+      MI->removeFromParent();
+      Entry.insert(Entry.begin(), MI);
+    }
+  }
+
+  // Ok, we're now ready to run LiveIntervalAnalysis again.
+  MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+  return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index d508d448d71..4a8fd96f832 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -61,7 +61,6 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
 
   WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const MachineFrameInfo &FrameInfo = *MF.getFrameInfo();
 
   MFI.initWARegs();
 
@@ -73,11 +72,13 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
     case WebAssembly::ARGUMENT_I32:
     case WebAssembly::ARGUMENT_I64:
     case WebAssembly::ARGUMENT_F32:
-    case WebAssembly::ARGUMENT_F64:
+    case WebAssembly::ARGUMENT_F64: {
+      int64_t Imm = MI.getOperand(1).getImm();
       DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
-                   << MI.getOperand(1).getImm() << "\n");
-      MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm());
+                   << Imm << "\n");
+      MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
       break;
+    }
     default:
       break;
     }
@@ -107,17 +108,6 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
       MFI.setWAReg(VReg, CurReg++);
     }
   }
-  // Allocate locals for used physical registers
-  bool HasFP =
-      MF.getSubtarget<WebAssemblySubtarget>().getFrameLowering()->hasFP(MF);
-  if (FrameInfo.getStackSize() > 0 || FrameInfo.adjustsStack() || HasFP) {
-    DEBUG(dbgs() << "PReg SP " << CurReg << "\n");
-    MFI.addPReg(WebAssembly::SP32, CurReg++);
-  }
-  if (HasFP) {
-    DEBUG(dbgs() << "PReg FP " << CurReg << "\n");
-    MFI.addPReg(WebAssembly::FP32, CurReg++);
-  }
 
   return true;
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index c6a3f136d40..8a00d544f48 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -110,6 +110,10 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
       continue;
 
     if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions
+      // from moving down, and we've already checked for that.
+      if (Reg == WebAssembly::ARGUMENTS)
+        continue;
       // If the physical register is never modified, ignore it.
       if (!MRI.isPhysRegModified(Reg))
         continue;
@@ -118,7 +122,7 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
     }
 
     // Ask LiveIntervals whether moving this virtual register use or def to
-    // Insert will change value numbers are seen.
+    // Insert will change which value numbers are seen.
     const LiveInterval &LI = LIS.getInterval(Reg);
     VNInfo *DefVNI =
         MO.isDef() ? LI.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot())
@@ -141,11 +145,23 @@ static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
 static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
                                      const MachineBasicBlock &MBB,
                                      const MachineRegisterInfo &MRI,
-                                     const MachineDominatorTree &MDT) {
+                                     const MachineDominatorTree &MDT,
+                                     LiveIntervals &LIS) {
+  const LiveInterval &LI = LIS.getInterval(Reg);
+
+  const MachineInstr *OneUseInst = OneUse.getParent();
+  VNInfo *OneUseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*OneUseInst));
+
   for (const MachineOperand &Use : MRI.use_operands(Reg)) {
     if (&Use == &OneUse)
       continue;
+
     const MachineInstr *UseInst = Use.getParent();
+    VNInfo *UseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*UseInst));
+
+    if (UseVNI != OneUseVNI)
+      continue;
+
     const MachineInstr *OneUseInst = OneUse.getParent();
     if (UseInst->getOpcode() == TargetOpcode::PHI) {
       // Test that the PHI use, which happens on the CFG edge rather than
@@ -183,13 +199,33 @@ static unsigned GetTeeLocalOpcode(const TargetRegisterClass *RC) {
 
 /// A single-use def in the same block with no intervening memory or register
 /// dependencies; move the def down and nest it with the current instruction.
-static MachineInstr *MoveForSingleUse(unsigned Reg, MachineInstr *Def,
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
+                                      MachineInstr *Def,
                                       MachineBasicBlock &MBB,
                                       MachineInstr *Insert, LiveIntervals &LIS,
-                                      WebAssemblyFunctionInfo &MFI) {
+                                      WebAssemblyFunctionInfo &MFI,
+                                      MachineRegisterInfo &MRI) {
   MBB.splice(Insert, &MBB, Def);
   LIS.handleMove(*Def);
-  MFI.stackifyVReg(Reg);
+
+  if (MRI.hasOneDef(Reg)) {
+    MFI.stackifyVReg(Reg);
+  } else {
+    unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+    Def->getOperand(0).setReg(NewReg);
+    Op.setReg(NewReg);
+
+    // Tell LiveIntervals about the new register.
+    LIS.createAndComputeVirtRegInterval(NewReg);
+
+    // Tell LiveIntervals about the changes to the old register.
+    LiveInterval &LI = LIS.getInterval(Reg);
+    LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*Def).getRegSlot());
+    LIS.shrinkToUses(&LI);
+
+    MFI.stackifyVReg(NewReg);
+  }
+
   ImposeStackOrdering(Def);
   return Def;
 }
@@ -211,18 +247,23 @@ RematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr *Def,
   MFI.stackifyVReg(NewReg);
   ImposeStackOrdering(Clone);
 
+  // Shrink the interval.
+  bool IsDead = MRI.use_empty(Reg);
+  if (!IsDead) {
+    LiveInterval &LI = LIS.getInterval(Reg);
+    LIS.shrinkToUses(&LI);
+    IsDead = !LI.liveAt(LIS.getInstructionIndex(*Def).getDeadSlot());
+  }
+
   // If that was the last use of the original, delete the original.
-  // Otherwise shrink the LiveInterval.
-  if (MRI.use_empty(Reg)) {
+  if (IsDead) {
     SlotIndex Idx = LIS.getInstructionIndex(*Def).getRegSlot();
     LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
-    LIS.removeVRegDefAt(LIS.getInterval(Reg), Idx);
     LIS.removeInterval(Reg);
     LIS.RemoveMachineInstrFromMaps(*Def);
     Def->eraseFromParent();
-  } else {
-    LIS.shrinkToUses(&LIS.getInterval(Reg));
   }
+
   return Clone;
 }
 
@@ -488,13 +529,13 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
         bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, LIS, MRI) &&
                        !TreeWalker.IsOnStack(Reg);
         if (CanMove && MRI.hasOneUse(Reg)) {
-          Insert = MoveForSingleUse(Reg, Def, MBB, Insert, LIS, MFI);
+          Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
         } else if (Def->isAsCheapAsAMove() &&
                    TII->isTriviallyReMaterializable(Def, &AA)) {
           Insert = RematerializeCheapDef(Reg, Op, Def, MBB, Insert, LIS, MFI,
                                          MRI, TII, TRI);
         } else if (CanMove &&
-                   OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT)) {
+                   OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS)) {
           Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
                                          MRI, TII);
         } else {
@@ -536,15 +577,13 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
   SmallVector<unsigned, 0> Stack;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
+      if (MI.isDebugValue())
+        continue;
       for (MachineOperand &MO : reverse(MI.explicit_operands())) {
         if (!MO.isReg())
           continue;
         unsigned Reg = MO.getReg();
 
-        // Don't stackify physregs like SP or FP.
-        if (!TargetRegisterInfo::isVirtualRegister(Reg))
-          continue;
-
         if (MFI.isVRegStackified(Reg)) {
           if (MO.isDef())
             Stack.push_back(Reg);
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 683b52c58d4..239fe89b7ef 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -51,51 +51,6 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const {
   return Reserved;
 }
 
-static bool isStackifiedVReg(const WebAssemblyFunctionInfo *WFI,
-                             const MachineOperand& Op) {
-  if (Op.isReg()) {
-    unsigned Reg = Op.getReg();
-    return TargetRegisterInfo::isVirtualRegister(Reg) &&
-        WFI->isVRegStackified(Reg);
-  }
-  return false;
-}
-
-static bool canStackifyOperand(const MachineInstr& Inst) {
-  unsigned Op = Inst.getOpcode();
-  return Op != TargetOpcode::PHI &&
-      Op != TargetOpcode::INLINEASM &&
-      Op != TargetOpcode::DBG_VALUE;
-}
-
-// Determine if the FI sequence can be stackified, and if so, where the code can
-// be inserted. If stackification is possible, returns true and ajusts II to
-// point to the insertion point.
-bool findInsertPt(const WebAssemblyFunctionInfo *WFI, MachineBasicBlock &MBB,
-                  unsigned OperandNum, MachineBasicBlock::iterator &II) {
-  if (!canStackifyOperand(*II)) return false;
-
-  MachineBasicBlock::iterator InsertPt(II);
-  int StackCount = 0;
-  // Operands are popped in reverse order, so any operands after FIOperand
-  // impose a constraint
-  for (unsigned i = OperandNum; i < II->getNumOperands(); i++) {
-    if (isStackifiedVReg(WFI, II->getOperand(i))) ++StackCount;
-  }
-  // Walk backwards, tracking stack depth. When it reaches 0 we have reached the
-  // top of the subtree.
-  while (StackCount) {
-    if (InsertPt == MBB.begin()) return false;
-    --InsertPt;
-    for (const auto &def : InsertPt->defs())
-      if (isStackifiedVReg(WFI, def)) --StackCount;
-    for (const auto &use : InsertPt->explicit_uses())
-      if (isStackifiedVReg(WFI, use)) ++StackCount;
-  }
-  II = InsertPt;
-  return true;
-}
-
 void WebAssemblyRegisterInfo::eliminateFrameIndex(
     MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
     RegScavenger * /*RS*/) const {
@@ -104,56 +59,67 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
 
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
   const MachineFrameInfo &MFI = *MF.getFrameInfo();
   int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex);
 
+  // If this is the address operand of a load or store, make it relative to SP
+  // and fold the frame offset directly in.
   if (MI.mayLoadOrStore() && FIOperandNum == WebAssembly::MemOpAddressOperandNo) {
-    // If this is the address operand of a load or store, make it relative to SP
-    // and fold the frame offset directly in.
     assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0);
     int64_t Offset = MI.getOperand(1).getImm() + FrameOffset;
 
-    if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) {
-      // If this happens the program is invalid, but better to error here than
-      // generate broken code.
-      report_fatal_error("Memory offset field overflow");
+    if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
+      MI.getOperand(FIOperandNum - 1).setImm(Offset);
+      MI.getOperand(FIOperandNum)
+          .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+      return;
     }
-    MI.getOperand(FIOperandNum - 1).setImm(Offset);
-    MI.getOperand(FIOperandNum)
-        .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
-  } else {
-    // Otherwise calculate the address
-    auto &MRI = MF.getRegInfo();
-    const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
-
-    unsigned FIRegOperand = WebAssembly::SP32;
-    if (FrameOffset) {
-      // Create i32.add SP, offset and make it the operand. We want to stackify
-      // this sequence, but we need to preserve the LIFO expr stack ordering
-      // (i.e. we can't insert our code in between MI and any operands it
-      // pops before FIOperand).
-      auto *WFI = MF.getInfo<WebAssemblyFunctionInfo>();
-      bool CanStackifyFI = findInsertPt(WFI, MBB, FIOperandNum, II);
+  }
 
-      unsigned OffsetOp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
-      BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
-              OffsetOp)
-          .addImm(FrameOffset);
-      if (CanStackifyFI) {
-        WFI->stackifyVReg(OffsetOp);
-        FIRegOperand = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
-        WFI->stackifyVReg(FIRegOperand);
-      } else {
-        FIRegOperand = OffsetOp;
+  // If this is an address being added to a constant, fold the frame offset
+  // into the constant.
+  if (MI.getOpcode() == WebAssembly::ADD_I32) {
+    MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
+    if (OtherMO.isReg()) {
+      unsigned OtherMOReg = OtherMO.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) {
+        MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
+        // TODO: For now we just opportunistically do this in the case where
+        // the CONST_I32 happens to have exactly one def and one use. We
+        // should generalize this to optimize in more cases.
+        if (Def && Def->getOpcode() == WebAssembly::CONST_I32 &&
+            MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) {
+          MachineOperand &ImmMO = Def->getOperand(1);
+          ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
+          MI.getOperand(FIOperandNum)
+              .ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false);
+          return;
+        }
       }
-      BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
-              FIRegOperand)
-          .addReg(WebAssembly::SP32)
-          .addReg(OffsetOp);
     }
-    MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
   }
+
+  // Otherwise create an i32.add SP, offset and make it the operand.
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  unsigned FIRegOperand = WebAssembly::SP32;
+  if (FrameOffset) {
+    // Create i32.add SP, offset and make it the operand.
+    const TargetRegisterClass *PtrRC =
+        MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+    unsigned OffsetOp = MRI.createVirtualRegister(PtrRC);
+    BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
+            OffsetOp)
+        .addImm(FrameOffset);
+    FIRegOperand = MRI.createVirtualRegister(PtrRC);
+    BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
+            FIRegOperand)
+        .addReg(WebAssembly::SP32)
+        .addReg(OffsetOp);
+  }
+  MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
 }
 
 unsigned
diff --git a/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
new file mode 100644
index 00000000000..11bda47eac5
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -0,0 +1,97 @@
+//===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that replaces physical registers with
+/// virtual registers.
+///
+/// LLVM expects certain physical registers, such as a stack pointer. However,
+/// WebAssembly doesn't actually have such physical registers. This pass is run
+/// once LLVM no longer needs these registers, and replaces them with virtual
+/// registers, so they can participate in register stackifying and coloring in
+/// the normal way.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-replace-phys-regs"
+
+namespace {
+class WebAssemblyReplacePhysRegs final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyReplacePhysRegs() : MachineFunctionPass(ID) {}
+
+private:
+  const char *getPassName() const override {
+    return "WebAssembly Replace Physical Registers";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyReplacePhysRegs::ID = 0;
+FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
+  return new WebAssemblyReplacePhysRegs();
+}
+
+bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Replace Physical Registers **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+  bool Changed = false;
+
+  assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+         "LiveIntervals shouldn't be active yet!");
+  // We don't preserve SSA or liveness.
+  MRI.leaveSSA();
+  MRI.invalidateLiveness();
+
+  for (unsigned PReg = WebAssembly::NoRegister + 1;
+       PReg < WebAssembly::NUM_TARGET_REGS; ++PReg) {
+    // Skip fake registers that are never used explicitly.
+    if (PReg == WebAssembly::EXPR_STACK || PReg == WebAssembly::ARGUMENTS)
+      continue;
+
+    // Replace explicit uses of the physical register with a virtual register.
+    const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg);
+    unsigned VReg = WebAssembly::NoRegister;
+    for (auto I = MRI.reg_begin(PReg), E = MRI.reg_end(); I != E; ) {
+      MachineOperand &MO = *I++;
+      if (!MO.isImplicit()) {
+        if (VReg == WebAssembly::NoRegister)
+          VReg = MRI.createVirtualRegister(RC);
+        MO.setReg(VReg);
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index fb2ac9871ef..b3da7861f29 100644
--- a/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -29,6 +29,7 @@
 #include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblySubtarget.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -55,6 +56,9 @@ public:
     AU.addPreserved<MachineBlockFrequencyInfo>();
     AU.addRequired<MachineDominatorTree>();
     AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -74,57 +78,78 @@ FunctionPass *llvm::createWebAssemblyStoreResults() {
 static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
                                  unsigned FromReg, unsigned ToReg,
                                  const MachineRegisterInfo &MRI,
-                                 MachineDominatorTree &MDT) {
+                                 MachineDominatorTree &MDT,
+                                 LiveIntervals &LIS) {
   bool Changed = false;
+
+  LiveInterval *FromLI = &LIS.getInterval(FromReg);
+  LiveInterval *ToLI = &LIS.getInterval(ToReg);
+
+  SlotIndex FromIdx = LIS.getInstructionIndex(MI).getRegSlot();
+  VNInfo *FromVNI = FromLI->getVNInfoAt(FromIdx);
+
+  SmallVector<SlotIndex, 4> Indices;
+
   for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) {
     MachineOperand &O = *I++;
     MachineInstr *Where = O.getParent();
-    if (Where->getOpcode() == TargetOpcode::PHI) {
-      // PHIs use their operands on their incoming CFG edges rather than
-      // in their parent blocks. Get the basic block paired with this use
-      // of FromReg and check that MI's block dominates it.
-      MachineBasicBlock *Pred =
-          Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB();
-      if (!MDT.dominates(&MBB, Pred))
-        continue;
-    } else {
-      // For a non-PHI, check that MI dominates the instruction in the
-      // normal way.
-      if (&MI == Where || !MDT.dominates(&MI, Where))
-        continue;
-    }
+
+    // Check that MI dominates the instruction in the normal way.
+    if (&MI == Where || !MDT.dominates(&MI, Where))
+      continue;
+
+    // If this use gets a different value, skip it.
+    SlotIndex WhereIdx = LIS.getInstructionIndex(*Where);
+    VNInfo *WhereVNI = FromLI->getVNInfoAt(WhereIdx);
+    if (WhereVNI && WhereVNI != FromVNI)
+      continue;
+
+    // Make sure ToReg isn't clobbered before it gets there.
+    VNInfo *ToVNI = ToLI->getVNInfoAt(WhereIdx);
+    if (ToVNI && ToVNI != FromVNI)
+      continue;
+
     Changed = true;
     DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
                  << MI << "\n");
     O.setReg(ToReg);
-    // If the store's def was previously dead, it is no longer. But the
-    // dead flag shouldn't be set yet.
-    assert(!MI.getOperand(0).isDead() && "Unexpected dead flag");
+
+    // If the store's def was previously dead, it is no longer.
+    MI.getOperand(0).setIsDead(false);
+
+    Indices.push_back(WhereIdx.getRegSlot());
+  }
+
+  if (Changed) {
+    // Extend ToReg's liveness.
+    LIS.extendToIndices(*ToLI, Indices);
+
+    // Shrink FromReg's liveness.
+    LIS.shrinkToUses(FromLI);
+
+    // If we replaced all dominated uses, FromReg is now killed at MI.
+    if (!FromLI->liveAt(FromIdx.getDeadSlot()))
+      MI.addRegisterKilled(FromReg,
+                           MBB.getParent()->getSubtarget<WebAssemblySubtarget>()
+                                 .getRegisterInfo());
   }
+
   return Changed;
 }
 
 static bool optimizeStore(MachineBasicBlock &MBB, MachineInstr &MI,
                           const MachineRegisterInfo &MRI,
-                          MachineDominatorTree &MDT) {
-  const auto &Stored = MI.getOperand(WebAssembly::StoreValueOperandNo);
-  switch (Stored.getType()) {
-  case MachineOperand::MO_Register: {
-    unsigned ToReg = MI.getOperand(0).getReg();
-    unsigned FromReg = Stored.getReg();
-    return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT);
-  }
-  case MachineOperand::MO_FrameIndex:
-    // TODO: optimize.
-    return false;
-  default:
-    report_fatal_error("Store results: store not consuming reg or frame index");
-  }
+                          MachineDominatorTree &MDT,
+                          LiveIntervals &LIS) {
+  unsigned ToReg = MI.getOperand(0).getReg();
+  unsigned FromReg = MI.getOperand(WebAssembly::StoreValueOperandNo).getReg();
+  return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
 }
 
 static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
                          const MachineRegisterInfo &MRI,
                          MachineDominatorTree &MDT,
+                         LiveIntervals &LIS,
                          const WebAssemblyTargetLowering &TLI,
                          const TargetLibraryInfo &LibInfo) {
   MachineOperand &Op1 = MI.getOperand(1);
@@ -142,23 +167,12 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
   if (!LibInfo.getLibFunc(Name, Func))
     return false;
 
-  const auto &Op2 = MI.getOperand(2);
-  switch (Op2.getType()) {
-  case MachineOperand::MO_Register: {
-    unsigned FromReg = Op2.getReg();
-    unsigned ToReg = MI.getOperand(0).getReg();
-    if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
-      report_fatal_error("Store results: call to builtin function with wrong "
-                         "signature, from/to mismatch");
-    return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT);
-  }
-  case MachineOperand::MO_FrameIndex:
-    // TODO: optimize.
-    return false;
-  default:
+  unsigned FromReg = MI.getOperand(2).getReg();
+  unsigned ToReg = MI.getOperand(0).getReg();
+  if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
     report_fatal_error("Store results: call to builtin function with wrong "
-                       "signature, not consuming reg or frame index");
-  }
+                       "signature, from/to mismatch");
+  return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
 }
 
 bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
@@ -167,14 +181,18 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
            << "********** Function: " << MF.getName() << '\n';
   });
 
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
   const WebAssemblyTargetLowering &TLI =
       *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
   const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
   bool Changed = false;
 
-  assert(MRI.isSSA() && "StoreResults depends on SSA form");
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
 
   for (auto &MBB : MF) {
     DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
@@ -191,11 +209,11 @@ bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
       case WebAssembly::STORE_F64:
       case WebAssembly::STORE_I32:
       case WebAssembly::STORE_I64:
-        Changed |= optimizeStore(MBB, MI, MRI, MDT);
+        Changed |= optimizeStore(MBB, MI, MRI, MDT, LIS);
         break;
       case WebAssembly::CALL_I32:
       case WebAssembly::CALL_I64:
-        Changed |= optimizeCall(MBB, MI, MRI, MDT, TLI, LibInfo);
+        Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo);
         break;
       }
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index dd3431b9670..472e0d36273 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -103,8 +103,6 @@ public:
 
   void addIRPasses() override;
   bool addInstSelector() override;
-  bool addILPOpts() override;
-  void addPreRegAlloc() override;
   void addPostRegAlloc() override;
   bool addGCPasses() override { return false; }
   void addPreEmitPass() override;
@@ -162,19 +160,6 @@ bool WebAssemblyPassConfig::addInstSelector() {
   return false;
 }
 
-bool WebAssemblyPassConfig::addILPOpts() {
-  (void)TargetPassConfig::addILPOpts();
-  return true;
-}
-
-void WebAssemblyPassConfig::addPreRegAlloc() {
-  TargetPassConfig::addPreRegAlloc();
-
-  // Prepare store instructions for register stackifying.
-  if (getOptLevel() != CodeGenOpt::None)
-    addPass(createWebAssemblyStoreResults());
-}
-
 void WebAssemblyPassConfig::addPostRegAlloc() {
   // TODO: The following CodeGen passes don't currently support code containing
   // virtual registers. Consider removing their restrictions and re-enabling
@@ -194,14 +179,6 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
   disablePass(&LiveDebugValuesID);
   disablePass(&PatchableFunctionID);
 
-  if (getOptLevel() != CodeGenOpt::None) {
-    // Mark registers as representing wasm's expression stack.
-    addPass(createWebAssemblyRegStackify());
-
-    // Run the register coloring pass to reduce the total number of registers.
-    addPass(createWebAssemblyRegColoring());
-  }
-
   TargetPassConfig::addPostRegAlloc();
 
   // Run WebAssembly's version of the PrologEpilogInserter. Target-independent
@@ -213,6 +190,33 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
 void WebAssemblyPassConfig::addPreEmitPass() {
   TargetPassConfig::addPreEmitPass();
 
+  // Now that we have a prologue and epilogue and all frame indices are
+  // rewritten, eliminate SP and FP. This allows them to be stackified,
+  // colored, and numbered with the rest of the registers.
+  addPass(createWebAssemblyReplacePhysRegs());
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    // LiveIntervals isn't commonly run this late. Re-establish preconditions.
+    addPass(createWebAssemblyPrepareForLiveIntervals());
+
+    // Depend on LiveIntervals and perform some optimizations on it.
+    addPass(createWebAssemblyOptimizeLiveIntervals());
+
+    // Prepare store instructions for register stackifying.
+    addPass(createWebAssemblyStoreResults());
+
+    // Mark registers as representing wasm's expression stack. This is a key
+    // code-compression technique in WebAssembly. We run this pass (and
+    // StoreResults above) very late, so that it sees as much code as possible,
+    // including code emitted by PEI and expanded by late tail duplication.
+    addPass(createWebAssemblyRegStackify());
+
+    // Run the register coloring pass to reduce the total number of registers.
+    // This runs after stackification so that it doesn't consider registers
+    // that become stackified.
+    addPass(createWebAssemblyRegColoring());
+  }
+
   // Eliminate multiple-entry loops.
   addPass(createWebAssemblyFixIrreducibleControlFlow());
 
diff --git a/test/CodeGen/WebAssembly/byval.ll b/test/CodeGen/WebAssembly/byval.ll
index 232a4a21022..c7a21abbcf5 100644
--- a/test/CodeGen/WebAssembly/byval.ll
+++ b/test/CodeGen/WebAssembly/byval.ll
@@ -23,26 +23,27 @@ declare void @ext_byval_func_empty(%EmptyStruct* byval)
 ; CHECK-LABEL: byval_arg
 define void @byval_arg(%SmallStruct* %ptr) {
  ; CHECK: .param i32
+ ; CHECK: i32.const $push[[L4:.+]]=, __stack_pointer
  ; Subtract 16 from SP (SP is 16-byte aligned)
  ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
  ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
  ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
+ ; CHECK-NEXT: i32.sub $push[[L10:.+]]=, $pop[[L2]], $pop[[L3]]
  ; Ensure SP is stored back before the call
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store {{.*}}=, 0($pop[[L4]]), [[SP]]
+ ; CHECK-NEXT: i32.store $push[[L12:.+]]=, 0($pop[[L4]]), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L11:.+]]=, $[[SP:.+]]=, $pop[[L12]]{{$}}
  ; Copy the SmallStruct argument to the stack (SP+12, original SP-4)
- ; CHECK-NEXT: i32.load $push[[L4:.+]]=, 0($0)
- ; CHECK-NEXT: i32.store {{.*}}=, 12([[SP]]), $pop[[L4]]
+ ; CHECK-NEXT: i32.load $push[[L0:.+]]=, 0($0)
+ ; CHECK-NEXT: i32.store $discard=, 12($pop[[L11]]), $pop[[L0]]
  ; Pass a pointer to the stack slot to the function
- ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 12
- ; CHECK-NEXT: i32.add $push[[ARG:.+]]=, [[SP]], $pop[[L5]]
- ; CHECK-NEXT: call ext_byval_func@FUNCTION, $pop[[ARG]]
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 12{{$}}
+ ; CHECK-NEXT: i32.add $push[[ARG:.+]]=, $[[SP]], $pop[[L5]]{{$}}
+ ; CHECK-NEXT: call ext_byval_func@FUNCTION, $pop[[ARG]]{{$}}
  call void @ext_byval_func(%SmallStruct* byval %ptr)
  ; Restore the stack
  ; CHECK-NEXT: i32.const $push[[L7:.+]]=, __stack_pointer
  ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[L8:.+]]=, [[SP]], $pop[[L6]]
+ ; CHECK-NEXT: i32.add $push[[L8:.+]]=, $[[SP]], $pop[[L6]]
  ; CHECK-NEXT: i32.store {{.*}}=, 0($pop[[L7]]), $pop[[L8]]
  ; CHECK-NEXT: return
  ret void
@@ -53,14 +54,16 @@ define void @byval_arg_align8(%SmallStruct* %ptr) {
  ; CHECK: .param i32
  ; Don't check the entire SP sequence, just enough to get the alignment.
  ; CHECK: i32.const $push[[L1:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.sub $push[[L10:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.store $push[[L12:.+]]=, 0($pop{{.+}}), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L11:.+]]=, $[[SP:.+]]=, $pop[[L12]]{{$}}
  ; Copy the SmallStruct argument to the stack (SP+8, original SP-8)
- ; CHECK: i32.load $push[[L4:.+]]=, 0($0){{$}}
- ; CHECK-NEXT: i32.store {{.*}}=, 8([[SP]]), $pop[[L4]]{{$}}
+ ; CHECK-NEXT: i32.load $push[[L0:.+]]=, 0($0){{$}}
+ ; CHECK-NEXT: i32.store $discard=, 8($pop[[L11]]), $pop[[L0]]{{$}}
  ; Pass a pointer to the stack slot to the function
- ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 8
- ; CHECK-NEXT: i32.add $push[[ARG:.+]]=, [[SP]], $pop[[L5]]
- ; CHECK-NEXT: call ext_byval_func_align8@FUNCTION, $pop[[ARG]]
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 8{{$}}
+ ; CHECK-NEXT: i32.add $push[[ARG:.+]]=, $[[SP]], $pop[[L5]]{{$}}
+ ; CHECK-NEXT: call ext_byval_func_align8@FUNCTION, $pop[[ARG]]{{$}}
  call void @ext_byval_func_align8(%SmallStruct* byval align 8 %ptr)
  ret void
 }
@@ -70,13 +73,15 @@ define void @byval_arg_double(%AlignedStruct* %ptr) {
  ; CHECK: .param i32
  ; Subtract 16 from SP (SP is 16-byte aligned)
  ; CHECK: i32.const $push[[L1:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.sub $push[[L12:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.store $push[[L15:.+]]=, {{.+}}, $pop[[L12]]
+ ; CHECK-NEXT: tee_local $push[[L14:.+]]=, $[[SP:.+]]=, $pop[[L15]]
  ; Copy the AlignedStruct argument to the stack (SP+0, original SP-16)
  ; Just check the last load/store pair of the memcpy
  ; CHECK: i64.load $push[[L4:.+]]=, 0($0)
- ; CHECK-NEXT: i64.store {{.*}}=, 0([[SP]]), $pop[[L4]]
+ ; CHECK-NEXT: i64.store $discard=, 0($[[SP]]), $pop[[L4]]
  ; Pass a pointer to the stack slot to the function
- ; CHECK-NEXT: call ext_byval_func_alignedstruct@FUNCTION, [[SP]]
+ ; CHECK-NEXT: call ext_byval_func_alignedstruct@FUNCTION, $[[SP]]
  tail call void @ext_byval_func_alignedstruct(%AlignedStruct* byval %ptr)
  ret void
 }
@@ -108,11 +113,15 @@ define void @byval_empty_callee(%EmptyStruct* byval %ptr) {
 
 ; Call memcpy for "big" byvals.
 ; CHECK-LABEL: big_byval:
+; CHECK: i32.const $push[[L4:.+]]=, __stack_pointer
 ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
 ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
 ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 131072
-; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
-; CHECK:      i32.call       ${{[^,]+}}=, memcpy@FUNCTION,
+; CHECK-NEXT: i32.sub $push[[L8:.+]]=, $pop[[L2]], $pop[[L3]]
+; CHECK-NEXT: i32.store $push[[L12:.+]]=, 0($pop[[L4]]), $pop[[L8]]{{$}}
+; CHECK-NEXT: i32.const $push[[L0:.+]]=, 131072
+; CHECK-NEXT: i32.call       $push[[L11:.+]]=, memcpy@FUNCTION, $pop{{.+}}, ${{.+}}, $pop{{.+}}
+; CHECK-NEXT: tee_local      $push[[L9:.+]]=, $[[SP:.+]]=, $pop[[L11]]{{$}}
 ; CHECK-NEXT: call           big_byval_callee@FUNCTION,
 %big = type [131072 x i8]
 declare void @big_byval_callee(%big* byval align 1)
diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll
index 4c31789973a..442d0d4ea25 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify.ll
+++ b/test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -104,17 +104,17 @@ back:
 ; CHECK-NOT: local
 ; CHECK: block{{$}}
 ; CHECK: br_if 0, {{[^,]+}}{{$}}
-; CHECK: .LBB2_1:
+; CHECK: .LBB2_{{[0-9]+}}:
 ; CHECK: br_if 0, ${{[0-9]+}}{{$}}
-; CHECK: .LBB2_2:
+; CHECK: .LBB2_{{[0-9]+}}:
 ; CHECK: return{{$}}
 ; OPT-LABEL: test2:
 ; OPT-NOT: local
 ; OPT: block{{$}}
 ; OPT: br_if 0, {{[^,]+}}{{$}}
-; OPT: .LBB2_1:
+; OPT: .LBB2_{{[0-9]+}}:
 ; OPT: br_if 0, ${{[0-9]+}}{{$}}
-; OPT: .LBB2_2:
+; OPT: .LBB2_{{[0-9]+}}:
 ; OPT: return{{$}}
 define void @test2(double* nocapture %p, i32 %n) {
 entry:
@@ -393,36 +393,32 @@ exit:
 ; CHECK: .LBB11_1:
 ; CHECK: loop{{$}}
 ; CHECK: block{{$}}
-; CHECK: block{{$}}
 ; CHECK: br_if           0, $0{{$}}
 ; CHECK: br              1{{$}}
 ; CHECK: .LBB11_3:
+; CHECK: end_block{{$}}
 ; CHECK: block{{$}}
 ; CHECK: br_if           0, $1{{$}}
 ; CHECK: br              1{{$}}
 ; CHECK: .LBB11_5:
-; CHECK: .LBB11_6:
 ; CHECK: br              0{{$}}
-; CHECK: .LBB11_7:
+; CHECK: .LBB11_6:
 ; CHECK-NEXT: end_loop{{$}}
 ; OPT-LABEL: doublediamond_in_a_loop:
 ; OPT:      .LBB11_1:
 ; OPT:      loop{{$}}
 ; OPT:      block{{$}}
-; OPT-NEXT: block{{$}}
-; OPT-NEXT: block{{$}}
 ; OPT:      br_if           0, {{[^,]+}}{{$}}
-; OPT:      br_if           1, {{[^,]+}}{{$}}
+; OPT:      block{{$}}
+; OPT:      br_if           0, {{[^,]+}}{{$}}
 ; OPT:      br              2{{$}}
 ; OPT-NEXT: .LBB11_4:
 ; OPT-NEXT: end_block{{$}}
 ; OPT:      br              1{{$}}
 ; OPT:      .LBB11_5:
 ; OPT-NEXT: end_block{{$}}
-; OPT:      .LBB11_6:
-; OPT-NEXT: end_block{{$}}
 ; OPT:      br              0{{$}}
-; OPT:      .LBB11_7:
+; OPT:      .LBB11_6:
 ; OPT-NEXT: end_loop{{$}}
 define i32 @doublediamond_in_a_loop(i32 %a, i32 %b, i32* %p) {
 entry:
@@ -756,33 +752,19 @@ u1:
 ; CHECK-LABEL: test8:
 ; CHECK:       .LBB17_1:
 ; CHECK-NEXT:  loop{{$}}
-; CHECK-NEXT:  block{{$}}
-; CHECK-NOT:   block
-; CHECK:       br_if    0, {{[^,]+}}{{$}}
-; CHECK-NOT:   block
-; CHECK:       br_if    1, {{[^,]+}}{{$}}
-; CHECK-NEXT:  .LBB17_3:
-; CHECK-NEXT:  end_block{{$}}
-; CHECK-NEXT:  loop{{$}}
 ; CHECK-NEXT:  i32.const $push{{[^,]+}}, 0{{$}}
 ; CHECK-NEXT:  br_if    0, {{[^,]+}}{{$}}
-; CHECK-NEXT:  br       2{{$}}
-; CHECK-NEXT:  .LBB17_4:
+; CHECK-NEXT:  br       0{{$}}
+; CHECK-NEXT:  .LBB17_2:
+; CHECK-NEXT:  end_loop{{$}}
 ; OPT-LABEL: test8:
 ; OPT:       .LBB17_1:
 ; OPT-NEXT:  loop{{$}}
-; OPT-NEXT:  block{{$}}
-; OPT-NOT:   block
-; OPT:       br_if    0, {{[^,]+}}{{$}}
-; OPT-NOT:   block
-; OPT:       br_if    1, {{[^,]+}}{{$}}
-; OPT-NEXT:  .LBB17_3:
-; OPT-NEXT:  end_block{{$}}
-; OPT-NEXT:  loop{{$}}
 ; OPT-NEXT:  i32.const $push{{[^,]+}}, 0{{$}}
 ; OPT-NEXT:  br_if    0, {{[^,]+}}{{$}}
-; OPT-NEXT:  br       2{{$}}
-; OPT-NEXT:  .LBB17_4:
+; OPT-NEXT:  br       0{{$}}
+; OPT-NEXT:  .LBB17_2:
+; OPT-NEXT:  end_loop{{$}}
 define i32 @test8() {
 bb:
   br label %bb1
@@ -1195,10 +1177,9 @@ bb5:
 ; CHECK-NEXT:     loop{{$}}
 ; CHECK-NEXT:     i32.const   $push0=, 0{{$}}
 ; CHECK-NEXT:     br_if       0, $pop0{{$}}
-; CHECK-NEXT: .LBB23_2:{{$}}
 ; CHECK-NEXT:     end_loop{{$}}
+; CHECK-NEXT: .LBB23_3:{{$}}
 ; CHECK-NEXT:     loop{{$}}
-; CHECK-NEXT:     i32.const   $discard=, 0{{$}}
 ; CHECK-NEXT:     i32.const   $push1=, 0{{$}}
 ; CHECK-NEXT:     br_if       0, $pop1{{$}}
 ; CHECK-NEXT:     end_loop{{$}}
diff --git a/test/CodeGen/WebAssembly/mem-intrinsics.ll b/test/CodeGen/WebAssembly/mem-intrinsics.ll
index f71903c6eb2..3a211a50e4f 100644
--- a/test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -61,8 +61,8 @@ define void @set_no(i8* %dst, i8 %src, i32 %len) {
 
 
 ; CHECK-LABEL: frame_index:
-; CHECK: i32.call $discard=, memset@FUNCTION, $pop12, $pop1, $pop0{{$}}
-; CHECK: i32.call $discard=, memset@FUNCTION, $0, $pop3, $pop2{{$}}
+; CHECK: i32.call $discard=, memset@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK: i32.call $push{{[0-9]+}}=, memset@FUNCTION, ${{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK: return{{$}}
 define void @frame_index() {
 entry:
@@ -76,11 +76,13 @@ entry:
 }
 
 ; If the result value of memset doesn't get stackified, it should be marked
-; $discard.
+; $discard. Note that we use a call to prevent tail dup so that we can test
+; this specific functionality.
 
 ; CHECK-LABEL: discard_result:
 ; CHECK: i32.call $discard=, memset@FUNCTION, $0, $1, $2
 declare i8* @def()
+declare void @block_tail_dup()
 define i8* @discard_result(i8* %arg, i8 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
 bb:
   %tmp = icmp eq i32 %arg3, 0
@@ -101,6 +103,37 @@ bb9:
   %tmp10 = call i8* @def()
   br label %bb11
 
+bb11:
+  %tmp12 = phi i8* [ %arg, %bb7 ], [ %arg, %bb8 ], [ %tmp10, %bb9 ]
+  call void @block_tail_dup()
+  ret i8* %tmp12
+}
+
+; This is the same as discard_result, except we let tail dup happen, so the
+; result of the memset *is* stackified.
+
+; CHECK-LABEL: tail_dup_to_reuse_result:
+; CHECK: i32.call $push{{[0-9]+}}=, memset@FUNCTION, $0, $1, $2
+define i8* @tail_dup_to_reuse_result(i8* %arg, i8 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
+bb:
+  %tmp = icmp eq i32 %arg3, 0
+  br i1 %tmp, label %bb5, label %bb9
+
+bb5:
+  %tmp6 = icmp eq i32 %arg4, 0
+  br i1 %tmp6, label %bb7, label %bb8
+
+bb7:
+  call void @llvm.memset.p0i8.i32(i8* %arg, i8 %arg1, i32 %arg2, i32 1, i1 false)
+  br label %bb11
+
+bb8:
+  br label %bb11
+
+bb9:
+  %tmp10 = call i8* @def()
+  br label %bb11
+
 bb11:
   %tmp12 = phi i8* [ %arg, %bb7 ], [ %arg, %bb8 ], [ %tmp10, %bb9 ]
   ret i8* %tmp12
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 59f26f33497..24b7f89bec1 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -397,9 +397,9 @@ define void @aggregate_load_store({i32,i32,i32,i32}* %p, {i32,i32,i32,i32}* %q)
 ; merged into i64 stores.
 
 ; CHECK-LABEL: aggregate_return:
-; CHECK: i64.const   $push0=, 0{{$}}
-; CHECK: i64.store   $push1=, 8($0):p2align=2, $pop0{{$}}
-; CHECK: i64.store   $discard=, 0($0):p2align=2, $pop1{{$}}
+; CHECK: i64.const   $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK: i64.store   $push[[L1:[0-9]+]]=, 8($0):p2align=2, $pop[[L0]]{{$}}
+; CHECK: i64.store   $discard=, 0($0):p2align=2, $pop[[L1]]{{$}}
 define {i32,i32,i32,i32} @aggregate_return() {
   ret {i32,i32,i32,i32} zeroinitializer
 }
@@ -408,12 +408,12 @@ define {i32,i32,i32,i32} @aggregate_return() {
 ; merged.
 
 ; CHECK-LABEL: aggregate_return_without_merge:
-; CHECK: i32.const   $push0=, 0{{$}}
-; CHECK: i32.store8  $push1=, 14($0), $pop0{{$}}
-; CHECK: i32.store16 $push2=, 12($0), $pop1{{$}}
-; CHECK: i32.store   $discard=, 8($0), $pop2{{$}}
-; CHECK: i64.const   $push3=, 0{{$}}
-; CHECK: i64.store   $discard=, 0($0), $pop3{{$}}
+; CHECK: i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK: i32.store8  $push[[L1:[0-9]+]]=, 14($0), $pop[[L0]]{{$}}
+; CHECK: i32.store16 $push[[L2:[0-9]+]]=, 12($0), $pop[[L1]]{{$}}
+; CHECK: i32.store   $discard=, 8($0), $pop[[L2]]{{$}}
+; CHECK: i64.const   $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK: i64.store   $discard=, 0($0), $pop[[L3]]{{$}}
 define {i64,i32,i16,i8} @aggregate_return_without_merge() {
   ret {i64,i32,i16,i8} zeroinitializer
 }
diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll
index 4c667d58eb9..0737de58e03 100644
--- a/test/CodeGen/WebAssembly/reg-stackify.ll
+++ b/test/CodeGen/WebAssembly/reg-stackify.ll
@@ -194,9 +194,9 @@ entry:
 ; CHECK-LABEL: simple_multiple_use:
 ; CHECK:  .param      i32, i32{{$}}
 ; CHECK-NEXT:  i32.mul     $push[[NUM0:[0-9]+]]=, $1, $0{{$}}
-; CHECK-NEXT:  tee_local   $push[[NUM1:[0-9]+]]=, $0=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT:  tee_local   $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT:  call        use_a@FUNCTION, $pop[[NUM1]]{{$}}
-; CHECK-NEXT:  call        use_b@FUNCTION, $0{{$}}
+; CHECK-NEXT:  call        use_b@FUNCTION, $[[NUM2]]{{$}}
 ; CHECK-NEXT:  return{{$}}
 declare void @use_a(i32)
 declare void @use_b(i32)
@@ -212,8 +212,8 @@ define void @simple_multiple_use(i32 %x, i32 %y) {
 ; CHECK-LABEL: multiple_uses_in_same_insn:
 ; CHECK:  .param      i32, i32{{$}}
 ; CHECK-NEXT:  i32.mul     $push[[NUM0:[0-9]+]]=, $1, $0{{$}}
-; CHECK-NEXT:  tee_local   $push[[NUM1:[0-9]+]]=, $0=, $pop[[NUM0]]{{$}}
-; CHECK-NEXT:  call        use_2@FUNCTION, $pop[[NUM1]], $0{{$}}
+; CHECK-NEXT:  tee_local   $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
+; CHECK-NEXT:  call        use_2@FUNCTION, $pop[[NUM1]], $[[NUM2]]{{$}}
 ; CHECK-NEXT:  return{{$}}
 declare void @use_2(i32, i32)
 define void @multiple_uses_in_same_insn(i32 %x, i32 %y) {
@@ -273,7 +273,6 @@ define i32 @no_stackify_past_use(i32 %arg) {
 ; CHECK-NEXT:   tee_local       $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT:   f64.select      $push{{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}}
 ; CHECK:        $[[NUM2]]=,
-; CHECK:        $[[NUM2]]=,
 define void @multiple_defs(i32 %arg, i32 %arg1, i1 %arg2, i1 %arg3, i1 %arg4) {
 bb:
   br label %bb5
@@ -325,9 +324,9 @@ define i32 @no_stackify_call_past_load() {
 
 ; Don't move stores past loads if there may be aliasing
 ; CHECK-LABEL: no_stackify_store_past_load
-; CHECK: i32.store {{.*}}, 0($1), $0
+; CHECK: i32.store $[[L0:[0-9]+]]=, 0($1), $0
 ; CHECK: i32.load {{.*}}, 0($2)
-; CHECK: i32.call {{.*}}, callee@FUNCTION, $0
+; CHECK: i32.call {{.*}}, callee@FUNCTION, $[[L0]]{{$}}
 define i32 @no_stackify_store_past_load(i32 %a, i32* %p1, i32* %p2) {
   store i32 %a, i32* %p1
   %b = load i32, i32* %p2, align 4
diff --git a/test/CodeGen/WebAssembly/store-results.ll b/test/CodeGen/WebAssembly/store-results.ll
index 55900992d46..dc6bee8b3b9 100644
--- a/test/CodeGen/WebAssembly/store-results.ll
+++ b/test/CodeGen/WebAssembly/store-results.ll
@@ -61,7 +61,8 @@ for.cond.cleanup4.i:
 }
 
 ; CHECK-LABEL: fi_ret:
-; CHECK: i32.store $discard=,
+; CHECK: i32.store $push0=,
+; CHECK: return $pop0{{$}}
 define hidden i8* @fi_ret(i8** %addr) {
 entry:
   %buf = alloca [27 x i8], align 16
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index 09748e6403c..e4ba583b2ac 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -12,19 +12,20 @@ declare void @ext_func_i32(i32* %ptr)
 ; Check that there is an extra local for the stack pointer.
 ; CHECK: .local i32{{$}}
 define void @alloca32() noredzone {
+ ; CHECK: i32.const $push[[L4:.+]]=, __stack_pointer{{$}}
  ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer{{$}}
  ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
  ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i32.store $discard=, 0($pop[[L4]]), [[SP]]
+ ; CHECK-NEXT: i32.sub $push[[L8:.+]]=, $pop[[L2]], $pop[[L3]]
+ ; CHECK-NEXT: i32.store $push[[L10:.+]]=, 0($pop[[L4]]), $pop[[L8]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L9:.+]]=, $[[SP:.+]]=, $pop[[L10]]{{$}}
  %retval = alloca i32
  ; CHECK: i32.const $push[[L0:.+]]=, 0
- ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L0]]
+ ; CHECK: i32.store {{.*}}=, 12($pop[[L9]]), $pop[[L0]]
  store i32 0, i32* %retval
  ; CHECK: i32.const $push[[L6:.+]]=, __stack_pointer
  ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[L7:.+]]=, [[SP]], $pop[[L5]]
+ ; CHECK-NEXT: i32.add $push[[L7:.+]]=, $[[SP]], $pop[[L5]]
  ; CHECK-NEXT: i32.store $discard=, 0($pop[[L6]]), $pop[[L7]]
  ret void
 }
@@ -32,17 +33,18 @@ define void @alloca32() noredzone {
 ; CHECK-LABEL: alloca3264:
 ; CHECK: .local i32{{$}}
 define void @alloca3264() {
- ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
- ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
+ ; CHECK: i32.const $push[[L2:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.load $push[[L3:.+]]=, 0($pop[[L2]])
+ ; CHECK-NEXT: i32.const $push[[L4:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L6:.+]]=, $pop[[L3]], $pop[[L4]]
+ ; CHECK-NEXT: tee_local $push[[L5:.+]]=, $[[SP:.+]]=, $pop[[L6]]
  %r1 = alloca i32
  %r2 = alloca double
- ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 0
- ; CHECK-NEXT: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
+ ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0
+ ; CHECK-NEXT: i32.store $discard=, 12($pop[[L5]]), $pop[[L0]]
  store i32 0, i32* %r1
- ; CHECK-NEXT: i64.const $push[[L0:.+]]=, 0
- ; CHECK-NEXT: i64.store {{.*}}=, 0([[SP]]), $pop[[L0]]
+ ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0
+ ; CHECK-NEXT: i64.store $discard=, 0($[[SP]]), $pop[[L1]]
  store double 0.0, double* %r2
  ; CHECK-NEXT: return
  ret void
@@ -51,52 +53,52 @@ define void @alloca3264() {
 ; CHECK-LABEL: allocarray:
 ; CHECK: .local i32{{$}}
 define void @allocarray() {
- ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
- ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 144{{$}}
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i32.store $discard=, 0($pop[[L4]]), [[SP]]
+ ; CHECK: i32.const $push[[L7:.+]]=, __stack_pointer
+ ; CHECK: i32.const $push[[L4:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.load $push[[L5:.+]]=, 0($pop[[L4]])
+ ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 144{{$}}
+ ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, $pop[[L5]], $pop[[L6]]
+ ; CHECK-NEXT: i32.store $[[SP:.+]]=, 0($pop[[L7]]), $pop[[L11]]
  %r = alloca [33 x i32]
 
- ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 12
- ; CHECK-NEXT: i32.add $push[[L7:.+]]=, [[SP]], $pop[[L5]]
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, 12
- ; CHECK-NEXT: i32.add $push[[L6:.+]]=, $pop[[L7]], $pop[[L4]]
- ; CHECK-NEXT: i32.const $push[[L9:.+]]=, 1{{$}}
- ; CHECK-NEXT: i32.store $push[[L10:.+]]=, 12([[SP]]), $pop[[L9]]{{$}}
- ; CHECK-NEXT: i32.store $discard=, 0($pop3), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: i32.const $push[[L2:.+]]=, 24
+ ; CHECK-NEXT: i32.add $push[[L3:.+]]=, $[[SP]], $pop[[L2]]
+ ; CHECK-NEXT: i32.const $push[[L1:.+]]=, 1{{$}}
+ ; CHECK-NEXT: i32.store $push[[L0:.+]]=, 0($pop[[L3]]), $pop[[L1]]{{$}}
+ ; CHECK-NEXT: i32.store $discard=, 12($[[SP]]), $pop[[L0]]{{$}}
  %p = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 0
  store i32 1, i32* %p
  %p2 = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 3
  store i32 1, i32* %p2
 
- ; CHECK: i32.const $push[[L12:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.const $push[[L11:.+]]=, 144
- ; CHECK-NEXT: i32.add $push[[L13:.+]]=, [[SP]], $pop[[L11]]
- ; CHECK-NEXT: i32.store $discard=, 0($pop[[L12]]), $pop[[L13]]
+ ; CHECK: i32.const $push[[L10:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.const $push[[L8:.+]]=, 144
+ ; CHECK-NEXT: i32.add $push[[L19:.+]]=, $[[SP]], $pop[[L8]]
+ ; CHECK-NEXT: i32.store $discard=, 0($pop[[L10]]), $pop[[L9]]
  ret void
 }
 
 ; CHECK-LABEL: non_mem_use
 define void @non_mem_use(i8** %addr) {
  ; CHECK: i32.const $push[[L1:.+]]=, 48
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.store $[[SP:.+]]=, {{.+}}, $pop[[L11]]
  %buf = alloca [27 x i8], align 16
  %r = alloca i64
  %r2 = alloca i64
  ; %r is at SP+8
+ ; CHECK: tee_local $push[[L12:.+]]=, $[[SP:.+]]=, $pop{{.+}}
  ; CHECK: i32.const $push[[OFF:.+]]=, 8
- ; CHECK-NEXT: i32.add $push[[ARG1:.+]]=, [[SP]], $pop[[OFF]]
+ ; CHECK-NEXT: i32.add $push[[ARG1:.+]]=, $pop[[L12]], $pop[[OFF]]
  ; CHECK-NEXT: call ext_func@FUNCTION, $pop[[ARG1]]
  call void @ext_func(i64* %r)
  ; %r2 is at SP+0, no add needed
- ; CHECK-NEXT: call ext_func@FUNCTION, [[SP]]
+ ; CHECK-NEXT: call ext_func@FUNCTION, $[[SP]]
  call void @ext_func(i64* %r2)
  ; Use as a value, but in a store
  ; %buf is at SP+16
  ; CHECK: i32.const $push[[OFF:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[VAL:.+]]=, [[SP]], $pop[[OFF]]
+ ; CHECK-NEXT: i32.add $push[[VAL:.+]]=, $[[SP]], $pop[[OFF]]
  ; CHECK-NEXT: i32.store {{.*}}=, 0($0), $pop[[VAL]]
  %gep = getelementptr inbounds [27 x i8], [27 x i8]* %buf, i32 0, i32 0
  store i8* %gep, i8** %addr
@@ -106,23 +108,25 @@ define void @non_mem_use(i8** %addr) {
 ; CHECK-LABEL: allocarray_inbounds:
 ; CHECK: .local i32{{$}}
 define void @allocarray_inbounds() {
- ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
- ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 32{{$}}
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
+ ; CHECK: i32.const $push[[L6:.+]]=, __stack_pointer
+ ; CHECK: i32.const $push[[L3:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.load $push[[L4:.+]]=, 0($pop[[L3]])
+ ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32{{$}}
+ ; CHECK-NEXT: i32.sub $push[[L10:.+]]=, $pop[[L4]], $pop[[L5]]
+ ; CHECK-NEXT: i32.store $[[SP:.+]]=, 0($pop[[L6]]), $pop[[L10]]{{$}}
  %r = alloca [5 x i32]
  ; CHECK: i32.const $push[[L3:.+]]=, 1
- ; CHECK: i32.store {{.*}}=, 12([[SP]]), $pop[[L3]]
+ ; CHECK: i32.store {{.*}}=, 12($[[SP]]), $pop[[L3]]
  %p = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 0
  store i32 1, i32* %p
  ; This store should have both the GEP and the FI folded into it.
- ; CHECK-NEXT: i32.store {{.*}}=, 24([[SP]]), $pop
+ ; CHECK-NEXT: i32.store {{.*}}=, 24($[[SP]]), $pop
  %p2 = getelementptr inbounds [5 x i32], [5 x i32]* %r, i32 0, i32 3
  store i32 1, i32* %p2
  call void @ext_func(i64* null);
  ; CHECK: i32.const $push[[L6:.+]]=, __stack_pointer
  ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32
- ; CHECK-NEXT: i32.add $push[[L7:.+]]=, [[SP]], $pop[[L5]]
+ ; CHECK-NEXT: i32.add $push[[L7:.+]]=, $[[SP]], $pop[[L5]]
  ; CHECK-NEXT: i32.store $discard=, 0($pop[[L6]]), $pop[[L7]]
  ret void
 }
@@ -130,14 +134,13 @@ define void @allocarray_inbounds() {
 ; CHECK-LABEL: dynamic_alloca:
 define void @dynamic_alloca(i32 %alloc) {
  ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load [[SP:.+]]=, 0($pop[[L1]])
- ; CHECK-NEXT: copy_local [[FP:.+]]=, [[SP]]
+ ; CHECK-NEXT: i32.load $push[[L13:.+]]=, 0($pop[[L1]])
+ ; CHECK-NEXT: tee_local $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
+ ; CHECK-NEXT: copy_local [[FP:.+]]=, $pop[[L12]]{{$}}
  ; Target independent codegen bumps the stack pointer.
  ; CHECK: i32.sub
- ; CHECK-NEXT: copy_local [[SP]]=,
  ; Check that SP is written back to memory after decrement
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, __stack_pointer{{$}}
- ; CHECK-NEXT: i32.store $discard=, 0($pop[[L4]]), [[SP]]
+ ; CHECK: i32.store $discard=, 0($pop{{.+}}), 
  %r = alloca i32, i32 %alloc
  ; Target-independent codegen also calculates the store addr
  ; CHECK: call ext_func_i32@FUNCTION
@@ -149,16 +152,17 @@ define void @dynamic_alloca(i32 %alloc) {
 
 ; CHECK-LABEL: dynamic_alloca_redzone:
 define void @dynamic_alloca_redzone(i32 %alloc) {
- ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load [[SP:.+]]=, 0($pop[[L1]])
- ; CHECK-NEXT: copy_local [[FP:.+]]=, [[SP]]
+ ; CHECK: i32.const $push[[L8:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.load $push[[L13:.+]]=, 0($pop[[L1]])
+ ; CHECK-NEXT: tee_local $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
+ ; CHECK-NEXT: copy_local [[FP:.+]]=, $pop[[L12]]{{$}}
  ; Target independent codegen bumps the stack pointer
- ; CHECK: i32.sub [[R:.+]]=,
- ; CHECK-NEXT: copy_local [[SP]]=, [[R]]
+ ; CHECK: i32.sub
  %r = alloca i32, i32 %alloc
- ; check-next here asserts that SP is not written back.
- ; CHECK-NEXT: i32.const $push[[ZERO:.+]]=, 0
- ; CHECK-NEXT: i32.store $discard=, 0([[R]]), $pop[[ZERO]]
+ ; CHECK-NEXT: tee_local       $push[[L8:.+]]=, $0=, $pop
+ ; CHECK-NEXT: copy_local      $discard=, $pop[[L8]]{{$}}
+ ; CHECK-NEXT: i32.const       $push[[L6:.+]]=, 0{{$}}
+ ; CHECK-NEXT: i32.store       $discard=, 0($0), $pop[[L6]]{{$}}
  store i32 0, i32* %r
  ; CHECK-NEXT: return
  ret void
@@ -167,26 +171,20 @@ define void @dynamic_alloca_redzone(i32 %alloc) {
 ; CHECK-LABEL: dynamic_static_alloca:
 define void @dynamic_static_alloca(i32 %alloc) noredzone {
  ; Decrement SP in the prolog by the static amount and writeback to memory.
- ; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.load $push[[L2:.+]]=, 0($pop[[L1]])
- ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, $pop[[L2]], $pop[[L3]]
- ; CHECK-NEXT: copy_local [[FP:.+]]=, [[SP]]
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store {{.*}}=, 0($pop[[L4]]), [[SP]]
+ ; CHECK: i32.const $push[[L9:.+]]=, __stack_pointer
+ ; CHECK-NEXT: i32.load $push[[L10:.+]]=, 0($pop[[L9]])
+ ; CHECK-NEXT: i32.const $push[[L11:.+]]=, 16
+ ; CHECK-NEXT: i32.sub $push[[L20:.+]]=, $pop[[L10]], $pop[[L11]]
+ ; CHECK-NEXT: tee_local $push[[L19:.+]]=, $[[FP:.+]]=, $pop[[L20]]
+ ; CHECK:      i32.store $push[[L0:.+]]=, 0($pop{{.+}}), $[[FP]]
  ; Decrement SP in the body by the dynamic amount.
  ; CHECK: i32.sub
- ; CHECK: copy_local [[SP]]=,
  ; Writeback to memory.
- ; CHECK-NEXT: i32.const $push[[L4:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.store {{.*}}=, 0($pop[[L4]]), [[SP]]
+ ; CHECK: i32.store $discard=, 0($pop{{.+}}), $pop{{.+}}
  %r1 = alloca i32
  %r = alloca i32, i32 %alloc
  store i32 0, i32* %r
- ; CHECK: i32.const $push[[L6:.+]]=, __stack_pointer
- ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[L7:.+]]=, [[FP]], $pop[[L5]]
- ; CHECK-NEXT: i32.store $discard=, 0($pop[[L6]]), $pop[[L7]]
+ ; CHEC: i32.store $discard=, 0($pop{{.+}}), $pop{{.+}}
  ret void
 }
 
@@ -196,10 +194,10 @@ define void @dynamic_static_alloca(i32 %alloc) noredzone {
 define void @copytoreg_fi(i1 %cond, i32* %b) {
 entry:
  ; CHECK: i32.const $push[[L1:.+]]=, 16
- ; CHECK-NEXT: i32.sub [[SP:.+]]=, {{.+}}, $pop[[L1]]
+ ; CHECK-NEXT: i32.sub $push[[L3:.+]]=, {{.+}}, $pop[[L1]]
  %addr = alloca i32
  ; CHECK: i32.const $push[[OFF:.+]]=, 12
- ; CHECK-NEXT: i32.add $push[[ADDR:.+]]=, [[SP]], $pop[[OFF]]
+ ; CHECK-NEXT: i32.add $push[[ADDR:.+]]=, $pop[[L3]], $pop[[OFF]]
  ; CHECK-NEXT: copy_local [[COPY:.+]]=, $pop[[ADDR]]
  br label %body
 body:
@@ -216,12 +214,13 @@ declare i8* @llvm.frameaddress(i32)
 
 ; Test __builtin_frame_address(0).
 ; CHECK-LABEL: frameaddress_0:
-; CHECK: i32.const $push[[L1:.+]]=, __stack_pointer
-; CHECK-NEXT: i32.load [[SP:.+]]=, 0($pop[[L1]])
-; CHECK-NEXT: copy_local [[FP:.+]]=, [[SP]]
-; CHECK-NEXT: call use_i8_star@FUNCTION, [[FP]]
-; CHEC K-NEXT: i32.const $push[[L6:.+]]=, __stack_pointer
-; CHEC K-NEXT: i32.store [[SP]]=, 0($pop[[L6]]), [[FP]]
+; CHECK: i32.const $push[[L0:.+]]=, __stack_pointer
+; CHECK-NEXT: i32.load $push[[L3:.+]]=, 0($pop[[L0]])
+; CHECK-NEXT: copy_local $push[[L4:.+]]=, $pop[[L3]]{{$}}
+; CHECK-NEXT: tee_local $push[[L2:.+]]=, $[[FP:.+]]=, $pop[[L4]]{{$}}
+; CHECK-NEXT: call use_i8_star@FUNCTION, $pop[[L2]]
+; CHECK-NEXT: i32.const $push[[L1:.+]]=, __stack_pointer
+; CHECK-NEXT: i32.store $discard=, 0($pop[[L1]]), $[[FP]]
 define void @frameaddress_0() {
   %t = call i8* @llvm.frameaddress(i32 0)
   call void @use_i8_star(i8* %t)