[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
+// Copies the active channels of the source value to the destination value,
+// with the guarantee that the source value is computed as if the entire
+// program were executed in Whole Wavefront Mode, i.e. with all channels
+// enabled, with a few exceptions:
+// - Phi nodes which require WWM return an undefined value.
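+//
+// For illustration only, a typical use from IR (the f32 overload, as
+// exercised by the tests added in this patch):
+// %sum = fadd float %a, %b
+// %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)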
+def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
+ [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createSIFixWWMLivenessPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
+void initializeSIFixWWMLivenessPass(PassRegistry &);
+extern char &SIFixWWMLivenessID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
initializeSIMemoryLegalizerPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
}
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
+ SIFixWWMLiveness.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
default:
continue;
case AMDGPU::COPY:
- case AMDGPU::WQM: {
+ case AMDGPU::WQM:
+ case AMDGPU::WWM: {
// If the destination register is a physical register there isn't really
// much we can do to fix this.
if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
--- /dev/null
+//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Computations in WWM can overwrite values in inactive channels for
+/// variables that the register allocator thinks are dead. This pass adds fake
+/// uses of those variables to WWM instructions to make sure that they aren't
+/// overwritten.
+///
+/// As an example, consider this snippet:
+/// %vgpr0 = V_MOV_B32_e32 0.0
+/// if (...) {
+/// %vgpr1 = ...
+/// %vgpr2 = WWM %vgpr1<kill>
+/// ... = %vgpr2<kill>
+/// %vgpr0 = V_MOV_B32_e32 1.0
+/// }
+/// ... = %vgpr0
+///
+/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
+/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
+/// writing %vgpr1 would only write to channels that would be clobbered by the
+/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
+/// it would clobber even the inactive channels for which the if-condition is
+/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
+/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// same register.
+///
+/// In general, we need to figure out which registers might have inactive
+/// channels that are eventually used but could be accidentally clobbered by a
+/// WWM instruction. We approximate this using two conditions:
+///
+/// 1. A definition of the variable reaches the WWM instruction.
+/// 2. The variable would be live at the WWM instruction if all its defs were
+/// partial defs (i.e. considered as a use), ignoring normal uses.
+///
+/// If a register matches both conditions, then we add an implicit use of it to
+/// the WWM instruction. Condition #2 is the heart of the matter: every
+/// definition is really a partial definition, since every VALU instruction is
+/// implicitly predicated. We can usually ignore this, but WWM forces us not
+/// to. Condition #1 prevents false positives if the variable is undefined at
+/// the WWM instruction anyways. This is overly conservative in certain cases,
+/// especially in uniform control flow, but this is a workaround anyways until
+/// LLVM gains the notion of predicated uses and definitions of variables.
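+///
+/// For instance, in the snippet above %vgpr0 satisfies both conditions at the
+/// WWM instruction: the write of 0.0 reaches it (#1), and treating the later
+/// write of 1.0 as a use keeps %vgpr0 live across it (#2), so the pass adds
+/// %vgpr0 as an implicit use of the WWM instruction.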
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-wwm-liveness"
+
+namespace {
+
+class SIFixWWMLiveness : public MachineFunctionPass {
+private:
+ LiveIntervals *LIS = nullptr;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+
+ SIFixWWMLiveness() : MachineFunctionPass(ID) {
+ initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool runOnWWMInstruction(MachineInstr &MI);
+
+ void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
+
+ StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+ "SI fix WWM liveness", false, false)
+
+char SIFixWWMLiveness::ID = 0;
+
+char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
+
+FunctionPass *llvm::createSIFixWWMLivenessPass() {
+ return new SIFixWWMLiveness();
+}
+
+void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
+{
+ for (const MachineOperand &Op : MI.defs()) {
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (TRI->isVGPR(*MRI, Reg))
+ Regs.set(Reg);
+ }
+ }
+}
+
+bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
+ MachineBasicBlock *MBB = WWM.getParent();
+
+ // Compute the registers that are live out of the WWM instruction by
+ // figuring out which defs are reachable from it.
+ SparseBitVector<> LiveOut;
+
+ for (auto II = MachineBasicBlock::iterator(WWM), IE =
+ MBB->end(); II != IE; ++II) {
+ addDefs(*II, LiveOut);
+ }
+
+ for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
+ E = df_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, LiveOut);
+ }
+ }
+
+ // Compute the registers whose defs reach the WWM instruction.
+ SparseBitVector<> Reachable;
+
+ for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
+ MBB->rend(); II != IE; ++II) {
+ addDefs(*II, Reachable);
+ }
+
+ for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
+ E = idf_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, Reachable);
+ }
+ }
+
+ // Find the intersection, and add implicit uses.
+ LiveOut &= Reachable;
+
+ bool Modified = false;
+ for (unsigned Reg : LiveOut) {
+ WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ if (LIS) {
+ // FIXME: is there a better way to update the live interval?
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ Modified = true;
+ }
+
+ return Modified;
+}
+
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+ bool Modified = false;
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ Modified |= runOnWWMInstruction(MI);
+ }
+ }
+ }
+
+ return Modified;
+}
return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
0);
}
+ case Intrinsic::amdgcn_wwm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+ 0);
+ }
default:
return Op;
}
MI.eraseFromParent();
break;
}
+ case AMDGPU::EXIT_WWM: {
+ // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
+ // is exited.
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
}
return true;
}
case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
+ case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
+ case AMDGPU::WWM:
if (RI.hasVGPRs(NewDstRC))
return nullptr;
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
-// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy
-// after the WQM pass processes them.
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// that the @earlyclobber is respected. The @earlyclobber is to make sure that
+// the instruction that defines $src0 (which is run in WWM) doesn't
+// accidentally clobber inactive channels of $vdst.
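+//
+// For example, the pseudo appears in MIR as (taken from the
+// si-fix-wwm-liveness test added in this patch):
+// early-clobber %18 = WWM killed %17, implicit %exec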
+let Constraints = "@earlyclobber $vdst" in {
+def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+}
+
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
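+// Restores EXEC from $src0 when leaving WWM. This only gets its own opcode
+// (rather than a plain S_MOV_B64) so that SIFixWWMLiveness can tell when WWM
+// is exited; it is expanded to S_MOV_B64 post-RA (see SIInstrInfo).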
+def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
-/// shaders.
+/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// ...
/// S_MOV_B64 EXEC, Tmp
///
+/// We also compute when a sequence of instructions requires Whole Wavefront
+/// Mode (WWM) and insert instructions to save and restore it:
+///
+/// S_OR_SAVEEXEC_B64 Tmp, -1
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
enum {
StateWQM = 0x1,
- StateExact = 0x2,
+ StateWWM = 0x2,
+ StateExact = 0x4,
};
struct PrintState {
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
if (PS.State & StateWQM)
OS << "WQM";
- if (PS.State & StateExact) {
+ if (PS.State & StateWWM) {
if (PS.State & StateWQM)
OS << '|';
+ OS << "WWM";
+ }
+ if (PS.State & StateExact) {
+ if (PS.State & (StateWQM | StateWWM))
+ OS << '|';
OS << "Exact";
}
class SIWholeQuadMode : public MachineFunctionPass {
private:
+ CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);
+ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveOrig);
+ void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedOrig);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
void lowerLiveMaskQueries(unsigned LiveMaskReg);
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
- assert(Flag == StateWQM);
+ assert(!(Flag & StateExact) && Flag != 0);
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
- assert(Flag == StateWQM);
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
Flags = StateWQM;
} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
// correct, so we need it to be in WQM.
Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);
+ } else if (Opcode == AMDGPU::WWM) {
+ // The WWM intrinsic doesn't make the same guarantee, and it also needs
+ // to be executed in WQM or Exact so that its copy doesn't clobber
+ // inactive lanes.
+ markInstructionUses(MI, StateWWM, Worklist);
+ GlobalFlags |= StateWWM;
+ LowerToCopyInstrs.push_back(&MI);
+ continue;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
Worklist.push_back(&MBB);
}
GlobalFlags |= StateExact;
- III.Disabled = StateWQM;
+ III.Disabled = StateWQM | StateWWM;
continue;
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
- char InNeeds = II.Needs | II.OutNeeds;
+ char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
LIS->InsertMachineInstrInMaps(*MI);
}
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveOrig) {
+ MachineInstr *MI;
+
+ assert(SaveOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+ SaveOrig)
+ .addImm(-1);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedOrig) {
+ MachineInstr *MI;
+
+ assert(SavedOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+ .addReg(SavedOrig);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {
auto BII = Blocks.find(&MBB);
const BlockInfo &BI = BII->second;
- if (!(BI.InNeeds & StateWQM))
- return;
-
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
unsigned SavedWQMReg = 0;
+ unsigned SavedNonWWMReg = 0;
bool WQMFromExec = isEntry;
- char State = isEntry ? StateExact : StateWQM;
+ char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ char NonWWMState = 0;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)
++II; // Skip the instruction that saves LiveMask
- MachineBasicBlock::iterator First = IE;
+ // This stores the first instruction where it's safe to switch from WQM to
+ // Exact or vice versa.
+ MachineBasicBlock::iterator FirstWQM = IE;
+
+ // This stores the first instruction where it's safe to switch from WWM to
+ // Exact/WQM or to switch to WWM. It must always be the same as, or after,
+ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
+ // switch to/from WQM as well.
+ MachineBasicBlock::iterator FirstWWM = IE;
for (;;) {
MachineBasicBlock::iterator Next = II;
- char Needs = StateExact | StateWQM;
+ char Needs = StateExact | StateWQM; // WWM is disabled by default
char OutNeeds = 0;
- if (First == IE)
- First = II;
+ if (FirstWQM == IE)
+ FirstWQM = II;
+
+ if (FirstWWM == IE)
+ FirstWWM = II;
+ // First, figure out the allowed states (Needs) based on the propagated
+ // flags.
if (II != IE) {
MachineInstr &MI = *II;
if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
- if (III->second.Needs & StateWQM)
+ if (III->second.Needs & StateWWM)
+ Needs = StateWWM;
+ else if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;
}
+ } else {
+ // If the instruction doesn't actually need a correct EXEC, then we can
+ // safely leave WWM enabled.
+ Needs = StateExact | StateWQM | StateWWM;
}
if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateWQM | StateExact;
}
+ // Now, transition if necessary.
if (!(Needs & State)) {
+ MachineBasicBlock::iterator First;
+ if (State == StateWWM || Needs == StateWWM) {
+ // We must switch to or from WWM
+ First = FirstWWM;
+ } else {
+ // We only need to switch to/from WQM, so we can use FirstWQM
+ First = FirstWQM;
+ }
+
MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact || WQMFromExec);
- if (Needs == StateExact) {
- if (!WQMFromExec && (OutNeeds & StateWQM))
- SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ if (State == StateWWM) {
+ assert(SavedNonWWMReg);
+ fromWWM(MBB, Before, SavedNonWWMReg);
+ State = NonWWMState;
+ }
- toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
- State = StateExact;
+ if (Needs == StateWWM) {
+ NonWWMState = State;
+ SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ toWWM(MBB, Before, SavedNonWWMReg);
+ State = StateWWM;
} else {
- assert(Needs == StateWQM);
- assert(WQMFromExec == (SavedWQMReg == 0));
+ if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- toWQM(MBB, Before, SavedWQMReg);
+ toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ State = StateExact;
+ } else if (State == StateExact && (Needs & StateWQM) &&
+ !(Needs & StateExact)) {
+ assert(WQMFromExec == (SavedWQMReg == 0));
- if (SavedWQMReg) {
- LIS->createAndComputeVirtRegInterval(SavedWQMReg);
- SavedWQMReg = 0;
+ toWQM(MBB, Before, SavedWQMReg);
+
+ if (SavedWQMReg) {
+ LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+ State = StateWQM;
+ } else {
+ // We can get here if we transitioned from WWM to a non-WWM state that
+ // already matches our needs; in that case there is nothing more to do.
+ assert(Needs & State);
}
- State = StateWQM;
}
-
- First = IE;
}
- if (Needs != (StateExact | StateWQM))
- First = IE;
+ if (Needs != (StateExact | StateWQM | StateWWM)) {
+ if (Needs != (StateExact | StateWQM))
+ FirstWQM = IE;
+ FirstWWM = IE;
+ }
if (II == IE)
break;
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
- if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
- return false;
-
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
+ CallingConv = MF.getFunction()->getCallingConv();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
+ unsigned LiveMaskReg = 0;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(AMDGPU::EXEC);
- return !LiveMaskQueries.empty();
- }
-
- // Store a copy of the original live mask when required
- unsigned LiveMaskReg = 0;
- {
+ if (!(GlobalFlags & StateWWM))
+ return !LiveMaskQueries.empty();
+ } else {
+ // Store a copy of the original live mask when required
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
LIS->InsertMachineInstrInMaps(*MI);
}
+ lowerLiveMaskQueries(LiveMaskReg);
+
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);
- lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();
// EntryMI may become invalid here
return true;
DEBUG(printInfo());
- lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();
// Handle the general case
--- /dev/null
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
+#CHECK: %exec = EXIT_WWM killed %19, implicit %21
+
+---
+name: test_wwm_liveness
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64, preferred-register: '' }
+ - { id: 1, class: sgpr_32, preferred-register: '' }
+ - { id: 2, class: sgpr_32, preferred-register: '' }
+ - { id: 3, class: vgpr_32, preferred-register: '' }
+ - { id: 4, class: vgpr_32, preferred-register: '' }
+ - { id: 5, class: vgpr_32, preferred-register: '' }
+ - { id: 6, class: vgpr_32, preferred-register: '' }
+ - { id: 7, class: vgpr_32, preferred-register: '' }
+ - { id: 8, class: sreg_64, preferred-register: '%vcc' }
+ - { id: 9, class: sreg_64, preferred-register: '' }
+ - { id: 10, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 11, class: sreg_64, preferred-register: '' }
+ - { id: 12, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 13, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 14, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 15, class: sreg_128, preferred-register: '' }
+ - { id: 16, class: vgpr_32, preferred-register: '' }
+ - { id: 17, class: vgpr_32, preferred-register: '' }
+ - { id: 18, class: vgpr_32, preferred-register: '' }
+ - { id: 19, class: sreg_64, preferred-register: '' }
+ - { id: 20, class: sreg_64, preferred-register: '' }
+ - { id: 21, class: vgpr_32, preferred-register: '' }
+ - { id: 22, class: sreg_64, preferred-register: '' }
+ - { id: 23, class: sreg_64, preferred-register: '' }
+liveins:
+body: |
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ %21 = V_MOV_B32_e32 0, implicit %exec
+ %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit %exec
+ %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit %exec
+ %8 = V_CMP_GT_U32_e64 32, killed %6, implicit %exec
+ %22 = COPY %exec, implicit-def %exec
+ %23 = S_AND_B64 %22, %8, implicit-def dead %scc
+ %0 = S_XOR_B64 %23, %22, implicit-def dead %scc
+ %exec = S_MOV_B64_term killed %23
+ SI_MASK_BRANCH %bb.2, implicit %exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %13 = S_MOV_B32 61440
+ %14 = S_MOV_B32 -1
+ %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
+ %19 = COPY %exec
+ %exec = S_MOV_B64 -1
+ %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
+ %17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
+ %exec = EXIT_WWM killed %19
+ %21 = V_MOV_B32_e32 1, implicit %exec
+ early-clobber %18 = WWM killed %17, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit %exec :: (store 4)
+
+ bb.2:
+ %exec = S_OR_B64 %exec, killed %0, implicit-def %scc
+ %vgpr0 = COPY killed %21
+ SI_RETURN_TO_EPILOG killed %vgpr0
+
+...
ret float %out.2
}
+; Check that WWM is triggered by the wwm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_wwm1:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %out = fadd float %src0, %src1
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_wwm2:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_i32_e32
+define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %src0.0 = bitcast float %src0 to i32
+ %src1.0 = bitcast float %src1 to i32
+ %out = add i32 %src0.0, %src1.0
+ %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
+ %out.1 = bitcast i32 %out.0 to float
+ ret float %out.1
+}
+
+; Check that we don't leave WWM on for computations that don't require WWM,
+; since that will lead to clobbering things that aren't supposed to be
+; clobbered in cases like this.
+;
+;CHECK-LABEL: {{^}}test_wwm3:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
+main_body:
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+ %out = fadd float %src, %src
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ %out.1 = fadd float %src, %out.0
+ br label %endif
+
+endif:
+ %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+ ret float %out.2
+}
+
+; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
+; write could clobber disabled channels in the non-WWM one.
+;
+;CHECK-LABEL: {{^}}test_wwm4:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
+main_body:
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+ %out = fadd float %src, %src
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ br label %endif
+
+endif:
+ %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+ ret float %out.1
+}
+
+; Make sure the transition from Exact to WWM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_wwm5:
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %temp = fadd float %src1, %src1
+ %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
+ %out = fadd float %temp.0, %temp.0
+ %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+ ret float %out.0
+}
+
+; Check that WWM is turned on correctly across basic block boundaries.
+;
+;CHECK-LABEL: {{^}}test_wwm6:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+define amdgpu_ps float @test_wwm6() {
+main_body:
+ %src0 = load volatile float, float addrspace(1)* undef
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ %src1 = load volatile float, float addrspace(1)* undef
+ %out = fadd float %src0, %src1
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ br label %endif
+
+endif:
+ %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+ ret float %out.1
+}
+
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
declare void @llvm.AMDGPU.kill(float) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
+declare float @llvm.amdgcn.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
--- /dev/null
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
+
+---
+# Check for awareness that s_or_saveexec_b64 clobbers SCC
+#
+#CHECK: S_OR_SAVEEXEC_B64
+#CHECK: S_CMP_LT_I32
+#CHECK: S_CSELECT_B32
+name: test_wwm_scc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_32, preferred-register: '' }
+ - { id: 1, class: sgpr_32, preferred-register: '' }
+ - { id: 2, class: sgpr_32, preferred-register: '' }
+ - { id: 3, class: vgpr_32, preferred-register: '' }
+ - { id: 4, class: vgpr_32, preferred-register: '' }
+ - { id: 5, class: sgpr_32, preferred-register: '' }
+ - { id: 6, class: vgpr_32, preferred-register: '' }
+ - { id: 7, class: vgpr_32, preferred-register: '' }
+ - { id: 8, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 9, class: sreg_32, preferred-register: '' }
+ - { id: 10, class: sreg_32, preferred-register: '' }
+ - { id: 11, class: vgpr_32, preferred-register: '' }
+ - { id: 12, class: vgpr_32, preferred-register: '' }
+liveins:
+ - { reg: '%sgpr0', virtual-reg: '%0' }
+ - { reg: '%sgpr1', virtual-reg: '%1' }
+ - { reg: '%sgpr2', virtual-reg: '%2' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0:
+ liveins: %sgpr0, %sgpr1, %sgpr2, %vgpr0
+
+ %3 = COPY %vgpr0
+ %2 = COPY %sgpr2
+ %1 = COPY %sgpr1
+ %0 = COPY %sgpr0
+ S_CMP_LT_I32 0, %0, implicit-def %scc
+ %12 = V_ADD_I32_e32 %3, %3, implicit-def %vcc, implicit %exec
+ %5 = S_CSELECT_B32 %2, %1, implicit %scc
+ %11 = V_ADD_I32_e32 %5, %12, implicit-def %vcc, implicit %exec
+ %vgpr0 = WWM %11, implicit %exec
+ SI_RETURN_TO_EPILOG %vgpr0
+
+...