AMDGPU: Fix verifier errors in SILowerControlFlow

author Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 22 Jun 2016 20:15:28 +0000 (20:15 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 22 Jun 2016 20:15:28 +0000 (20:15 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 22 Jun 2016 20:15:28 +0000 (20:15 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 22 Jun 2016 20:15:28 +0000 (20:15 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 3329e3bafdabafc1da68e39cfef77978d77051f1..942346c6d8f824edc9aaf0c82e3819babdf5e3b2 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -652,7 +652,7 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SDLoc &DL, SelectionDAG &DAG) const {
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
  }
  
  //===---------------------------------------------------------------------===//
@@ -2722,10 +2722,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    // AMDIL DAG nodes
    NODE_NAME_CASE(CALL);
    NODE_NAME_CASE(UMUL);
-  NODE_NAME_CASE(RET_FLAG);
    NODE_NAME_CASE(BRANCH_COND);
  
    // AMDGPU DAG nodes
+  NODE_NAME_CASE(ENDPGM)
+  NODE_NAME_CASE(RETURN)
    NODE_NAME_CASE(DWORDADDR)
    NODE_NAME_CASE(FRACT)
    NODE_NAME_CASE(CLAMP)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h

index d4b3766adff014bc320ed5741ef0c1a565c50ed8..e5acf364b540c8696009581773497123275d555e 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -219,9 +219,10 @@ enum NodeType : unsigned {
    FIRST_NUMBER = ISD::BUILTIN_OP_END,
    CALL,        // Function call based on a single integer
    UMUL,        // 32bit unsigned multiplication
-  RET_FLAG,
    BRANCH_COND,
    // End AMDIL ISD Opcodes
+  ENDPGM,
+  RETURN,
    DWORDADDR,
    FRACT,
    CLAMP,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td

index a580d805dad3b3798a40315f3fd79502b484eac0..2b13bb9079ea92532509b6aa66887565ed738298 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -261,5 +261,8 @@ def IL_brcond      : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
  //===----------------------------------------------------------------------===//
  // Call/Return DAG Nodes
  //===----------------------------------------------------------------------===//
-def IL_retflag       : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
      [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

index 0fd17b41f7e861bf582c6a4e496d222624294231..154e992590e2a7748f2da8c320ec99aba177727c 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -17,7 +17,6 @@
  #include "AMDGPUAsmPrinter.h"
  #include "AMDGPUTargetMachine.h"
  #include "InstPrinter/AMDGPUInstPrinter.h"
-#include "R600InstrInfo.h"
  #include "SIInstrInfo.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"
@@ -107,6 +106,29 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
        ++I;
      }
    } else {
+    // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+    // terminator instructions and should only be printed as comments.
+    if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      if (isVerbose()) {
+        SmallVector<char, 16> BBStr;
+        raw_svector_ostream Str(BBStr);
+
+        const MachineBasicBlock *MBB = MI->getOperand(1).getMBB();
+        const MCSymbolRefExpr *Expr
+          = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+        Expr->print(Str, MAI);
+        OutStreamer->emitRawComment(" mask branch " + BBStr);
+      }
+
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" return");
+      return;
+    }
+
      MCInst TmpInst;
      MCInstLowering.lower(MI, TmpInst);
      EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 3bccf163608986f9c92b7c911ded688bfc3aa807..012e3621ec9247a5e94e49b176a16e1503330639 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -448,8 +448,8 @@ void GCNPassConfig::addPreEmitPass() {
  
    addPass(createSIInsertWaitsPass());
    addPass(createSIShrinkInstructionsPass());
-  addPass(createSILowerControlFlowPass(), false);
-  addPass(createSIDebuggerInsertNopsPass(), false);
+  addPass(createSILowerControlFlowPass());
+  addPass(createSIDebuggerInsertNopsPass());
  }
  
  TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td

index b3ff7261447622317de377b674d95a015f3d4ac5..b7a8a808099d08818569366362e3052cfdb129ea 100644 (file)
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1539,8 +1539,9 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
  //===---------------------------------------------------------------------===//
  let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
      usesCustomInserter = 1 in {
-  def RETURN          : ILFormat<(outs), (ins variable_ops),
-      "RETURN", [(IL_retflag)]>;
+  def RETURN : ILFormat<(outs), (ins variable_ops),
+    "RETURN", [(AMDGPUendpgm)]
+  >;
  }
  
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 3bac7dd37bf3a7a0444a62c8145a55c8fdb76bdc..63efbde70c504acf4616eea0dff347154f53acba 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1008,7 +1008,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    if (Flag.getNode())
      RetOps.push_back(Flag);
  
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
  }
  
  unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
@@ -1469,8 +1470,8 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op,
  
    // FIXME: This should really be selected to s_trap, but that requires
    // setting up the trap handler for it to do anything.
-  return DAG.getNode(AMDGPUISD::RET_FLAG, SDLoc(Op), MVT::Other, Op.
-                     getOperand(0));
+  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+                     Op.getOperand(0));
  }
  
  SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td

index 92a4b8ff89a4c4e057a4b416702ed716c06da897..167e3ed749b57fc709dcf052d1d55d78113b3434 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -11,8 +11,9 @@
  //
  //===----------------------------------------------------------------------===//
  
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
-    AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "",
+              list<dag> pattern = []> :
+  AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
  
    field bits<1> VM_CNT = 0;
    field bits<1> EXP_CNT = 0;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td

index b40789517ed6a0008f58237cc0fd6497eca77dca..6f7159cf36679f8d8a66005ddde2f36169bba8c0 100644 (file)
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -426,7 +426,7 @@ def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
  let isTerminator = 1 in {
  
  def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
-  [(IL_retflag)]> {
+  [(AMDGPUendpgm)]> {
    let simm16 = 0;
    let isBarrier = 1;
    let hasCtrlDep = 1;
@@ -1908,7 +1908,7 @@ def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0), "", []
  } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0
  
  let hasSideEffects = 1, SALU = 1 in {
-def SGPR_USE : InstSI <(outs),(ins), "", []>;
+def SGPR_USE : InstSI <(outs), (ins)>;
  }
  
  let usesCustomInserter = 1, SALU = 1 in {
@@ -1919,61 +1919,57 @@ def GET_GROUPSTATICSIZE : InstSI <(outs SReg_32:$sdst), (ins), "",
  // SI pseudo instructions. These are used by the CFG structurizer pass
  // and should be lowered to ISA instructions prior to codegen.
  
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
+let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1 in {
+
+// Dummy terminator instruction to use after control flow instructions are
+// replaced with exec mask operations.
+def SI_MASK_BRANCH : InstSI <
+  (outs SReg_64:$dst), (ins brtarget:$target)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let SALU = 1;
+}
+
  let Uses = [EXEC], Defs = [EXEC] in {
  
  let isBranch = 1, isTerminator = 1 in {
  
  def SI_IF: InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$vcc, brtarget:$target),
-  "",
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), "",
    [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]
  >;
  
  def SI_ELSE : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src, brtarget:$target),
-  "",
-  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]
-> {
+  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), "",
+  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
    let Constraints = "$src = $dst";
  }
  
  def SI_LOOP : InstSI <
-  (outs),
-  (ins SReg_64:$saved, brtarget:$target),
-  "si_loop $saved, $target",
+  (outs), (ins SReg_64:$saved, brtarget:$target), "",
    [(int_amdgcn_loop i64:$saved, bb:$target)]
  >;
  
  } // End isBranch = 1, isTerminator = 1
  
  def SI_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src),
-  "si_else $dst, $src",
+  (outs SReg_64:$dst), (ins SReg_64:$src), "",
    [(set i64:$dst, (int_amdgcn_break i64:$src))]
  >;
  
  def SI_IF_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$vcc, SReg_64:$src),
-  "si_if_break $dst, $vcc, $src",
+  (outs SReg_64:$dst),   (ins SReg_64:$vcc, SReg_64:$src), "",
    [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
  >;
  
  def SI_ELSE_BREAK : InstSI <
-  (outs SReg_64:$dst),
-  (ins SReg_64:$src0, SReg_64:$src1),
-  "si_else_break $dst, $src0, $src1",
+  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), "",
    [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
  >;
  
  def SI_END_CF : InstSI <
-  (outs),
-  (ins SReg_64:$saved),
-  "si_end_cf $saved",
+  (outs), (ins SReg_64:$saved), "",
    [(int_amdgcn_end_cf i64:$saved)]
  >;
  
@@ -1981,30 +1977,24 @@ def SI_END_CF : InstSI <
  
  let Uses = [EXEC], Defs = [EXEC,VCC] in {
  def SI_KILL : InstSI <
-  (outs),
-  (ins VSrc_32:$src),
-  "si_kill $src",
+  (outs), (ins VSrc_32:$src), "",
    [(int_AMDGPU_kill f32:$src)]
  >;
  } // End Uses = [EXEC], Defs = [EXEC,VCC]
  
  } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
  
-let SALU = 1 in
  def SI_PS_LIVE : InstSI <
-  (outs SReg_64:$dst),
-  (ins),
-  "si_ps_live $dst",
-  [(set i1:$dst, (int_amdgcn_ps_live))]
->;
+  (outs SReg_64:$dst), (ins), "",
+  [(set i1:$dst, (int_amdgcn_ps_live))]> {
+  let SALU = 1;
+}
  
  // Used as an isel pseudo to directly emit initialization with an
  // s_mov_b32 rather than a copy of another initialized
  // register. MachineCSE skips copies, and we don't want to have to
  // fold operands before it runs.
-def SI_INIT_M0 : InstSI <
-  (outs),
-  (ins SSrc_32:$src), "", []> {
+def SI_INIT_M0 : InstSI <(outs), (ins SSrc_32:$src)> {
    let Defs = [M0];
    let usesCustomInserter = 1;
    let isPseudo = 1;
@@ -2014,21 +2004,28 @@ def SI_INIT_M0 : InstSI <
    let isReMaterializable = 1;
  }
  
+def SI_RETURN : InstSI <
+  (outs), (ins variable_ops), "", [(AMDGPUreturn)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 1;
+  let SALU = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
  let Uses = [EXEC], Defs = [EXEC, VCC, M0] in {
  
  class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
    (outs VGPR_32:$dst, SReg_64:$temp),
-  (ins rc:$src, VSrc_32:$idx, i32imm:$off),
-  "si_indirect_src $dst, $temp, $src, $idx, $off",
-  []
+  (ins rc:$src, VSrc_32:$idx, i32imm:$off)
  >;
  
  class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
    (outs rc:$dst, SReg_64:$temp),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
-  "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
-  []
-> {
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val)> {
    let Constraints = "$src = $dst";
  }
  
@@ -2052,16 +2049,14 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
    let UseNamedOperandTable = 1, Uses = [EXEC] in {
      def _SAVE : InstSI <
        (outs),
-      (ins sgpr_class:$src, i32imm:$frame_idx),
-      "", []> {
+      (ins sgpr_class:$src, i32imm:$frame_idx)> {
        let mayStore = 1;
        let mayLoad = 0;
      }
  
      def _RESTORE : InstSI <
        (outs sgpr_class:$dst),
-      (ins i32imm:$frame_idx),
-      "", []> {
+      (ins i32imm:$frame_idx)> {
        let mayStore = 0;
        let mayLoad = 1;
      }
@@ -2082,8 +2077,7 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
      def _SAVE : InstSI <
        (outs),
        (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
-           SReg_32:$scratch_offset, i32imm:$offset),
-      "", []> {
+           SReg_32:$scratch_offset, i32imm:$offset)> {
        let mayStore = 1;
        let mayLoad = 0;
      }
@@ -2091,8 +2085,7 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
      def _RESTORE : InstSI <
        (outs vgpr_class:$dst),
        (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
-           i32imm:$offset),
-      "", []> {
+           i32imm:$offset)> {
        let mayStore = 0;
        let mayLoad = 1;
      }
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp

index 68609946f3bbe5dab4c916232c039479a44e1b65..65d1d09cd259567714a94cc4d750e487ac1e67d9 100644 (file)
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -88,10 +88,14 @@ private:
    void Kill(MachineInstr &MI);
    void Branch(MachineInstr &MI);
  
-  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+                              MachineInstr *MovRel,
+                              unsigned SaveReg, unsigned IdxReg, int Offset);
+
+  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
    void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
-  void IndirectSrc(MachineInstr &MI);
-  void IndirectDst(MachineInstr &MI);
+  bool indirectSrc(MachineInstr &MI);
+  bool indirectDst(MachineInstr &MI);
  
  public:
    static char ID;
@@ -104,11 +108,6 @@ public:
    const char *getPassName() const override {
      return "SI Lower control flow pseudo instructions";
    }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
  };
  
  } // End anonymous namespace
@@ -227,6 +226,10 @@ void SILowerControlFlow::If(MachineInstr &MI) {
  
    Skip(MI, MI.getOperand(2));
  
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Reg)
+    .addOperand(MI.getOperand(2));
+
    MI.eraseFromParent();
  }
  
@@ -255,6 +258,10 @@ void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
  
    Skip(MI, MI.getOperand(2));
  
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Dst)
+    .addOperand(MI.getOperand(2));
+
    MI.eraseFromParent();
  }
  
@@ -331,7 +338,8 @@ void SILowerControlFlow::EndCf(MachineInstr &MI) {
  }
  
  void SILowerControlFlow::Branch(MachineInstr &MI) {
-  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+  if (MBB == MI.getParent()->getNextNode())
      MI.eraseFromParent();
  
    // If these aren't equal, this is probably an infinite loop.
@@ -365,75 +373,109 @@ void SILowerControlFlow::Kill(MachineInstr &MI) {
    MI.eraseFromParent();
  }
  
-void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+                                                DebugLoc DL,
+                                                MachineInstr *MovRel,
+                                                unsigned SaveReg,
+                                                unsigned IdxReg,
+                                                int Offset) {
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+    .addReg(IdxReg);
+
+  // Move index from VCC into M0
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+    .addReg(AMDGPU::VCC_LO);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+    .addReg(AMDGPU::M0)
+    .addReg(IdxReg);
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+    .addReg(AMDGPU::VCC);
+
+  if (Offset) {
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+      .addReg(AMDGPU::M0)
+      .addImm(Offset);
+  }
+
+  // Do the actual move
+  LoopBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::VCC);
  
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&LoopBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
    MachineBasicBlock &MBB = *MI.getParent();
    DebugLoc DL = MI.getDebugLoc();
    MachineBasicBlock::iterator I = MI;
  
-  unsigned Save = MI.getOperand(1).getReg();
    unsigned Idx = MI.getOperand(3).getReg();
  
    if (AMDGPU::SReg_32RegClass.contains(Idx)) {
      if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(Idx)
-              .addImm(Offset);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+        .addReg(Idx)
+        .addImm(Offset);
      } else {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-              .addReg(Idx);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(Idx);
      }
-    MBB.insert(I, MovRel);
-  } else {
  
-    assert(AMDGPU::SReg_64RegClass.contains(Save));
-    assert(AMDGPU::VGPR_32RegClass.contains(Idx));
+    MBB.insert(I, MovRel);
+    MI.eraseFromParent();
+    return false;
+  }
  
-    // Save the EXEC mask
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-            .addReg(AMDGPU::EXEC);
+  MachineFunction &MF = *MBB.getParent();
+  unsigned Save = MI.getOperand(1).getReg();
  
-    // Read the next variant into VCC (lower 32 bits) <- also loop target
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-            AMDGPU::VCC_LO)
-            .addReg(Idx);
+  // Reading from a VGPR requires looping over all workitems in the wavefront.
+  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+         AMDGPU::VGPR_32RegClass.contains(Idx));
  
-    // Move index from VCC into M0
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-            .addReg(AMDGPU::VCC_LO);
+  // Save the EXEC mask
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+    .addReg(AMDGPU::EXEC);
  
-    // Compare the just read M0 value to all possible Idx values
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
-      .addReg(AMDGPU::M0)
-      .addReg(Idx);
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
  
-    // Update EXEC, save the original EXEC value to VCC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-            .addReg(AMDGPU::VCC);
+  MF.insert(MBBI, LoopBB);
+  MF.insert(MBBI, RemainderBB);
  
-    if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(AMDGPU::M0)
-              .addImm(Offset);
-    }
-    // Do the actual move
-    MBB.insert(I, MovRel);
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
  
-    // Update EXEC, switch all done bits to 0 and all todo bits to 1
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-            .addReg(AMDGPU::EXEC)
-            .addReg(AMDGPU::VCC);
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessors(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
  
-    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-      .addImm(-7);
+  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Save, Idx, Offset);
  
-    // Restore EXEC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-            .addReg(Save);
+  MachineBasicBlock::iterator First = RemainderBB->begin();
+  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+    .addReg(Save);
  
-  }
    MI.eraseFromParent();
+  return true;
  }
  
  /// \param @VecReg The register which holds element zero of the vector
@@ -463,8 +505,8 @@ void SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
    Reg = RC->getRegister(RegIdx);
  }
  
-void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
    MachineBasicBlock &MBB = *MI.getParent();
    DebugLoc DL = MI.getDebugLoc();
  
@@ -480,11 +522,11 @@ void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {
              .addReg(Reg)
              .addReg(Vec, RegState::Implicit);
  
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Off);
  }
  
-void SILowerControlFlow::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
    MachineBasicBlock &MBB = *MI.getParent();
    DebugLoc DL = MI.getDebugLoc();
  
@@ -501,7 +543,7 @@ void SILowerControlFlow::IndirectDst(MachineInstr &MI) {
              .addReg(Val)
              .addReg(Dst, RegState::Implicit);
  
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Off);
  }
  
  bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -514,11 +556,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    bool NeedFlat = false;
    unsigned Depth = 0;
  
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
+  MachineFunction::iterator NextBB;
  
-    MachineBasicBlock *EmptyMBBAtEnd = NULL;
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
      MachineBasicBlock &MBB = *BI;
+
+    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
      MachineBasicBlock::iterator I, Next;
      bool ExecModified = false;
  
@@ -591,7 +636,15 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
          case AMDGPU::SI_INDIRECT_SRC_V4:
          case AMDGPU::SI_INDIRECT_SRC_V8:
          case AMDGPU::SI_INDIRECT_SRC_V16:
-          IndirectSrc(MI);
+          if (indirectSrc(MI)) {
+            // The block was split at this point. We can safely skip the
+            // inserted middle block and continue with the following block,
+            // which contains the rest of this block's instructions.
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+
            break;
  
          case AMDGPU::SI_INDIRECT_DST_V1:
@@ -599,7 +652,15 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
          case AMDGPU::SI_INDIRECT_DST_V4:
          case AMDGPU::SI_INDIRECT_DST_V8:
          case AMDGPU::SI_INDIRECT_DST_V16:
-          IndirectDst(MI);
+          if (indirectDst(MI)) {
+            // The block was split at this point. We can safely skip the
+            // inserted middle block and continue with the following block,
+            // which contains the rest of this block's instructions.
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+
            break;
  
          case AMDGPU::S_ENDPGM: {
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll

index 54221e96d249997884bc75cd4d197f883ec3e646..836e402aa0b8eaff39b1fd43d61fba334faa87d3 100644 (file)
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -174,6 +174,213 @@ entry:
    ret void
  }
  
+; When the block is split to insert the loop, make sure any other
+; places that need to be expanded in the same block are also handled.
+
+; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
+
+; CHECK: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
+; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; CHECK: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dword [[MOVREL0]]
+; CHECK: buffer_store_dword [[MOVREL1]]
+define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
+  %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
+  store volatile i32 %val0, i32 addrspace(1)* %out0
+  store volatile i32 %val1, i32 addrspace(1)* %out0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[INS0:v[0-9]+]], 62
+; CHECK-DAG: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
+define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id.ext = zext i32 %id to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+  %idx0 = load volatile i32, i32 addrspace(1)* %gep
+  %idx1 = add i32 %idx0, 1
+  %vec1 = insertelement <4 x i32> %vec0, i32 62, i32 %idx0
+  %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+  store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @extract_adjacent_blocks(i32 %arg) #0 {
+bb:
+  %tmp = icmp eq i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb4
+
+bb1:
+  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+  br label %bb7
+
+bb4:
+  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+  br label %bb7
+
+bb7:
+  %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+  store volatile float %tmp8, float addrspace(1)* undef
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb:
+  %tmp = icmp eq i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb4
+
+bb1:                                              ; preds = %bb
+  %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+  br label %bb7
+
+bb4:                                              ; preds = %bb
+  %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+  %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+  br label %bb7
+
+bb7:                                              ; preds = %bb4, %bb1
+  %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+  store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+  ret void
+}
+
+; FIXME: Should be able to fold zero input to movreld to inline imm?
+
+; CHECK-LABEL: {{^}}multi_same_block:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: s_add_i32 m0, [[ARG]], -16
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]]
+
+; CHECK: s_add_i32 m0, [[ARG]], -14
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK: s_mov_b32 m0, -1
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
+; CHECK: s_endpgm
+define void @multi_same_block(i32 %arg) #0 {
+bb:
+  %tmp1 = add i32 %arg, -16
+  %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 0.000000e+00, i32 %tmp1
+  %tmp3 = add i32 %arg, -16
+  %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float 0x3FB99999A0000000, i32 %tmp3
+  %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
+  %tmp6 = extractelement <6 x i32> %tmp5, i32 1
+  %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
+  %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+  store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
+  store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+  ret void
+}
+
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  
+attributes #0 = { nounwind }
  attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll

index cb7ef17d7a355f5a1e7532dbc5470242662a24c8..1e1a757ec51dadd8cbde55e5b5f484c9e2491178 100644 (file)
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -1,17 +1,22 @@
  ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
-target triple = "amdgcn--"
+; This should end with a no-op sequence of exec mask manipulations
+; The mask should be in its original state after executing the unreachable block
  
  ; GCN-LABEL: {{^}}main:
-; GCN: BB0_3:
-; GCN-NEXT: s_branch [[LASTBB:BB[0-9]*_[0-9]*]]
-; GCN-NEXT: BB0_
-; GCN: [[LASTBB]]
-; GCN-NEXT: .Lfunc_end0:
-; ModuleID = 'bugpoint-reduced-simplified.bc'
-target triple = "amdgcn--"
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
  
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: [[RET_BB]]:
+; GCN-NEXT: ; return
+
+; GCN-NEXT: [[UNREACHABLE_BB]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
+; GCN-NEXT: .Lfunc_end0
  define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
  main_body:
    %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll

index ef9c96b5bd1c2220cf44010c2525394c2e220d8b..23b0ffd5b3da0469795dbaedbc2bbc20266460f5 100644 (file)
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -122,9 +122,13 @@ END:
  ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
  ;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
  ;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
-;CHECK-NEXT: %ELSE
-;CHECK: store
-;CHECK: %END
+;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK: store_dword
+;CHECK: [[END_BB]]: ; %END
+;CHECK: s_or_b64 exec, exec,
+;CHECK: v_mov_b32_e32 v0
+;CHECK: ; return
  define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
  main_body:
    %cmp = icmp eq i32 %z, 0
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 22 Jun 2016 20:15:28 +0000 (20:15 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 22 Jun 2016 20:15:28 +0000 (20:15 +0000)
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUMCInstLower.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
lib/Target/AMDGPU/R600Instructions.td		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrFormats.td		patch \| blob \| history
lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
lib/Target/AMDGPU/SILowerControlFlow.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/indirect-addressing-si.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ret_jump.ll		patch \| blob \| history
test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history