AMDGPU: Implement early ifcvt target hooks.

author Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 25 Jan 2017 04:25:02 +0000 (04:25 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 25 Jan 2017 04:25:02 +0000 (04:25 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 25 Jan 2017 04:25:02 +0000 (04:25 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 25 Jan 2017 04:25:02 +0000 (04:25 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h

index c055b8c5f79a4d682664831e2128b337a5bd278c..37a21c25a50d0bfb616e117808fd1387d81b2f51 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -526,6 +526,11 @@ public:
      this->GISel.reset(&GISel);
    }
  
+  // XXX - Why is this here if it isn't in the default pass set?
+  bool enableEarlyIfConversion() const override {
+    return true;
+  }
+
    void overrideSchedPolicy(MachineSchedPolicy &Policy,
                             unsigned NumRegionInstrs) const override;
  
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 06adb3723057fcad011d44bcfa8190c398b56dd1..1b48d61ade35fd16169b23ba1c42f8f8072859d8 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -58,6 +58,11 @@ static cl::opt<bool> EnableSROA(
    cl::ReallyHidden,
    cl::init(true));
  
+static cl::opt<bool>
+EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                        cl::desc("Run early if-conversion"),
+                        cl::init(false));
+
  static cl::opt<bool> EnableR600IfConvert(
    "r600-if-convert",
    cl::desc("Use if conversion pass"),
@@ -360,6 +365,7 @@ public:
    void addIRPasses() override;
    bool addPreISel() override;
    void addMachineSSAOptimization() override;
+  bool addILPOpts() override;
    bool addInstSelector() override;
  #ifdef LLVM_BUILD_GLOBAL_ISEL
    bool addIRTranslator() override;
@@ -552,6 +558,14 @@ void GCNPassConfig::addMachineSSAOptimization() {
    addPass(&SILoadStoreOptimizerID);
  }
  
+bool GCNPassConfig::addILPOpts() {
+  if (EnableEarlyIfConversion)
+    addPass(&EarlyIfConverterID);
+
+  TargetPassConfig::addILPOpts();
+  return false;
+}
+
  void GCNPassConfig::addIRPasses() {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp

index a647e29c82c9eb7b790c28d347ca3591eb270d39..ca1fa94d81203be5ffe780554e3c806a5322c6f7 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1291,6 +1291,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
    return Count;
  }
  
+// Copy the flags onto the implicit condition register operand.
+static void preserveCondRegFlags(MachineOperand &CondReg,
+                                 const MachineOperand &OrigCond) {
+  CondReg.setIsUndef(OrigCond.isUndef());
+  CondReg.setIsKill(OrigCond.isKill());
+}
+
  unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *TBB,
                                     MachineBasicBlock *FBB,
@@ -1318,9 +1325,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
        .addMBB(TBB);
  
      // Copy the flags onto the implicit condition register operand.
-    MachineOperand &CondReg = CondBr->getOperand(1);
-    CondReg.setIsUndef(Cond[1].isUndef());
-    CondReg.setIsKill(Cond[1].isKill());
+    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
  
      if (BytesAdded)
        *BytesAdded = 4;
@@ -1352,6 +1357,136 @@ bool SIInstrInfo::reverseBranchCondition(
    return false;
  }
  
+bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                                  ArrayRef<MachineOperand> Cond,
+                                  unsigned TrueReg, unsigned FalseReg,
+                                  int &CondCycles,
+                                  int &TrueCycles, int &FalseCycles) const {
+  switch (Cond[0].getImm()) {
+  case VCCNZ:
+  case VCCZ: {
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+    assert(MRI.getRegClass(FalseReg) == RC);
+
+    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+
+    // Limit to equal cost for branch vs. N v_cndmask_b32s.
+    return !RI.isSGPRClass(RC) && NumInsts <= 6;
+  }
+  case SCC_TRUE:
+  case SCC_FALSE: {
+    // FIXME: We could insert for VGPRs if we could replace the original compare
+    // with a vector one.
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+    assert(MRI.getRegClass(FalseReg) == RC);
+
+    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+    // Multiples of 8 bytes (even dword counts) can use s_cselect_b64.
+    if (NumInsts % 2 == 0)
+      NumInsts /= 2;
+
+    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+    return RI.isSGPRClass(RC);
+  }
+  default:
+    return false;
+  }
+}
+
+void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, const DebugLoc &DL,
+                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
+                               unsigned TrueReg, unsigned FalseReg) const {
+  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
+  if (Pred == VCCZ || Pred == SCC_FALSE) {
+    Pred = static_cast<BranchPredicate>(-Pred);
+    std::swap(TrueReg, FalseReg);
+  }
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+  unsigned DstSize = DstRC->getSize();
+
+  if (DstSize == 4) {
+    unsigned SelOp = Pred == SCC_TRUE ?
+      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
+
+    // Operands go in (false, true) order. NOTE(review): confirm for S_CSELECT.
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(SelOp), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+    return;
+  }
+
+  if (DstSize == 8 && Pred == SCC_TRUE) {
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+    return;
+  }
+
+  static const int16_t Sub0_15[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+  };
+
+  static const int16_t Sub0_15_64[] = {
+    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+  };
+
+  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
+  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
+  const int16_t *SubIndices = Sub0_15;
+  int NElts = DstSize / 4;
+
+  // 64-bit select is only available for SALU.
+  if (Pred == SCC_TRUE) {
+    SelOp = AMDGPU::S_CSELECT_B64;
+    EltRC = &AMDGPU::SGPR_64RegClass;
+    SubIndices = Sub0_15_64;
+
+    assert(NElts % 2 == 0);
+    NElts /= 2;
+  }
+
+  MachineInstrBuilder MIB = BuildMI(
+    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
+
+  I = MIB->getIterator();
+
+  SmallVector<unsigned, 8> Regs;
+  for (int Idx = 0; Idx != NElts; ++Idx) {
+    unsigned DstElt = MRI.createVirtualRegister(EltRC);
+    Regs.push_back(DstElt);
+
+    unsigned SubIdx = SubIndices[Idx];
+
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(SelOp), DstElt)
+      .addReg(FalseReg, 0, SubIdx)
+      .addReg(TrueReg, 0, SubIdx);
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+
+    MIB.addReg(DstElt)
+       .addImm(SubIdx);
+  }
+}
+
  static void removeModOperands(MachineInstr &MI) {
    unsigned Opc = MI.getOpcode();
    int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h

index 7d31512ad520f9e090a2afc05401f64be9a22fa9..5f53fd189173c48b896a1d3ccb676c988d8ed35e 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -203,6 +203,18 @@ public:
    bool reverseBranchCondition(
      SmallVectorImpl<MachineOperand> &Cond) const override;
  
+
+  bool canInsertSelect(const MachineBasicBlock &MBB,
+                       ArrayRef<MachineOperand> Cond,
+                       unsigned TrueReg, unsigned FalseReg,
+                       int &CondCycles,
+                       int &TrueCycles, int &FalseCycles) const override;
+
+  void insertSelect(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator I, const DebugLoc &DL,
+                    unsigned DstReg, ArrayRef<MachineOperand> Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+
    bool
    areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                    AliasAnalysis *AA = nullptr) const override;
diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td

index be27966fd5f1fb6be20049faf02092c129c6e80b..0f02f5825cb0d1729475592b3b980749572d5714 100644 (file)
--- a/lib/Target/AMDGPU/SISchedule.td
+++ b/lib/Target/AMDGPU/SISchedule.td
@@ -53,6 +53,11 @@ class SISchedMachineModel : SchedMachineModel {
    let MicroOpBufferSize = 1;
    let IssueWidth = 1;
    let PostRAScheduler = 1;
+
+  // FIXME: Approximate 2 * branch cost.  Try to hack around bad
+  // early-ifcvt heuristics. These need improvement to avoid the OOE
+  // heuristics.
+  int MispredictPenalty = 20;
  }
  
  def SIFullSpeedModel : SISchedMachineModel;
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll

new file mode 100644 (file)

index 0000000..d1624f8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -0,0 +1,110 @@
+; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Most of these cases don't trigger because of broken cost
+; heuristics. Should not need -stress-early-ifcvt.
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+; GCN: v_add_f64 v{{\[}}[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
+; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+entry:
+  %v = load double, double addrspace(1)* %in
+  %cc = fcmp oeq double %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd double %v, %v
+  br label %endif
+
+endif:
+  %r = phi double [ %v, %entry ], [ %u, %if ]
+  store double %r, double addrspace(1)* %out
+  ret void
+}
+
+; vcc branch with SGPR inputs
+; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle64:
+; GCN: v_cmp_neq_f64
+; GCN: v_add_f64
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+entry:
+  %v = load double, double addrspace(2)* %in
+  %cc = fcmp oeq double %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd double %v, %v
+  br label %endif
+
+endif:
+  %r = phi double [ %v, %entry ], [ %u, %if ]
+  store double %r, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle96:
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: s_mov_b64 vcc, [[CMP]]
+
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+
+; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx2
+define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <3 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
+  store <3 x i32> %r, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle128:
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: s_mov_b64 vcc, [[CMP]]
+
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+
+; GCN: buffer_store_dwordx4
+define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <4 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
+  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll

new file mode 100644 (file)

index 0000000..5ae1db8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -0,0 +1,454 @@
+; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: This leaves behind a now-unnecessary 'and' with exec.
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %else
+
+if:
+  %u0 = fadd float %v, %v
+  br label %endif
+
+else:
+  %u1 = fmul float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %u0, %if ], [ %u1, %else ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
+; GCN: ; clobber vcc
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
+; GCN: s_mov_b64 vcc, [[CMP]]
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+entry:
+  %v = load i32, i32 addrspace(1)* %in
+  %cc = fcmp oeq float %k, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  call void asm "; clobber $0", "~{VCC}"() #0
+  %u = add i32 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i32 [ %v, %entry ], [ %u, %if ]
+  store i32 %r, i32 addrspace(1)* %out
+  ret void
+}
+
+; Longest chain of cheap instructions to convert
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_cndmask_b32_e32
+define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u.0 = fmul float %v, %v
+  %u.1 = fmul float %v, %u.0
+  %u.2 = fmul float %v, %u.1
+  %u.3 = fmul float %v, %u.2
+  %u.4 = fmul float %v, %u.3
+  %u.5 = fmul float %v, %u.4
+  %u.6 = fmul float %v, %u.5
+  %u.7 = fmul float %v, %u.6
+  %u.8 = fmul float %v, %u.7
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u.8, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Shortest chain of cheap instructions that should not be converted
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u.0 = fmul float %v, %v
+  %u.1 = fmul float %v, %u.0
+  %u.2 = fmul float %v, %u.1
+  %u.3 = fmul float %v, %u.2
+  %u.4 = fmul float %v, %u.3
+  %u.5 = fmul float %v, %u.4
+  %u.6 = fmul float %v, %u.5
+  %u.7 = fmul float %v, %u.6
+  %u.8 = fmul float %v, %u.7
+  %u.9 = fmul float %v, %u.8
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u.9, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Should still branch over fdiv expansion
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
+; GCN: v_cmp_neq_f32_e32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_div_scale_f32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fdiv float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; vcc branch with SGPR inputs
+; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
+; GCN: v_cmp_neq_f32_e64
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: s_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+entry:
+  %v = load i32, i32 addrspace(2)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add i32 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i32 [ %v, %entry ], [ %u, %if ]
+  store i32 %r, i32 addrspace(1)* %out
+  ret void
+
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
+; GCN: v_cndmask_b32
+define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+entry:
+  %v = load float, float addrspace(2)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Due to a broken cost heuristic, this is not if-converted like
+; test_vccnz_ifcvt_triangle_constant_load even though it should be.
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
+; GCN: v_cndmask_b32
+define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+entry:
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Scalar branch and scalar inputs
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
+; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
+define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load i32, i32 addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add i32 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i32 [ %v, %entry ], [ %u, %if ]
+  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
+  ret void
+}
+
+; FIXME: Should be able to use VALU compare and select
+; Scalar branch but VGPR select operands
+; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_f32_e32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load i64, i64 addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add i64 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i64 [ %v, %entry ], [ %u, %if ]
+  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
+  ret void
+}
+
+; TODO: Can do s_cselect_b64; s_cselect_b32
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <3 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
+  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <4 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
+  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
+define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %else, label %if
+
+if:
+  br label %done
+
+else:
+  br label %done
+
+done:
+  %value = phi i32 [0, %if], [1, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ifcvt_undef_scc:
+; GCN: {{^}}; BB#0:
+; GCN-NEXT: s_load_dwordx2
+; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
+define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+  br i1 undef, label %else, label %if
+
+if:
+  br label %done
+
+else:
+  br label %done
+
+done:
+  %value = phi i32 [0, %if], [1, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
+; GCN: v_cmp_neq_f32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <8 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
+  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
+; GCN: v_cmp_neq_f32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <16 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
+  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll

index 93a2c6998be4b91ed60cd5b76280fb171b0fa0b5..eb6007f21c10cbd440290b2e11f2be9460d0773a 100644 (file)
--- a/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
+++ b/test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
@@ -14,6 +14,7 @@ main_body:
  
  if:
    %u = fadd float %v, %v
+  call void asm sideeffect "", ""() #0 ; Prevent ifconversion
    br label %else
  
  else:
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll

index 154ac361e797e9eee36d9f41528c6f05c6982835..d3e431d1e35e86e3e0ecf8fa8756353052b8f84d 100644 (file)
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
  
  ; GCN-LABEL: {{^}}uniform_if_scc:
  ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 25 Jan 2017 04:25:02 +0000 (04:25 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 25 Jan 2017 04:25:02 +0000 (04:25 +0000)
lib/Target/AMDGPU/AMDGPUSubtarget.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.h		patch \| blob \| history
lib/Target/AMDGPU/SISchedule.td		patch \| blob \| history
test/CodeGen/AMDGPU/early-if-convert-cost.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AMDGPU/early-if-convert.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll		patch \| blob \| history
test/CodeGen/AMDGPU/uniform-cfg.ll		patch \| blob \| history