From e8e3365d5266760f3d7ad247f21496bb69cfef39 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Jan 2017 22:18:39 +0000 Subject: [PATCH] AMDGPU: Remove spurious out branches after a kill A sequence like this: v_cmpx_le_f32_e32 vcc, 0, v0 s_branch BB0_30 s_cbranch_execnz BB0_30 ; BB#29: exp null off, off, off, off done vm s_endpgm BB0_30: ; %endif110 is likely wrong. The s_branch instruction will unconditionally jump to BB0_30, so the skip block (exp done + endpgm) inserted for performing the kill instruction will never be executed. This results in a GPU hang with Star Ruler 2. The s_branch instruction is added during the "Control Flow Optimizer" pass, which seems to re-organize the basic blocks, whereas we assume that SI_KILL_TERMINATOR is always the last instruction inside a basic block. Thus, after inserting a skip block we just go to the next BB without looking at the instructions that follow the kill, and the s_branch op is never removed. Instead, we should remove the unconditional out branch and let the s_cbranch_execnz skip the two instructions of the skip block if the exec mask is non-zero. This patch fixes the GPU hang and doesn't introduce any regressions with "make check". 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99019 Patch by Samuel Pitoiset git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292985 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInsertSkips.cpp | 11 ++++- .../AMDGPU/insert-skips-kill-uncond.mir | 40 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index c6b420fce8a..9d6feaa94fb 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -263,6 +263,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { BI != BE; BI = NextBB) { NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; + bool HaveSkipBlock = false; if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { // Reached convergence point for last divergent branch. @@ -290,8 +291,14 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::S_BRANCH: // Optimize out branches to the next block. // FIXME: Shouldn't this be handled by BranchFolding? - if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) + if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { MI.eraseFromParent(); + } else if (HaveSkipBlock) { + // Remove the given unconditional branch when a skip block has been + // inserted after the current one and let skip the two instructions + // performing the kill if the exec mask is non-zero. 
+ MI.eraseFromParent(); + } break; case AMDGPU::SI_KILL_TERMINATOR: @@ -300,9 +307,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { if (ExecBranchStack.empty()) { if (skipIfDead(MI, *NextBB)) { + HaveSkipBlock = true; NextBB = std::next(BI); BE = MF.end(); - Next = MBB.end(); } } else { HaveKill = true; diff --git a/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir new file mode 100644 index 00000000000..bd5f296affb --- /dev/null +++ b/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir @@ -0,0 +1,40 @@ +# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold=1 %s -o - | FileCheck %s +# https://bugs.freedesktop.org/show_bug.cgi?id=99019 +--- | + define amdgpu_ps void @kill_uncond_branch() { + ret void + } +... +--- + +# CHECK-LABEL: name: kill_uncond_branch + +# CHECK: bb.0: +# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit %vcc + +# CHECK: bb.1: +# CHECK: V_CMPX_LE_F32_e32 +# CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit %exec + +# CHECK: bb.3: +# CHECK-NEXT: EXP_DONE +# CHECK: S_ENDPGM + +# CHECK: bb.2: +# CHECK: S_ENDPGM + +name: kill_uncond_branch + +body: | + bb.0: + successors: %bb.1 + S_CBRANCH_VCCNZ %bb.1, implicit %vcc + + bb.1: + successors: %bb.2 + %vgpr0 = V_MOV_B32_e32 0, implicit %exec + SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM -- 2.40.0