From: Matt Arsenault
Date: Wed, 8 Mar 2017 01:06:58 +0000 (+0000)
Subject: AMDGPU: Don't wait at end of block with a trivial successor
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f06f68a796f28576a5f1464753de31493b1b3da9;p=llvm

AMDGPU: Don't wait at end of block with a trivial successor

If there is only one successor, and that successor has only one
predecessor, the wait can obviously be delayed until the uses or the
end of the next block. This avoids code quality regressions when
trivial fallthrough blocks are inserted for structurization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@297251 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 3daf13cb4b5..285bc8a8144 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -524,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
   }
 }
 
+/// Return true if \p MBB has one successor immediately following, and is its
+/// only predecessor
+static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
+  if (MBB.succ_size() != 1)
+    return false;
+
+  const MachineBasicBlock *Succ = *MBB.succ_begin();
+  return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -642,8 +652,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
       EndPgmBlocks.push_back(&MBB);
     }
 
-    // Wait for everything at the end of the MBB
-    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+    // Wait for everything at the end of the MBB. If there is only one
+    // successor, we can defer this until the uses there.
+    if (!hasTrivialSuccessor(MBB))
+      Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
   }
 
   if (HaveScalarStores) {
diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir
index cb5de6a2419..afe651af093 100644
--- a/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/test/CodeGen/AMDGPU/waitcnt.mir
@@ -7,6 +7,15 @@
   <4 x i32> addrspace(4)* %flat16) {
     ret void
   }
+
+  define void @single_fallthrough_successor_no_end_block_wait() {
+    ret void
+  }
+
+  define void @single_branch_successor_not_next_block() {
+    ret void
+  }
+
 ...
 ---
 
@@ -21,18 +30,21 @@
 
 # CHECK-LABEL: bb.1:
 # CHECK: FLAT_LOAD_DWORD
+# CHECK: S_WAITCNT 3952
 # CHECK: FLAT_LOAD_DWORDX4
 # The first load has no mem operand, so we should assume it accesses the flat
 # address space.
 # s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: S_WAITCNT 127
 
 # CHECK-LABEL: bb.2:
 # CHECK: FLAT_LOAD_DWORD
+# CHECK: S_WAITCNT 3952
 # CHECK: FLAT_LOAD_DWORDX4
+
 # One outstand loads access the flat address space.
 # s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: S_WAITCNT 127
 
 name: flat_zero_waitcnt
 
@@ -57,3 +69,60 @@ body: |
     %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
     S_ENDPGM
 ...
+---
+# There is only a single fallthrough successor block, so there's no
+# need to wait immediately.
+
+# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
+# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
+# CHECK-NOT: S_WAITCNT
+
+# CHECK: bb.1:
+# CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: FLAT_STORE_DWORD
+name: single_fallthrough_successor_no_end_block_wait
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+  bb.1:
+    %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
+    FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# The block has a single predecessor with a single successor, but it
+# is not the next block so it's non-obvious that the wait is not needed.
+
+
+# CHECK-LABEL: name: single_branch_successor_not_next_block
+# CHECK: %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
+# CHECK-NEXT: S_WAITCNT 112
+
+# CHECK: bb.1
+# CHECK-NEXT: FLAT_STORE_DWORD
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.2:
+# CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: FLAT_STORE_DWORD
+name: single_branch_successor_not_next_block
+
+body: |
+  bb.0:
+    successors: %bb.2
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_BRANCH %bb.2
+
+  bb.1:
+    FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+
+  bb.2:
+    %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
+    FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
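
A note on the raw S_WAITCNT immediates in the CHECK lines above: the operand packs all
three hardware counters into one 16-bit field, and a counter left at its maximum value
is not waited on. The standalone sketch below decodes the three values used in this
test (112, 127, 3952); it assumes the pre-gfx9 field layout (vmcnt in bits [3:0],
expcnt in bits [6:4], lgkmcnt in bits [11:8]) and is illustrative only, not part of
the patch.

// Illustrative sketch only (not part of the patch). Assumes the pre-gfx9
// s_waitcnt immediate layout: vmcnt = bits [3:0], expcnt = bits [6:4],
// lgkmcnt = bits [11:8]; a field at its maximum means "do not wait on it".
#include <cstdio>

static void decodeWaitcnt(unsigned Imm) {
  unsigned VmCnt = Imm & 0xF;           // outstanding vector-memory ops
  unsigned ExpCnt = (Imm >> 4) & 0x7;   // outstanding exports
  unsigned LgkmCnt = (Imm >> 8) & 0xF;  // outstanding LDS/GDS/scalar-memory/message ops
  std::printf("S_WAITCNT %4u -> vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n",
              Imm, VmCnt, ExpCnt, LgkmCnt);
}

int main() {
  decodeWaitcnt(112);  // 0x070: vmcnt(0) lgkmcnt(0), expcnt at max (ignored)
  decodeWaitcnt(127);  // 0x07F: lgkmcnt(0) only
  decodeWaitcnt(3952); // 0xF70: vmcnt(0) only
}

Under that assumed layout, 112 decodes to vmcnt(0) lgkmcnt(0), 127 to lgkmcnt(0) alone,
and 3952 to vmcnt(0) alone, which matches how the test expectations change from a single
combined wait (112) to separate vmcnt (3952) and lgkmcnt (127) waits.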