------------------------------------------------------------------------
r266105 | thomas.stellard | 2016-04-12 11:40:43 -0700 (Tue, 12 Apr 2016) | 15 lines
AMDGPU/SI: Insert wait states required after v_readfirstlane on SI
Summary:
We will be able to handle this case much better once the hazard
recognizer
is finished, but this conservative implementation fixes a hang with the
piglit
test:
spec/arb_arrays_of_arrays/execution/sampler/fs-nested-struct-arrays-nonconst-nested-arra
Reviewers: arsenm, nhaehnle
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D18988
------------------------------------------------------------------------
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@271731
91177308-0d34-0410-b5e6-
96231b3b80d8
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI =
static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
-
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ // Insert required wait states for SMRD reading an SGPR written by a VALU
+ // instruction.
+ if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
+ TII->insertWaitStates(std::next(I), 4);
+
// Wait for everything before a barrier.
if (I->getOpcode() == AMDGPU::S_BARRIER)
Changes |= insertWait(MBB, I, LastIssued);
; SI: buffer_store_dword
; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI-NEXT: s_nop
; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; SI: buffer_store_dword
; SI: s_endpgm
; SI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
; GCN-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI-NEXT: s_nop
; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]