From: Konstantin Zhuravlyov Date: Fri, 2 Jun 2017 17:40:26 +0000 (+0000) Subject: AMDGPU: Make auto waitcnt before barrier a feature X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e0fcf724678ba3bde203899113065a642088f7ab;p=llvm AMDGPU: Make auto waitcnt before barrier a feature Differential Revision: https://reviews.llvm.org/D33793 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304571 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index e7ebb37a9d6..b50e8d1d659 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -365,6 +365,13 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", "Force to generate flat instruction for global" >; +def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < + "auto-waitcnt-before-barrier", + "AutoWaitcntBeforeBarrier", + "true", + "Hardware automatically inserts waitcnt before barrier" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 6e301b4ad52..8d157e2f98f 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -91,6 +91,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FPExceptions(false), DX10Clamp(false), FlatForGlobal(false), + AutoWaitcntBeforeBarrier(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 4f397ad1c48..ed9cbb994fa 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -110,6 +110,7 @@ protected: bool FPExceptions; bool DX10Clamp; bool FlatForGlobal; + bool AutoWaitcntBeforeBarrier; bool UnalignedScratchAccess; bool UnalignedBufferAccess; bool HasApertureRegs; @@ -364,6 +365,10 @@ public: return FlatForGlobal; } + bool hasAutoWaitcntBeforeBarrier() const { + return AutoWaitcntBeforeBarrier; + } + bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; } @@ -728,12 +733,6 @@ public: /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; - /// \returns True if waitcnt instruction is needed before barrier instruction, - /// false otherwise. - bool needWaitcntBeforeBarrier() const { - return true; - } - /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index e22166d03e9..c10badba88f 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1009,7 +1009,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // occurs before the instruction. Doing it here prevents any additional // S_WAITCNTs from being emitted if the instruction was marked as // requiring a WAITCNT beforehand. - if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) { + if (MI.getOpcode() == AMDGPU::S_BARRIER && + !ST->hasAutoWaitcntBeforeBarrier()) { EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); EmitSwaitcnt |= ScoreBrackets->updateByWait( diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 9f32ecfa52f..bc86515d8b1 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -630,7 +630,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks if ((I->getOpcode() == AMDGPU::S_BARRIER && - ST->needWaitcntBeforeBarrier()) || + !ST->hasAutoWaitcntBeforeBarrier()) || I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT) Required = LastIssued; diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index ef9cda14285..3d815cca5be 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -1,10 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 -check-prefix=NOAUTO %s +; RUN: llc -march=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 -check-prefix=AUTO %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=NOAUTO %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=AUTO %s ; GCN-LABEL: {{^}}test_barrier: ; GFX8: buffer_store_dword ; GFX9: flat_store_dword -; GCN: s_waitcnt +; NOAUTO: s_waitcnt +; AUTO-NOT: s_waitcnt ; GCN: s_barrier define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { entry: