From: Matt Arsenault Date: Thu, 23 May 2019 19:38:14 +0000 (+0000) Subject: AMDGPU: Correct maximum possible private allocation size X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=32a10ab5dd5d76c56e4900278bb2411b47474096;p=llvm AMDGPU: Correct maximum possible private allocation size We were assuming a much larger possible per-wave visible stack allocation than is possible: https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/faa3ae51388517353afcdaf9c16621f879ef0a59/src/core/runtime/amd_gpu_agent.cpp#L70 Based on this, we can assume the high 15 bits of a frame index or sret are 0. The frame index value is the per-lane offset, so the maximum frame index value is MAX_WAVE_SCRATCH / wavesize. Remove the corresponding subtarget feature and option that made this configurable. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@361541 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 341ef73a21c..9938eeaa528 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -458,13 +458,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; -def FeatureEnableHugePrivateBuffer : SubtargetFeature< - "huge-private-buffer", - "EnableHugePrivateBuffer", - "true", - "Enable private/scratch buffer sizes greater than 128 GB" ->; - def FeatureDumpCode : SubtargetFeature <"DumpCode", "DumpCode", "true", diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index a88218f68b5..09b806bd06a 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -190,7 +190,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, EnableCuMode(false), TrapHandler(false), - EnableHugePrivateBuffer(false), EnableLoadStoreOpt(false), 
EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1ef72622980..34166aacf41 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -299,7 +299,6 @@ protected: bool TrapHandler; // Used as options. - bool EnableHugePrivateBuffer; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; @@ -377,6 +376,9 @@ private: SITargetLowering TLInfo; SIFrameLowering FrameLowering; + // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. + static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); + public: GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM); @@ -436,6 +438,11 @@ public: return Log2_32(WavefrontSize); } + /// Return the number of high bits known to be zero for a frame index. + unsigned getKnownHighZeroBitsForFrameIndex() const { + return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); + } + int getLDSBankCount() const { return LDSBankCount; } @@ -526,10 +533,6 @@ public: return isAmdHsaOS() ? 
TrapHandlerAbiHsa : TrapHandlerAbiNone; } - bool enableHugePrivateBuffer() const { - return EnableHugePrivateBuffer; - } - bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index c4c0e4047fc..c2cda5ef4d7 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -93,12 +93,6 @@ static cl::opt EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); -static cl::opt AssumeFrameIndexHighZeroBits( - "amdgpu-frame-index-zero-bits", - cl::desc("High bits of frame index assumed to be zero"), - cl::init(5), - cl::ReallyHidden); - static cl::opt DisableLoopAlignment( "amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), @@ -2059,13 +2053,14 @@ SDValue SITargetLowering::LowerFormalArguments( Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) { + if (Arg.Flags.isSRet()) { // The return object should be reasonably addressable. // FIXME: This helps when the return is a real sret. If it is a // automatically inserted sret (i.e. CanLowerReturn returns false), an // extra copy is inserted in SelectionDAGBuilder which obscures this. 
- unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; + unsigned NumBits + = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); } @@ -9970,14 +9965,10 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, DAG, Depth); - if (getSubtarget()->enableHugePrivateBuffer()) - return; - - // Technically it may be possible to have a dispatch with a single workitem - // that uses the full private memory size, but that's not really useful. We - // can't use vaddr in MUBUF instructions if we don't know the address + // Set the high bits to zero based on the maximum allowed scratch size per + // wave. We can't use vaddr in MUBUF instructions if we don't know the address // calculation won't overflow, so assume the sign bit is never set. - Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); + Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll index 28521af83e0..92a255ceae6 100644 --- a/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -60,7 +60,7 @@ define void @func_add_constant_to_fi_i32() #0 { ; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GCN-NEXT: v_mul_lo_u32 v0, v0, 9 +; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_i32() #0 { @@ -172,7 +172,7 @@ ret: ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]] -; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9 +; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] define 
void @func_other_fi_user_non_inline_imm_offset_i32() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) @@ -196,7 +196,7 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { ; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]] ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]] -; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9 +; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) diff --git a/test/CodeGen/AMDGPU/function-returns.ll b/test/CodeGen/AMDGPU/function-returns.ll index f9631e615c9..8e73ee3c1f1 100644 --- a/test/CodeGen/AMDGPU/function-returns.ll +++ b/test/CodeGen/AMDGPU/function-returns.ll @@ -570,4 +570,24 @@ define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 { ret { <3 x float>, i32 } %insert.4 } +; GCN-LABEL: {{^}}void_func_sret_max_known_zero_bits: +; GCN: v_lshrrev_b32_e32 [[LSHR16:v[0-9]+]], 16, v0 +; GCN: ds_write_b32 {{v[0-9]+}}, [[LSHR16]] + +; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0 +; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] +; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] +define void @void_func_sret_max_known_zero_bits(i8 addrspace(5)* sret %arg0) #0 { + %arg0.int = ptrtoint i8 addrspace(5)* %arg0 to i32 + + %lshr0 = lshr i32 %arg0.int, 16 + %lshr1 = lshr i32 %arg0.int, 17 + %lshr2 = lshr i32 %arg0.int, 18 + + store volatile i32 %lshr0, i32 addrspace(3)* undef + store volatile i32 %lshr1, i32 addrspace(3)* undef + store volatile i32 %lshr2, i32 addrspace(3)* undef + ret void +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/huge-private-buffer.ll b/test/CodeGen/AMDGPU/huge-private-buffer.ll index 8e54dcbd169..dfd75235f80 100644 --- a/test/CodeGen/AMDGPU/huge-private-buffer.ll +++ b/test/CodeGen/AMDGPU/huge-private-buffer.ll @@ -1,31 +1,42 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck 
-enable-var-scope -check-prefix=GCN %s -; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small: +; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16: +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 +; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] +define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %toint = ptrtoint i32 addrspace(5)* %alloca to i32 + %masked = and i32 %toint, 65535 + store volatile i32 %masked, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 ; GCN-NOT: [[FI]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] -define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 { +define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca %toint = ptrtoint i32 addrspace(5)* %alloca to i32 - %masked = and i32 %toint, 2147483647 + %masked = and i32 %toint, 131071 store volatile i32 %masked, i32 addrspace(1)* undef ret void } -; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_huge: +; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18: ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4 -; GCN-DAG: buffer_store_dword -; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]] -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]] -define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 { +; GCN-NOT: [[FI]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]] +define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca %toint = ptrtoint i32 addrspace(5)* %alloca to i32 - %masked = and i32 %toint, 2147483647 + %masked = and i32 %toint, 262143 store volatile i32 %masked, i32 addrspace(1)* 
undef ret void } attributes #0 = { nounwind } -attributes #1 = { nounwind "target-features"="+huge-private-buffer" }