From c6fa88b24e4aa7721a2ba019f6a3730f84008bb7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 14 Sep 2017 17:37:40 +0000 Subject: [PATCH] AMDGPU: Stop modifying SP in call sequences Because stack growth and addressing are done in the same direction, modifying SP at the beginning of the call sequence was incorrect. If we had a stack-passed argument, we would end up skipping that number of bytes before pushing arguments, leaving unused/inconsistent space. The callee creates fixed stack objects in its frame, so the space necessary for these is already logically allocated in the callee; we therefore just let the callee increment SP if it really requires it. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313279 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIISelLowering.cpp | 6 +++--- test/CodeGen/AMDGPU/byval-frame-setup.ll | 14 +++++++------- test/CodeGen/AMDGPU/call-argument-types.ll | 14 +++++++------- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index a5c5ecc694e..99f7badde71 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2153,7 +2153,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) { - Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); unsigned OffsetReg = Info->getScratchWaveOffsetReg(); @@ -2359,8 +2359,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, InFlag = Chain.getValue(1); } - uint64_t CalleePopBytes = 0; - Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32), + uint64_t CalleePopBytes = NumBytes; + Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32), DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), InFlag, DL); if (!Ins.empty()) diff --git a/test/CodeGen/AMDGPU/byval-frame-setup.ll b/test/CodeGen/AMDGPU/byval-frame-setup.ll index bd354682784..398c7d32aee 100644 --- a/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -74,7 +74,6 @@ entry: ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} ; GCN-DAG: v_writelane_b32 -; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}} ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 @@ -86,6 +85,7 @@ entry: ; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 ; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20 +; GCN-NOT: s_add_u32 s32, s32, 0x800 ; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} ; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8 @@ -107,8 +107,9 @@ entry: ; GCN: v_readlane_b32 ; GCN-NOT: v_readlane_b32 s32 -; GCN: s_sub_u32 s32, s32, 0x800{{$}} -; GCN-NEXT: s_sub_u32 s32, s32, 0xc00{{$}} +; GCN-NOT: s_sub_u32 s32, s32, 0x800 + +; GCN: s_sub_u32 s32, s32, 0xc00{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @call_void_func_byval_struct_func() #0 { @@ -138,7 +139,7 @@ entry: ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 ; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 -; GCN-DAG: s_add_u32 
s32, s32, 0x800{{$}} +; GCN-NOT: s_add_u32 s32, s32, 0x800 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 @@ -162,9 +163,8 @@ entry: ; GCN: s_swappc_b64 -; FIXME: Dead SP modfication -; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}} -; GCN-NEXT: s_endpgm +; GCN-NOT: s_sub_u32 s32 +; GCN: s_endpgm define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 { entry: %arg0 = alloca %struct.ByValStruct, align 4 diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll index 740a74a9d40..60c0480eaa7 100644 --- a/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/test/CodeGen/AMDGPU/call-argument-types.ll @@ -385,10 +385,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32: ; HSA-DAG: s_mov_b32 s33, s9 -; HSA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}} +; HSA-NOT: s_add_u32 s32 ; MESA-DAG: s_mov_b32 s33, s3{{$}} -; MESA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}} +; MESA-NOT: s_add_u32 s32 ; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} ; GCN-DAG: buffer_load_dwordx4 v[0:3], off @@ -400,7 +400,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SP_REG]] offset:4{{$}} +; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 ; GCN-NEXT: s_endpgm @@ -447,7 +447,7 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8 ; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12 -; GCN: s_add_u32 [[SP]], [[SP]], 0x200 +; GCN-NOT: s_add_u32 [[SP]], ; HSA: 
buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 @@ -463,7 +463,7 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_sub_u32 [[SP]], [[SP]], 0x200 +; GCN-NOT: [[SP]] define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { %val = alloca { i8, i32 }, align 4 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0 @@ -486,13 +486,13 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 ; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 -; GCN-DAG: s_add_u32 [[SP]], [[SP]], 0x200 +; GCN-NOT: s_add_u32 [[SP]] ; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 ; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8 ; GCN-NEXT: s_swappc_b64 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 -; GCN: s_sub_u32 [[SP]], [[SP]], 0x200 +; GCN-NOT: s_sub_u32 [[SP]] ; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off ; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off diff --git a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 50da6b44625..f2db5733689 100644 --- a/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -425,7 +425,7 @@ define void @too_many_args_use_workitem_id_x_byval( ; GCN: s_mov_b32 s33, s7 ; GCN: s_add_u32 s32, s33, 0x200{{$}} -; GCN-DAG: s_add_u32 s32, s32, 
0x100{{$}} +; GCN-NOT: s32 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 ; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 -- 2.50.0