From: Matt Arsenault Date: Thu, 14 Sep 2017 17:14:57 +0000 (+0000) Subject: AMDGPU: Make frame register caller preserved X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a15bb16493b6dd0ad88e421acfd0f4053102bf01;p=llvm AMDGPU: Make frame register caller preserved Using SplitCSR for the frame register was very broken. Often the copies in the prolog and epilog were optimized out, in addition to them being inserted after the true prolog where the FP was clobbered. I have a hacky solution which works that continues to use split CSR, but for now this is simpler and will get to working programs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313274 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index f0900c8bb3e..8454dede0e1 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -59,16 +59,7 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( const MCPhysReg * SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { - // FIXME - static MCPhysReg Regs[2]; - - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - assert(!MFI->isEntryFunction()); - - Regs[0] = MFI->getFrameOffsetReg(); - Regs[1] = AMDGPU::NoRegister; - - return Regs; + return nullptr; } const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 48faee9bb99..a5c5ecc694e 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2148,6 +2148,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MachineFrameInfo &MFI = MF.getFrameInfo(); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SDValue CallerSavedFP; + // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) { @@ -2164,6 +2166,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ScratchWaveOffsetReg = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); + + if (!Info->isEntryFunction()) { + // Avoid clobbering this function's FP value. In the current convention + // callee will overwrite this, so do save/restore around the call site. + CallerSavedFP = DAG.getCopyFromReg(Chain, DL, + Info->getFrameOffsetReg(), MVT::i32); + } } // Stack pointer relative accesses are done by changing the offset SGPR. This @@ -2344,6 +2353,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = Call.getValue(0); InFlag = Call.getValue(1); + if (CallerSavedFP) { + SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32); + Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag); + InFlag = Chain.getValue(1); + } + uint64_t CalleePopBytes = 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32), DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), diff --git a/test/CodeGen/AMDGPU/call-preserved-registers.ll b/test/CodeGen/AMDGPU/call-preserved-registers.ll index 18122613d43..98a4f132084 100644 --- a/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -32,11 +32,13 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_ ; GCN: v_writelane_b32 v32, s37, 4 ; GCN: s_mov_b32 s33, s5 -; GCN: s_swappc_b64 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN: v_readlane_b32 s37, v32, 4 ; GCN: v_readlane_b32 s36, v32, 3 ; GCN: v_readlane_b32 s35, v32, 2 @@ -50,6 +52,20 @@ define void 
@test_func_call_external_void_func_void_clobber_s30_s31_call_externa ret void } +; FIXME: Avoid extra restore of FP in between calls. +; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: +; GCN: s_mov_b32 s33, s5 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s5, s33 +; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s5, s33 +define void @test_func_call_external_void_funcx2() #0 { + call void @external_void_func_void() + call void @external_void_func_void() + ret void +} + ; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31: ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31] diff --git a/test/CodeGen/AMDGPU/ipra.ll b/test/CodeGen/AMDGPU/ipra.ll index 803855cd032..9615ddd07cd 100644 --- a/test/CodeGen/AMDGPU/ipra.ll +++ b/test/CodeGen/AMDGPU/ipra.ll @@ -90,5 +90,19 @@ define void @func_call_tail_call() #1 { ret void } +define void @void_func_void() noinline { + ret void +} + +; Make sure we don't get save/restore of FP between calls. +; GCN-LABEL: {{^}}test_funcx2: +; GCN-NOT: s5 +; GCN-NOT: s32 +define void @test_funcx2() #0 { + call void @void_func_void() + call void @void_func_void() + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind noinline } diff --git a/test/CodeGen/AMDGPU/sibling-call.ll b/test/CodeGen/AMDGPU/sibling-call.ll index d5b56ace32c..833de07095b 100644 --- a/test/CodeGen/AMDGPU/sibling-call.ll +++ b/test/CodeGen/AMDGPU/sibling-call.ll @@ -13,8 +13,8 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_mov_b32 s5, s32 ; GCN: v_add_i32_e32 v0, vcc, v1, v +; GCN: s_mov_b32 s5, s32 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24 ; GCN: s_waitcnt vmcnt(0) ; GCN: s_setpc_b64