From: Matt Arsenault Date: Mon, 26 Jun 2017 03:01:31 +0000 (+0000) Subject: AMDGPU: Partially fix implicit.buffer.ptr intrinsic handling X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ec6175c524ca3c409a950902ba51125b23586865;p=llvm AMDGPU: Partially fix implicit.buffer.ptr intrinsic handling This should not be treated as a different version of private_segment_buffer. These are distinct things with different uses and register classes, and requires the function argument info to have more context about the function's type and environment. Also add missing test coverage for the intrinsic, and emit an error for HSA. This also encovers that the intrinsic is broken unless there happen to be stack objects. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306264 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index b1bd14e421f..479a321d305 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + if (ST.isAmdCodeObjectV2(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } @@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasPrivateMemoryInputPtr()) { + if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); @@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineMemOperand::MODereferenceable, 0, 0); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc .addMemOperand(MMO) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 000eaaf8975..81dfbe1a502 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1068,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass); - CCInfo.AllocateReg(PrivateMemoryPtrReg); + if (Info.hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? @@ -3005,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (getSubtarget()->isAmdCodeObjectV2(MF)) + return emitNonHSAIntrinsicError(DAG, DL, VT); + + unsigned Reg = TRI->getPreloadedValue(MF, + SIRegisterInfo::IMPLICIT_BUFFER_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } case Intrinsic::amdgcn_dispatch_ptr: diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 18b197ddb7a..71f07bad347 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - PrivateMemoryInputPtr(false) { + ImplicitBufferPtr(false) { const SISubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); @@ -150,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) - PrivateMemoryInputPtr = true; + ImplicitBufferPtr = true; } // We don't need to worry about accessing spills with flat instructions. @@ -203,11 +203,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { return FlatScratchInitUserSGPR; } -unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { - PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( +unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { + ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); NumUserSGPRs += 2; - return PrivateMemoryPtrUserSGPR; + return ImplicitBufferPtrUserSGPR; } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9fdb8caac6f..05aa249584b 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned StackPtrOffsetReg; // Input registers for non-HSA ABI - unsigned PrivateMemoryPtrUserSGPR; + unsigned ImplicitBufferPtrUserSGPR; // Input registers setup for the HSA ABI. // User SGPRs in allocation order. @@ -179,7 +179,7 @@ private: // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] - bool PrivateMemoryInputPtr : 1; + bool ImplicitBufferPtr : 1; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -236,7 +236,7 @@ public: unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); + unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -341,8 +341,8 @@ public: return WorkItemIDZ; } - bool hasPrivateMemoryInputPtr() const { - return PrivateMemoryInputPtr; + bool hasImplicitBufferPtr() const { + return ImplicitBufferPtr; } unsigned getNumUserSGPRs() const { @@ -396,8 +396,8 @@ public: return QueuePtrUserSGPR; } - unsigned getPrivateMemoryPtrUserSGPR() const { - return PrivateMemoryPtrUserSGPR; + unsigned getImplicitBufferPtrUserSGPR() const { + return ImplicitBufferPtrUserSGPR; } bool hasSpilledSGPRs() const { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9018e3882d9..ef6ad4ad0c8 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1352,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - if (ST.isAmdCodeObjectV2(MF)) { - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - } - assert(MFI->hasPrivateMemoryInputPtr()); - return MFI->PrivateMemoryPtrUserSGPR; + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::IMPLICIT_BUFFER_PTR: + assert(MFI->hasImplicitBufferPtr()); + return MFI->ImplicitBufferPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 8fed6d5f971..600cc886cb5 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -197,12 +197,13 @@ public: WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, // VGPRS: - FIRST_VGPR_VALUE = 15, + FIRST_VGPR_VALUE = 16, WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 16, - WORKITEM_ID_Z = 17 + WORKITEM_ID_Y = 17, + WORKITEM_ID_Z = 18 }; /// \brief Returns the physical register that \p Value is stored in. diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll new file mode 100644 index 00000000000..437ce7f373d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll @@ -0,0 +1,24 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target +define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 { + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load i32, i32 addrspace(2)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target +define void @test_func(i32 addrspace(1)* %out) #1 { + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load i32, i32 addrspace(2)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll new file mode 100644 index 00000000000..dda91bcfbeb --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Requires stack object to not assert +; GCN-LABEL: {{^}}test_ps: +; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4 +; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: ; return +define amdgpu_ps i32 @test_ps() #1 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %buffer_ptr + ret i32 %value +} + +; GCN-LABEL: {{^}}test_cs: +; GCN: s_mov_b64 s[4:5], s[0:1] +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4 +; GCN: s_load_dword s0, s[0:1], 0x0 +define amdgpu_cs i32 @test_cs() #1 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %buffer_ptr + ret i32 %value +} + +declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind }