AMDGPU: Partially fix implicit.buffer.ptr intrinsic handling

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 26 Jun 2017 03:01:31 +0000 (03:01 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 26 Jun 2017 03:01:31 +0000 (03:01 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 26 Jun 2017 03:01:31 +0000 (03:01 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 26 Jun 2017 03:01:31 +0000 (03:01 +0000)
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp

index b1bd14e421f024587bec6a9919fe93e2ce872f11..479a321d305692131ec0552dfa10df008c0a7598 100644 (file)
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -284,7 +284,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
      MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  
    unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
-  if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
+  if (ST.isAmdCodeObjectV2(MF)) {
      PreloadedPrivateBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
    }
@@ -363,14 +363,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
      // Use relocations to get the pointer, and setup the other bits manually.
      uint64_t Rsrc23 = TII->getScratchRsrcWords23();
  
-    if (MFI->hasPrivateMemoryInputPtr()) {
+    if (MFI->hasImplicitBufferPtr()) {
        unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
  
        if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
          const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
  
          BuildMI(MBB, I, DL, Mov64, Rsrc01)
-          .addReg(PreloadedPrivateBufferReg)
+          .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
        } else {
          const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
@@ -385,7 +385,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                             MachineMemOperand::MODereferenceable,
                                             0, 0);
          BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
-          .addReg(PreloadedPrivateBufferReg)
+          .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // glc
            .addMemOperand(MMO)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 000eaaf8975c1f8fb2903d72202b30b41fd4b223..81dfbe1a502c1370abf0c23f896a8bcfd8660944 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1068,10 +1068,10 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
                                   MachineFunction &MF,
                                   const SIRegisterInfo &TRI,
                                   SIMachineFunctionInfo &Info) {
-  if (Info.hasPrivateMemoryInputPtr()) {
-    unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI);
-    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass);
-    CCInfo.AllocateReg(PrivateMemoryPtrReg);
+  if (Info.hasImplicitBufferPtr()) {
+    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
+    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(ImplicitBufferPtrReg);
    }
  
    // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -3005,7 +3005,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_implicit_buffer_ptr: {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+    if (getSubtarget()->isAmdCodeObjectV2(MF))
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    unsigned Reg = TRI->getPreloadedValue(MF,
+                                          SIRegisterInfo::IMPLICIT_BUFFER_PTR);
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
    }
    case Intrinsic::amdgcn_dispatch_ptr:
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

index 18b197ddb7ae7c04aaf76e5982cb413bafc014ea..71f07bad3478ec2f9672d1957d51031d581b3a9a 100644 (file)
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -74,7 +74,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
      WorkItemIDX(false),
      WorkItemIDY(false),
      WorkItemIDZ(false),
-    PrivateMemoryInputPtr(false) {
+    ImplicitBufferPtr(false) {
    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    const Function *F = MF.getFunction();
    FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
@@ -150,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
        DispatchID = true;
    } else if (ST.isMesaGfxShader(MF)) {
      if (HasStackObjects || MaySpill)
-      PrivateMemoryInputPtr = true;
+      ImplicitBufferPtr = true;
    }
  
    // We don't need to worry about accessing spills with flat instructions.
@@ -203,11 +203,11 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
    return FlatScratchInitUserSGPR;
  }
  
-unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
-  PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+  ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
    NumUserSGPRs += 2;
-  return PrivateMemoryPtrUserSGPR;
+  return ImplicitBufferPtrUserSGPR;
  }
  
  /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h

index 9fdb8caac6f2132a1bc6e64a3dd73c3745c28b92..05aa249584bf13a2b87ba9dec4b7e39407e47caf 100644 (file)
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -97,7 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
    unsigned StackPtrOffsetReg;
  
    // Input registers for non-HSA ABI
-  unsigned PrivateMemoryPtrUserSGPR;
+  unsigned ImplicitBufferPtrUserSGPR;
  
    // Input registers setup for the HSA ABI.
    // User SGPRs in allocation order.
@@ -179,7 +179,7 @@ private:
    // Private memory buffer
    // Compute directly in sgpr[0:1]
    // Other shaders indirect 64-bits at sgpr[0:1]
-  bool PrivateMemoryInputPtr : 1;
+  bool ImplicitBufferPtr : 1;
  
    MCPhysReg getNextUserSGPR() const {
      assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
@@ -236,7 +236,7 @@ public:
    unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
    unsigned addDispatchID(const SIRegisterInfo &TRI);
    unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
-  unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
+  unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI);
  
    // Add system SGPRs.
    unsigned addWorkGroupIDX() {
@@ -341,8 +341,8 @@ public:
      return WorkItemIDZ;
    }
  
-  bool hasPrivateMemoryInputPtr() const {
-    return PrivateMemoryInputPtr;
+  bool hasImplicitBufferPtr() const {
+    return ImplicitBufferPtr;
    }
  
    unsigned getNumUserSGPRs() const {
@@ -396,8 +396,8 @@ public:
      return QueuePtrUserSGPR;
    }
  
-  unsigned getPrivateMemoryPtrUserSGPR() const {
-    return PrivateMemoryPtrUserSGPR;
+  unsigned getImplicitBufferPtrUserSGPR() const {
+    return ImplicitBufferPtrUserSGPR;
    }
  
    bool hasSpilledSGPRs() const {
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp

index 9018e3882d96cdf28f27de21ca18d66b5a7d864c..ef6ad4ad0c8f3ffda4f64404558ea8fc8f456106 100644 (file)
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1352,12 +1352,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
    case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
      return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
    case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-    if (ST.isAmdCodeObjectV2(MF)) {
-      assert(MFI->hasPrivateSegmentBuffer());
-      return MFI->PrivateSegmentBufferUserSGPR;
-    }
-    assert(MFI->hasPrivateMemoryInputPtr());
-    return MFI->PrivateMemoryPtrUserSGPR;
+    assert(MFI->hasPrivateSegmentBuffer());
+    return MFI->PrivateSegmentBufferUserSGPR;
+  case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
+    assert(MFI->hasImplicitBufferPtr());
+    return MFI->ImplicitBufferPtrUserSGPR;
    case SIRegisterInfo::KERNARG_SEGMENT_PTR:
      assert(MFI->hasKernargSegmentPtr());
      return MFI->KernargSegmentPtrUserSGPR;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h

index 8fed6d5f9710f84ba2d41bed2af95bc10a961935..600cc886cb5950023780bad16c14ec39e6933ab9 100644 (file)
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -197,12 +197,13 @@ public:
      WORKGROUP_ID_Y      = 11,
      WORKGROUP_ID_Z      = 12,
      PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+    IMPLICIT_BUFFER_PTR = 15,
  
      // VGPRS:
-    FIRST_VGPR_VALUE    = 15,
+    FIRST_VGPR_VALUE    = 16,
      WORKITEM_ID_X       = FIRST_VGPR_VALUE,
-    WORKITEM_ID_Y       = 16,
-    WORKITEM_ID_Z       = 17
+    WORKITEM_ID_Y       = 17,
+    WORKITEM_ID_Z       = 18
    };
  
    /// \brief Returns the physical register that \p Value is stored in.
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll

new file mode 100644 (file)

index 0000000..437ce7f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
@@ -0,0 +1,24 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target
+define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 {
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target
+define void @test_func(i32 addrspace(1)* %out) #1 {
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind  }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll

new file mode 100644 (file)

index 0000000..dda91bc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Requires stack object to not assert
+; GCN-LABEL: {{^}}test_ps:
+; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4
+; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: ; return
+define amdgpu_ps i32 @test_ps() #1 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  ret i32 %value
+}
+
+; GCN-LABEL: {{^}}test_cs:
+; GCN: s_mov_b64 s[4:5], s[0:1]
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4
+; GCN: s_load_dword s0, s[0:1], 0x0
+define amdgpu_cs i32 @test_cs() #1 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  ret i32 %value
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 26 Jun 2017 03:01:31 +0000 (03:01 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 26 Jun 2017 03:01:31 +0000 (03:01 +0000)
lib/Target/AMDGPU/SIFrameLowering.cpp		patch | blob | history
lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
lib/Target/AMDGPU/SIMachineFunctionInfo.cpp		patch | blob | history
lib/Target/AMDGPU/SIMachineFunctionInfo.h		patch | blob | history
lib/Target/AMDGPU/SIRegisterInfo.cpp		patch | blob | history
lib/Target/AMDGPU/SIRegisterInfo.h		patch | blob | history
test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll	[new file with mode: 0644]	patch | blob
test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll	[new file with mode: 0644]	patch | blob