]> granicus.if.org Git - llvm/commitdiff
[AMDGPU] Emit metadata for hidden arguments for kernel enqueue
authorYaxun Liu <Yaxun.Liu@amd.com>
Mon, 30 Oct 2017 14:30:28 +0000 (14:30 +0000)
committerYaxun Liu <Yaxun.Liu@amd.com>
Mon, 30 Oct 2017 14:30:28 +0000 (14:30 +0000)
Identify kernels which perform device-side kernel enqueues and emit
metadata for the associated hidden kernel arguments. Such kernels are
marked with the calls-enqueue-kernel function attribute by the
AMDGPUOpenCLEnqueuedBlockLowering pass, and later on the
hidden kernel argument metadata HiddenDefaultQueue and
HiddenCompletionAction are emitted for them.

Differential Revision: https://reviews.llvm.org/D39255

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316907 91177308-0d34-0410-b5e6-96231b3b80d8

docs/AMDGPUUsage.rst
lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp
test/CodeGen/AMDGPU/enqueue-kernel.ll
test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll [new file with mode: 0644]
test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll

index c135aec73fc2770cc4e567a7aea1228550573350..7062d75d92ec6e0750d65ad8221f6881764d3092 100644 (file)
@@ -1039,10 +1039,10 @@ non-AMD key names should be prefixed by "*vendor-name*.".
                                                   passed in the kernarg.
 
                                                 "HiddenCompletionAction"
-                                                  *TBD*
-
-                                                  .. TODO
-                                                     Add description.
+                                                  A global address space pointer
+                                                  to help link enqueued kernels into
+                                                  the ancestor tree for determining
+                                                  when the parent kernel has finished.
 
      "ValueType"       string         Required  Kernel argument value type. Only
                                                 present if "ValueKind" is
index 68a204fca23e31f5852da8bb1301d1892154a9dd..04e2829c76b67c235fd1f0eae8ffd1ed9b83f6f4 100644 (file)
 // linkage does not work since optimization passes will try to replace loads
 // of the global variable with its initialization value.
 //
+// It also identifies the kernels that directly or indirectly enqueue kernels
+// and adds the "calls-enqueue-kernel" function attribute to them, which will
+// be used to determine whether to emit runtime metadata for the kernel
+// enqueue related hidden kernel arguments.
+//
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -66,7 +74,22 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
   return new AMDGPUOpenCLEnqueuedBlockLowering();
 }
 
+/// Collect direct or indirect callers of \p F and save them
+/// to \p Callers.
+static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
+  for (auto U : F->users()) {
+    if (auto *CI = dyn_cast<CallInst>(&*U)) {
+      auto *Caller = CI->getParent()->getParent();
+      if (Callers.count(Caller))
+        continue;
+      Callers.insert(Caller);
+      collectCallers(Caller, Callers);
+    }
+  }
+}
+
 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+  DenseSet<Function *> Callers;
   auto &C = M.getContext();
   auto AS = AMDGPU::getAMDGPUAS(M);
   bool Changed = false;
@@ -91,8 +114,23 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
       AddrCast->replaceAllUsesWith(NewPtr);
       F.addFnAttr("runtime-handle", RuntimeHandle);
       F.setLinkage(GlobalValue::ExternalLinkage);
+
+      // Collect direct or indirect callers of enqueue_kernel.
+      for (auto U : NewPtr->users()) {
+        if (auto *I = dyn_cast<Instruction>(&*U)) {
+          auto *F = I->getParent()->getParent();
+          Callers.insert(F);
+          collectCallers(F, Callers);
+        }
+      }
       Changed = true;
     }
   }
+
+  for (auto F : Callers) {
+    if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+    F->addFnAttr("calls-enqueue-kernel");
+  }
   return Changed;
 }
index dacf5d37aa1ebf3958f2b85aabd88b2497a714f7..5a6dfb28b5059744815ef9071e90b6bd8d3a4983 100644 (file)
@@ -266,12 +266,21 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
   emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
   emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
 
-  if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
-    return;
-
   auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
                                       AMDGPUASI.GLOBAL_ADDRESS);
-  emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+  auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
+  if (CallsPrintf)
+    emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+  if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+    if (!CallsPrintf) {
+      // Emit a dummy argument so that the remaining hidden arguments
+      // have a fixed position relative to the first hidden argument.
+      // This is to facilitate library code to access hidden arguments.
+      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+    }
+    emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
+    emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+  }
 }
 
 void MetadataStreamer::emitKernelArg(const Argument &Arg) {
index b1b83c2b4a10155073b6f7cb737c7ca095213b3a..a54453541ded5abf2aaa074ca5152f615150aa71 100644 (file)
@@ -9,7 +9,21 @@ target triple = "amdgcn-amdhsa-amd-opencl"
 %struct.ndrange_t = type { i32 }
 %opencl.queue_t = type opaque
 
-define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space
+define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+  !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]]
+define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+  !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+  call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d)
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]]
+define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
   !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
 entry:
   %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
@@ -77,6 +91,7 @@ entry:
   ret void
 }
 
+; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" }
 ; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
 ; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"
 
diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll
new file mode 100644 (file)
index 0000000..c5121a7
--- /dev/null
@@ -0,0 +1,96 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK: ---
+; CHECK:  Version: [ 1, 0 ]
+; CHECK-NOT:  Printf:
+; CHECK:  Kernels:
+
+; CHECK:      - Name:            test_non_enqueue_kernel_caller
+; CHECK-NEXT:   SymbolName:      'test_non_enqueue_kernel_caller@kd'
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - TypeName:      char
+; CHECK-NEXT:       Size:          1
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NOT:        ValueKind:     HiddenNone
+; CHECK-NOT:        ValueKind:     HiddenDefaultQueue
+; CHECK-NOT:        ValueKind:     HiddenCompletionAction
+define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_enqueue_kernel_caller
+; CHECK-NEXT:   SymbolName:      'test_enqueue_kernel_caller@kd'
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - TypeName:      char
+; CHECK-NEXT:       Size:          1
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenNone
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenDefaultQueue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenCompletionAction
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+attributes #0 = { "calls-enqueue-kernel" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
index 4ac9bacebe1c3f244b3073e2c9a28e3d8d572138..ea47f83aef3e5d5dc72c28e43aef96dc00f6b188 100644 (file)
@@ -51,6 +51,8 @@
 ; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
 ; CHECK-NEXT:       ValueType:     I8
 ; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NOT:        ValueKind:     HiddenDefaultQueue
+; CHECK-NOT:        ValueKind:     HiddenCompletionAction
 define amdgpu_kernel void @test_char(i8 %a)
     !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
     !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
@@ -1267,7 +1269,52 @@ define amdgpu_kernel void @__test_block_invoke_kernel(
   ret void
 }
 
+; CHECK:      - Name:            test_enqueue_kernel_caller
+; CHECK-NEXT:   SymbolName:      'test_enqueue_kernel_caller@kd'
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - TypeName:      char
+; CHECK-NEXT:       Size:          1
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenDefaultQueue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenCompletionAction
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+    !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+  ret void
+}
+
 attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+attributes #1 = { "calls-enqueue-kernel" }
 
 !llvm.printf.fmts = !{!100, !101}