passed in the kernarg.
"HiddenCompletionAction"
- *TBD*
-
- .. TODO
- Add description.
+ A global address space pointer
+ to help link enqueued kernels into
+ the ancestor tree for determining
+ when the parent kernel has finished.
"ValueType" string Required Kernel argument value type. Only
present if "ValueKind" is
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
+// It also identifies the kernels that directly or indirectly enqueue kernels
+// and adds the "calls-enqueue-kernel" function attribute to them, which is
+// used to determine whether to emit runtime metadata for the kernel
+// enqueue related hidden kernel arguments.
+//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
return new AMDGPUOpenCLEnqueuedBlockLowering();
}
+/// Recursively gather the functions that contain a call instruction using
+/// \p F, together with the transitive callers of those functions, and record
+/// each of them in \p Callers.
+///
+/// Only users of \p F that are CallInst instructions are followed. A function
+/// already present in \p Callers is not visited again, so each function is
+/// expanded at most once.
+static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
+  for (auto U : F->users()) {
+    auto *Call = dyn_cast<CallInst>(&*U);
+    if (!Call)
+      continue;
+    auto *Parent = Call->getParent()->getParent();
+    // insert() reports whether the element was newly added; recurse only the
+    // first time a given caller is seen.
+    if (Callers.insert(Parent).second)
+      collectCallers(Parent, Callers);
+  }
+}
+
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+ DenseSet<Function *> Callers;
auto &C = M.getContext();
auto AS = AMDGPU::getAMDGPUAS(M);
bool Changed = false;
AddrCast->replaceAllUsesWith(NewPtr);
F.addFnAttr("runtime-handle", RuntimeHandle);
F.setLinkage(GlobalValue::ExternalLinkage);
+
+ // Collect direct or indirect callers of enqueue_kernel.
+ for (auto U : NewPtr->users()) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
+ auto *F = I->getParent()->getParent();
+ Callers.insert(F);
+ collectCallers(F, Callers);
+ }
+ }
Changed = true;
}
}
+
+ for (auto F : Callers) {
+ if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ continue;
+ F->addFnAttr("calls-enqueue-kernel");
+ }
return Changed;
}
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
- if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- return;
-
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
AMDGPUASI.GLOBAL_ADDRESS);
- emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
+ if (CallsPrintf)
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+ if (!CallsPrintf) {
+ // Emit a dummy argument so that the remaining hidden arguments
+ // have a fixed position relative to the first hidden argument.
+ // This is to facilitate library code to access hidden arguments.
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+ }
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+ }
}
void MetadataStreamer::emitKernelArg(const Argument &Arg) {
%struct.ndrange_t = type { i32 }
%opencl.queue_t = type opaque
-define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space
+define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+ !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+ ret void ; empty body: nothing is called or enqueued, so no attribute group is expected on this kernel
+}
+
+; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]]
+define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+ !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+ call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) ; only calls @caller; expected to receive the same "calls-enqueue-kernel" attribute group as @caller
+ ret void
+}
+
+; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]]
+define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
ret void
}
+; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" }
; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"
--- /dev/null
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK-NOT: Printf:
+; CHECK: Kernels:
+
+; CHECK: - Name: test_non_enqueue_kernel_caller
+; CHECK-NEXT: SymbolName: 'test_non_enqueue_kernel_caller@kd'
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - TypeName: char
+; CHECK-NEXT: Size: 1
+; CHECK-NEXT: Align: 1
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NOT: ValueKind: HiddenNone
+; CHECK-NOT: ValueKind: HiddenDefaultQueue
+; CHECK-NOT: ValueKind: HiddenCompletionAction
+define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+ ret void ; no attribute group on this kernel: none of the enqueue-related hidden arguments are expected in its metadata
+}
+
+; CHECK: - Name: test_enqueue_kernel_caller
+; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - TypeName: char
+; CHECK-NEXT: Size: 1
+; CHECK-NEXT: Align: 1
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenNone
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenDefaultQueue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenCompletionAction
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+ !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
+ ret void ; #0 marks this kernel "calls-enqueue-kernel": HiddenNone, HiddenDefaultQueue and HiddenCompletionAction are expected after the global offsets
+}
+
+attributes #0 = { "calls-enqueue-kernel" }
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"char"}
+!4 = !{!""}
+
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NOT: ValueKind: HiddenDefaultQueue
+; CHECK-NOT: ValueKind: HiddenCompletionAction
define amdgpu_kernel void @test_char(i8 %a)
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
!kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
ret void ; no attribute group on this kernel: HiddenPrintfBuffer is expected, the enqueue-related hidden arguments are not
}
+; CHECK: - Name: test_enqueue_kernel_caller
+; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
+; CHECK-NEXT: Language: OpenCL C
+; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - TypeName: char
+; CHECK-NEXT: Size: 1
+; CHECK-NEXT: Align: 1
+; CHECK-NEXT: ValueKind: ByValue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AccQual: Default
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
+; CHECK-NEXT: ValueType: I64
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenDefaultQueue
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+; CHECK-NEXT: - Size: 8
+; CHECK-NEXT: Align: 8
+; CHECK-NEXT: ValueKind: HiddenCompletionAction
+; CHECK-NEXT: ValueType: I8
+; CHECK-NEXT: AddrSpaceQual: Global
+define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
+ !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+ !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+ ret void ; #1 marks this kernel "calls-enqueue-kernel": expected hidden args end with HiddenPrintfBuffer, HiddenDefaultQueue, HiddenCompletionAction
+}
+
attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+attributes #1 = { "calls-enqueue-kernel" }
!llvm.printf.fmts = !{!100, !101}