From: Yaxun Liu Date: Mon, 30 Oct 2017 14:30:28 +0000 (+0000) Subject: [AMDGPU] Emit metadata for hidden arguments for kernel enqueue X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a52756b2c94907e82ca13ff510f0b107753e2fde;p=llvm [AMDGPU] Emit metadata for hidden arguments for kernel enqueue Identifies kernels which performs device side kernel enqueues and emit metadata for the associated hidden kernel arguments. Such kernels are marked with calls-enqueue-kernel function attribute by AMDGPUOpenCLEnqueueKernelLowering pass and later on hidden kernel arguments metadata HiddenDefaultQueue and HiddenCompletionAction are emitted for them. Differential Revision: https://reviews.llvm.org/D39255 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316907 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst index c135aec73fc..7062d75d92e 100644 --- a/docs/AMDGPUUsage.rst +++ b/docs/AMDGPUUsage.rst @@ -1039,10 +1039,10 @@ non-AMD key names should be prefixed by "*vendor-name*.". passed in the kernarg. "HiddenCompletionAction" - *TBD* - - .. TODO - Add description. + A global address space pointer + to help link enqueued kernels into + the ancestor tree for determining + when the parent kernel has finished. "ValueType" string Required Kernel argument value type. Only present if "ValueKind" is diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 68a204fca23..04e2829c76b 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -25,12 +25,20 @@ // linkage does not work since optimization passes will try to replace loads // of the global variable with its initialization value. // +// It also identifies the kernels directly or indirectly enqueues kernels +// and adds "calls-enqueue-kernel" function attribute to them, which will +// be used to determine whether to emit runtime metadata for the kernel +// enqueue related hidden kernel arguments. +// //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/User.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -66,7 +74,22 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { return new AMDGPUOpenCLEnqueuedBlockLowering(); } +/// Collect direct or indrect callers of \p F and save them +/// to \p Callers. +static void collectCallers(Function *F, DenseSet &Callers) { + for (auto U : F->users()) { + if (auto *CI = dyn_cast(&*U)) { + auto *Caller = CI->getParent()->getParent(); + if (Callers.count(Caller)) + continue; + Callers.insert(Caller); + collectCallers(Caller, Callers); + } + } +} + bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { + DenseSet Callers; auto &C = M.getContext(); auto AS = AMDGPU::getAMDGPUAS(M); bool Changed = false; @@ -91,8 +114,23 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { AddrCast->replaceAllUsesWith(NewPtr); F.addFnAttr("runtime-handle", RuntimeHandle); F.setLinkage(GlobalValue::ExternalLinkage); + + // Collect direct or indirect callers of enqueue_kernel. + for (auto U : NewPtr->users()) { + if (auto *I = dyn_cast(&*U)) { + auto *F = I->getParent()->getParent(); + Callers.insert(F); + collectCallers(F, Callers); + } + } Changed = true; } } + + for (auto F : Callers) { + if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) + continue; + F->addFnAttr("calls-enqueue-kernel"); + } return Changed; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp index dacf5d37aa1..5a6dfb28b50 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp @@ -266,12 +266,21 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) { emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); - if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - return; - auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUASI.GLOBAL_ADDRESS); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts"); + if (CallsPrintf) + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!CallsPrintf) { + // Emit a dummy argument so that the remaining hidden arguments + // have a fixed position relative to the first hidden argument. + // This is to facilitate library code to access hidden arguments. + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + } } void MetadataStreamer::emitKernelArg(const Argument &Arg) { diff --git a/test/CodeGen/AMDGPU/enqueue-kernel.ll b/test/CodeGen/AMDGPU/enqueue-kernel.ll index b1b83c2b4a1..a54453541de 100644 --- a/test/CodeGen/AMDGPU/enqueue-kernel.ll +++ b/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -9,7 +9,21 @@ target triple = "amdgcn-amdhsa-amd-opencl" %struct.ndrange_t = type { i32 } %opencl.queue_t = type opaque -define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr +; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space +define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { + ret void +} + +; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]] +define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { + call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) + ret void +} + +; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]] +define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { entry: %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 @@ -77,6 +91,7 @@ entry: ret void } +; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" } ; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle" ; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle" diff --git a/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll new file mode 100644 index 00000000000..c5121a7fd3b --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll @@ -0,0 +1,96 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] +; CHECK-NOT: Printf: +; CHECK: Kernels: + +; CHECK: - Name: test_non_enqueue_kernel_caller +; CHECK-NEXT: SymbolName: 'test_non_enqueue_kernel_caller@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NOT: ValueKind: HiddenNone +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a) + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_enqueue_kernel_caller +; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenDefaultQueue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenCompletionAction +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 + !kernel_arg_base_type !3 !kernel_arg_type_qual !4 { + ret void +} + +attributes #0 = { "calls-enqueue-kernel" } + +!1 = !{i32 0} +!2 = !{!"none"} +!3 = !{!"char"} +!4 = !{!""} + +!opencl.ocl.version = !{!90} +!90 = !{i32 2, i32 0} + + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll index 4ac9bacebe1..ea47f83aef3 100644 --- a/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -51,6 +51,8 @@ ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { @@ -1267,7 +1269,52 @@ define amdgpu_kernel void @__test_block_invoke_kernel( ret void } +; CHECK: - Name: test_enqueue_kernel_caller +; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenDefaultQueue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenCompletionAction +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } +attributes #1 = { "calls-enqueue-kernel" } !llvm.printf.fmts = !{!100, !101}