From: Yaxun Liu
Date: Tue, 10 Oct 2017 19:39:48 +0000 (+0000)
Subject: [AMDGPU] Lower enqueued blocks and generate runtime metadata
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=091c043b907bfb1a8756210f99b96d6fba7403fc;p=llvm

[AMDGPU] Lower enqueued blocks and generate runtime metadata

This patch adds a post-linking pass which replaces the function pointer of an
enqueued block kernel with a global variable (a runtime handle) and adds the
"runtime-handle" attribute to the enqueued block kernel.

In LLVM CodeGen the "runtime-handle" attribute is translated to RuntimeHandle
metadata in the code object. The runtime allocates a global buffer for each
kernel with RuntimeHandle metadata and saves the kernel address required for
the AQL packet into that buffer. The __enqueue_kernel function in the device
library knows that the invoke function pointer in the block literal is actually
a runtime handle; it loads the kernel address from the handle and puts it into
the AQL packet for dispatching.

This cannot be done in the FE since the FE cannot create a unique global
variable with external linkage across LLVM modules. A global variable with
internal linkage does not work either, since optimization passes will try to
replace loads of the global variable with its initialization value.

Differential Revision: https://reviews.llvm.org/D38610

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315352 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index ddcf40d30b5..12e97e97f93 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -930,6 +930,16 @@ non-AMD key names should be prefixed by "*vendor-name*.".
                                                Corresponds to the OpenCL
                                                ``vec_type_hint`` attribute.
+
+     "RuntimeHandle"     string                 The external symbol name
+                                                associated with a kernel.
+                                                OpenCL runtime allocates a
+                                                global buffer for the symbol
+                                                and saves the kernel's address
+                                                to it, which is used for
+                                                device side enqueueing. Only
+                                                available for device side
+                                                enqueued kernels.
     =================== ============== ========= ==============================
 
 ..
diff --git a/include/llvm/Support/AMDGPUCodeObjectMetadata.h b/include/llvm/Support/AMDGPUCodeObjectMetadata.h
index d274c5ee918..5f94a950b88 100644
--- a/include/llvm/Support/AMDGPUCodeObjectMetadata.h
+++ b/include/llvm/Support/AMDGPUCodeObjectMetadata.h
@@ -115,6 +115,8 @@ constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
 constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
 /// \brief Key for Kernel::Attr::Metadata::mVecTypeHint.
 constexpr char VecTypeHint[] = "VecTypeHint";
+/// \brief Key for Kernel::Attr::Metadata::mRuntimeHandle.
+constexpr char RuntimeHandle[] = "RuntimeHandle";
 } // end namespace Key
 
 /// \brief In-memory representation of kernel attributes metadata.
@@ -125,15 +127,17 @@ struct Metadata final {
   std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
   /// \brief 'vec_type_hint' attribute. Optional.
   std::string mVecTypeHint = std::string();
+  /// \brief External symbol created by runtime to store the kernel address
+  /// for enqueued blocks.
+  std::string mRuntimeHandle = std::string();
 
   /// \brief Default constructor.
   Metadata() = default;
 
   /// \returns True if kernel attributes metadata is empty, false otherwise.
   bool empty() const {
-    return mReqdWorkGroupSize.empty() &&
-           mWorkGroupSizeHint.empty() &&
-           mVecTypeHint.empty();
+    return mReqdWorkGroupSize.empty() && mWorkGroupSizeHint.empty() &&
+           mVecTypeHint.empty() && mRuntimeHandle.empty();
   }
 
   /// \returns True if kernel attributes metadata is not empty, false otherwise.
diff --git a/lib/Support/AMDGPUCodeObjectMetadata.cpp b/lib/Support/AMDGPUCodeObjectMetadata.cpp
index 863093ab7de..1872a003058 100644
--- a/lib/Support/AMDGPUCodeObjectMetadata.cpp
+++ b/lib/Support/AMDGPUCodeObjectMetadata.cpp
@@ -96,6 +96,8 @@ struct MappingTraits {
                     MD.mWorkGroupSizeHint, std::vector<uint32_t>());
     YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint, MD.mVecTypeHint,
                     std::string());
+    YIO.mapOptional(Kernel::Attrs::Key::RuntimeHandle, MD.mRuntimeHandle,
+                    std::string());
   }
 };
 
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 6bf8cdcb849..e8f7476dd76 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -186,6 +186,10 @@ void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 Pass *createAMDGPUFunctionInliningPass();
 void initializeAMDGPUInlinerPass(PassRegistry&);
 
+ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
+void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
+extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
+
 Target &getTheAMDGPUTarget();
 Target &getTheGCNTarget();
 
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
new file mode 100644
index 00000000000..68a204fca23
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -0,0 +1,98 @@
+//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This post-linking pass replaces the function pointer of an enqueued
+// block kernel with a global variable (runtime handle) and adds the
+// "runtime-handle" attribute to the enqueued block kernel.
+//
+// In LLVM CodeGen the "runtime-handle" attribute is translated to
+// RuntimeHandle metadata in the code object. The runtime allocates a global
+// buffer for each kernel with RuntimeHandle metadata and saves the kernel
+// address required for the AQL packet into the buffer. The __enqueue_kernel
+// function in the device library knows that the invoke function pointer in
+// the block literal is actually a runtime handle, loads the kernel address
+// from it, and puts it into the AQL packet for dispatching.
+//
+// This cannot be done in the FE since the FE cannot create a unique global
+// variable with external linkage across LLVM modules. A global variable with
+// internal linkage does not work since optimization passes will try to replace
+// loads of the global variable with its initialization value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Lower enqueued blocks.
+class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
+public:
+  static char ID;
+
+  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
+
+private:
+  bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
+
+char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
+    AMDGPUOpenCLEnqueuedBlockLowering::ID;
+
+INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
+                "Lower OpenCL enqueued blocks", false, false)
+
+ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
+  return new AMDGPUOpenCLEnqueuedBlockLowering();
+}
+
+bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+  auto &C = M.getContext();
+  auto AS = AMDGPU::getAMDGPUAS(M);
+  bool Changed = false;
+  for (auto &F : M.functions()) {
+    if (F.hasFnAttribute("enqueued-block")) {
+      if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
+          !isa<ConstantExpr>(*F.user_begin()) ||
+          !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
+        continue;
+      }
+      auto *BitCast = cast<ConstantExpr>(*F.user_begin());
+      auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
+      auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+      auto *GV = new GlobalVariable(
+          M, Type::getInt8Ty(C)->getPointerTo(AS.GLOBAL_ADDRESS),
+          /*IsConstant=*/true, GlobalValue::ExternalLinkage,
+          /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
+          GlobalValue::NotThreadLocal, AS.GLOBAL_ADDRESS,
+          /*IsExternallyInitialized=*/true);
+      DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+      auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
+      AddrCast->replaceAllUsesWith(NewPtr);
+      F.addFnAttr("runtime-handle", RuntimeHandle);
+      F.setLinkage(GlobalValue::ExternalLinkage);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8a6b5aeaebc..2fdb012243a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
+  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -610,6 +611,9 @@ void AMDGPUPassConfig::addIRPasses() {
   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
 
+  // Replace OpenCL enqueued block function pointers with global variables.
+  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 450835f414a..baefbd3ae05 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUOpenCLImageTypeLoweringPass.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegAsmNames.inc.cpp
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
index 4e828a791e0..4a576ca5c0b 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -244,6 +244,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
         cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
         mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
   }
+  if (Func.hasFnAttribute("runtime-handle")) {
+    Attrs.mRuntimeHandle =
+        Func.getFnAttribute("runtime-handle").getValueAsString().str();
+  }
 }
 
 void MetadataStreamer::emitKernelArgs(const Function &Func) {
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
index 37fd08242fb..ae557875959 100644
--- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
@@ -14,6 +14,8 @@
 %struct.B = type { i32 addrspace(1)*}
 %opencl.clk_event_t = type opaque
 
+@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
 ; CHECK:  Printf:
@@ -1197,6 +1199,44 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
   ret void
 }
 
+; CHECK:      - Name:            __test_block_invoke_kernel
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:     RuntimeHandle: __test_block_invoke_kernel_runtime_handle
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          25
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      __block_literal
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @__test_block_invoke_kernel(
+    <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #1
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
+    !kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+attributes #1 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+
 !llvm.printf.fmts = !{!100, !101}
 
 !1 = !{i32 0}
@@ -1250,13 +1290,14 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
 !94 = !{!"", !"", !"", !"", !"", !"", !""}
 
 !100 = !{!"1:1:4:%d\5Cn"}
!101 = !{!"2:1:8:%g\5Cn"} +!110 = !{!"__block_literal"} ; NOTES: Displaying notes found at file offset 0x{{[0-9]+}} ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00008b0a Unknown note type: (0x0000000a) -; GFX800: AMD 0x00008e6e Unknown note type: (0x0000000a) -; GFX900: AMD 0x00008b0a Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008f64 Unknown note type: (0x0000000a) +; GFX800: AMD 0x000092e4 Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008f64 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/enqueue-kernel.ll b/test/CodeGen/AMDGPU/enqueue-kernel.ll new file mode 100644 index 00000000000..b1b83c2b4a1 --- /dev/null +++ b/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -0,0 +1,92 @@ +; RUN: opt -amdgpu-lower-enqueued-block -S < %s | FileCheck %s + +; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn-amdhsa-amd-opencl" + +%struct.ndrange_t = type { i32 } +%opencl.queue_t = type opaque + +define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +entry: + %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 + %tmp = alloca %struct.ndrange_t, align 4 + %block2 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8 + %tmp3 = alloca %struct.ndrange_t, align 4 + %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 0 + store i32 25, i32* %block.size, align 8 + %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 1 + store i32 8, i32* %block.align, align 4 + %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 8 + %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 3 + store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured, align 8 + %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 4 + store i8 %b, i8* %block.captured1, align 8 + %tmp1 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block to void ()* + %tmp2 = bitcast void ()* %tmp1 to i8* + %tmp4 = addrspacecast i8* %tmp2 to i8 addrspace(4)* + %tmp5 = call i32 
@__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp, i8 addrspace(4)* nonnull %tmp4) #2 + %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 0 + store i32 41, i32* %block.size4, align 8 + %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 1 + store i32 8, i32* %block.align5, align 4 + %block.invoke6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke6, align 8 + %block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 3 + store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured7, align 8 + %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 6 + store i8 %b, i8* %block.captured8, align 8 + %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 4 + store i64 addrspace(1)* %c, i64 addrspace(1)** %block.captured9, align 8 + %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 5 + store i64 %d, i64* %block.captured10, align 8 + %tmp6 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2 to void ()* + %tmp7 = bitcast void ()* %tmp6 to i8* + %tmp8 = addrspacecast i8* %tmp7 to i8 addrspace(4)* + %tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp3, i8 addrspace(4)* nonnull %tmp8) #2 + ret void +} + +; CHECK: define amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]] +define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0 + !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 3 + %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 4 + store i8 %.fca.4.extract, i8 addrspace(1)* %.fca.3.extract, align 1 + ret void +} + +declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t*, i8 addrspace(4)*) local_unnamed_addr + +; CHECK: define amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]] +define internal 
amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, + i64 addrspace(1)*, i64, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 + !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3 + %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4 + %.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5 + %.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 6 + store i8 %.fca.6.extract, i8 addrspace(1)* %.fca.3.extract, align 1 + store i64 %.fca.5.extract, i64 addrspace(1)* %.fca.4.extract, align 8 + ret void +} + +; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle" +; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle" + +attributes #0 = { "enqueued-block" } + +!3 = !{i32 1, i32 0, i32 1, i32 0} +!4 = !{!"none", !"none", !"none", !"none"} +!5 = !{!"char*", !"char", !"long*", !"long"} +!6 = !{!"", !"", !"", !""} +!14 = !{i32 0} +!15 = !{!"none"} +!16 = !{!"__block_literal"} +!17 = !{!""}
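
In short, running opt -amdgpu-lower-enqueued-block turns an internal enqueued block kernel into something like the following. This is a condensed, illustrative sketch of the CHECK patterns in test/CodeGen/AMDGPU/enqueue-kernel.ll above; the kernel body, block-capture arguments, and kernel metadata are elided, and the names follow the test:

; Runtime handle introduced by the pass; the OpenCL runtime initializes it
; with the kernel address used to build the AQL dispatch packet.
@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*

; The enqueued block kernel is given external linkage and tagged with the
; matching "runtime-handle" attribute. The block literal's invoke pointer is
; rewritten to a pointer cast of the handle instead of the function address.
define amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0 {
entry:
  ret void
}

attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }

The code object metadata streamer then picks up the "runtime-handle" attribute and emits the RuntimeHandle entry shown in code-object-metadata-from-llvm-ir-full.ll.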