From: Yaxun Liu
Date: Tue, 10 Oct 2017 19:39:48 +0000 (+0000)
Subject: [AMDGPU] Lower enqueued blocks and generate runtime metadata
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=091c043b907bfb1a8756210f99b96d6fba7403fc;p=llvm

[AMDGPU] Lower enqueued blocks and generate runtime metadata

This patch adds a post-linking pass which replaces the function pointer of an
enqueued block kernel with a global variable (a runtime handle) and adds the
"runtime-handle" attribute to the enqueued block kernel.

In LLVM CodeGen the "runtime-handle" attribute is translated to RuntimeHandle
metadata in the code object. The runtime allocates a global buffer for each
kernel with RuntimeHandle metadata and saves the kernel address required for
the AQL packet into that buffer. The __enqueue_kernel function in the device
library knows that the invoke function pointer in the block literal is actually
a runtime handle; it loads the kernel address from the handle and puts it into
the AQL packet for dispatching.

This cannot be done in the FE since the FE cannot create a unique global
variable with external linkage across LLVM modules. A global variable with
internal linkage does not work either, since optimization passes will try to
replace loads of the global variable with its initialization value.

Differential Revision: https://reviews.llvm.org/D38610

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315352 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index ddcf40d30b5..12e97e97f93 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -930,6 +930,16 @@ non-AMD key names should be prefixed by "*vendor-name*.".
                                                Corresponds to the OpenCL
                                                ``vec_type_hint`` attribute.
+
+     "RuntimeHandle"     string                 The external symbol name
+                                                associated with a kernel.
+                                                OpenCL runtime allocates a
+                                                global buffer for the symbol
+                                                and saves the kernel's address
+                                                to it, which is used for
+                                                device side enqueueing. Only
+                                                available for device side
+                                                enqueued kernels.
     =================== ============== ========= ==============================
 
 ..
diff --git a/include/llvm/Support/AMDGPUCodeObjectMetadata.h b/include/llvm/Support/AMDGPUCodeObjectMetadata.h
index d274c5ee918..5f94a950b88 100644
--- a/include/llvm/Support/AMDGPUCodeObjectMetadata.h
+++ b/include/llvm/Support/AMDGPUCodeObjectMetadata.h
@@ -115,6 +115,8 @@ constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
 constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
 /// \brief Key for Kernel::Attr::Metadata::mVecTypeHint.
 constexpr char VecTypeHint[] = "VecTypeHint";
+/// \brief Key for Kernel::Attr::Metadata::mRuntimeHandle.
+constexpr char RuntimeHandle[] = "RuntimeHandle";
 } // end namespace Key
 
 /// \brief In-memory representation of kernel attributes metadata.
@@ -125,15 +127,17 @@ struct Metadata final {
   std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
   /// \brief 'vec_type_hint' attribute. Optional.
   std::string mVecTypeHint = std::string();
+  /// \brief External symbol created by runtime to store the kernel address
+  /// for enqueued blocks.
+  std::string mRuntimeHandle = std::string();
 
   /// \brief Default constructor.
   Metadata() = default;
 
   /// \returns True if kernel attributes metadata is empty, false otherwise.
   bool empty() const {
-    return mReqdWorkGroupSize.empty() &&
-           mWorkGroupSizeHint.empty() &&
-           mVecTypeHint.empty();
+    return mReqdWorkGroupSize.empty() && mWorkGroupSizeHint.empty() &&
+           mVecTypeHint.empty() && mRuntimeHandle.empty();
   }
 
   /// \returns True if kernel attributes metadata is not empty, false otherwise.
diff --git a/lib/Support/AMDGPUCodeObjectMetadata.cpp b/lib/Support/AMDGPUCodeObjectMetadata.cpp
index 863093ab7de..1872a003058 100644
--- a/lib/Support/AMDGPUCodeObjectMetadata.cpp
+++ b/lib/Support/AMDGPUCodeObjectMetadata.cpp
@@ -96,6 +96,8 @@ struct MappingTraits {
                     MD.mWorkGroupSizeHint, std::vector<uint32_t>());
     YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint, MD.mVecTypeHint,
                     std::string());
+    YIO.mapOptional(Kernel::Attrs::Key::RuntimeHandle, MD.mRuntimeHandle,
+                    std::string());
   }
 };
 
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 6bf8cdcb849..e8f7476dd76 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -186,6 +186,10 @@ void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 Pass *createAMDGPUFunctionInliningPass();
 void initializeAMDGPUInlinerPass(PassRegistry&);
 
+ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
+void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
+extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
+
 Target &getTheAMDGPUTarget();
 Target &getTheGCNTarget();
 
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
new file mode 100644
index 00000000000..68a204fca23
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -0,0 +1,98 @@
+//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This post-linking pass replaces the function pointer of an enqueued
+// block kernel with a global variable (runtime handle) and adds the
+// "runtime-handle" attribute to the enqueued block kernel.
+//
+// In LLVM CodeGen the "runtime-handle" attribute is translated to
+// RuntimeHandle metadata in the code object. The runtime allocates a global
+// buffer for each kernel with RuntimeHandle metadata and saves the kernel
+// address required for the AQL packet into the buffer. The __enqueue_kernel
+// function in the device library knows that the invoke function pointer in
+// the block literal is actually a runtime handle, loads the kernel address
+// from it, and puts it into the AQL packet for dispatching.
+//
+// This cannot be done in the FE since the FE cannot create a unique global
+// variable with external linkage across LLVM modules. A global variable with
+// internal linkage does not work since optimization passes will try to replace
+// loads of the global variable with its initialization value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Lower enqueued blocks.
+class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
+public:
+  static char ID;
+
+  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
+
+private:
+  bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
+
+char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
+    AMDGPUOpenCLEnqueuedBlockLowering::ID;
+
+INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
+                "Lower OpenCL enqueued blocks", false, false)
+
+ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
+  return new AMDGPUOpenCLEnqueuedBlockLowering();
+}
+
+bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+  auto &C = M.getContext();
+  auto AS = AMDGPU::getAMDGPUAS(M);
+  bool Changed = false;
+  for (auto &F : M.functions()) {
+    if (F.hasFnAttribute("enqueued-block")) {
+      if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
+          !isa<ConstantExpr>(*F.user_begin()) ||
+          !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
+        continue;
+      }
+      auto *BitCast = cast<ConstantExpr>(*F.user_begin());
+      auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
+      auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+      auto *GV = new GlobalVariable(
+          M, Type::getInt8Ty(C)->getPointerTo(AS.GLOBAL_ADDRESS),
+          /*IsConstant=*/true, GlobalValue::ExternalLinkage,
+          /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
+          GlobalValue::NotThreadLocal, AS.GLOBAL_ADDRESS,
+          /*IsExternallyInitialized=*/true);
+      DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+      auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
+      AddrCast->replaceAllUsesWith(NewPtr);
+      F.addFnAttr("runtime-handle", RuntimeHandle);
+      F.setLinkage(GlobalValue::ExternalLinkage);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8a6b5aeaebc..2fdb012243a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
+  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -610,6 +611,9 @@ void AMDGPUPassConfig::addIRPasses() {
   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
 
+  // Replace OpenCL enqueued block function pointers with global variables.
+  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 450835f414a..baefbd3ae05 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUOpenCLImageTypeLoweringPass.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegAsmNames.inc.cpp
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
index 4e828a791e0..4a576ca5c0b 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -244,6 +244,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
         cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
         mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
   }
+  if (Func.hasFnAttribute("runtime-handle")) {
+    Attrs.mRuntimeHandle =
+        Func.getFnAttribute("runtime-handle").getValueAsString().str();
+  }
 }
 
 void MetadataStreamer::emitKernelArgs(const Function &Func) {
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
index 37fd08242fb..ae557875959 100644
--- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
@@ -14,6 +14,8 @@
 %struct.B = type { i32 addrspace(1)*}
 %opencl.clk_event_t = type opaque
 
+@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+
 ; CHECK: ---
 ; CHECK:  Version: [ 1, 0 ]
 ; CHECK:  Printf:
@@ -1197,6 +1199,44 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
   ret void
 }
 
+; CHECK:      - Name:            __test_block_invoke_kernel
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:     RuntimeHandle: __test_block_invoke_kernel_runtime_handle
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          25
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      __block_literal
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @__test_block_invoke_kernel(
+    <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #1
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
+    !kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+attributes #1 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+
 !llvm.printf.fmts = !{!100, !101}
 
 !1 = !{i32 0}
@@ -1250,13 +1290,14 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
 !94 = !{!"", !"", !"", !"", !"", !"", !""}
 
 !100 = !{!"1:1:4:%d\5Cn"}
!101 = !{!"2:1:8:%g\5Cn"} +!110 = !{!"__block_literal"} ; NOTES: Displaying notes found at file offset 0x{{[0-9]+}} ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00008b0a Unknown note type: (0x0000000a) -; GFX800: AMD 0x00008e6e Unknown note type: (0x0000000a) -; GFX900: AMD 0x00008b0a Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008f64 Unknown note type: (0x0000000a) +; GFX800: AMD 0x000092e4 Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008f64 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/enqueue-kernel.ll b/test/CodeGen/AMDGPU/enqueue-kernel.ll new file mode 100644 index 00000000000..b1b83c2b4a1 --- /dev/null +++ b/test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -0,0 +1,92 @@ +; RUN: opt -amdgpu-lower-enqueued-block -S < %s | FileCheck %s + +; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn-amdhsa-amd-opencl" + +%struct.ndrange_t = type { i32 } +%opencl.queue_t = type opaque + +define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr + !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +entry: + %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 + %tmp = alloca %struct.ndrange_t, align 4 + %block2 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8 + %tmp3 = alloca %struct.ndrange_t, align 4 + %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 0 + store i32 25, i32* %block.size, align 8 + %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 1 + store i32 8, i32* %block.align, align 4 + %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 8 + %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 3 + store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured, align 8 + %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 4 + store i8 %b, i8* %block.captured1, align 8 + %tmp1 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block to void ()* + %tmp2 = bitcast void ()* %tmp1 to i8* + %tmp4 = addrspacecast i8* %tmp2 to i8 addrspace(4)* + %tmp5 = call i32 
@__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp, i8 addrspace(4)* nonnull %tmp4) #2 + %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 0 + store i32 41, i32* %block.size4, align 8 + %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 1 + store i32 8, i32* %block.align5, align 4 + %block.invoke6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke6, align 8 + %block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 3 + store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured7, align 8 + %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 6 + store i8 %b, i8* %block.captured8, align 8 + %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 4 + store i64 addrspace(1)* %c, i64 addrspace(1)** %block.captured9, align 8 + %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 5 + store i64 %d, i64* %block.captured10, align 8 + %tmp6 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2 to void ()* + %tmp7 = bitcast void ()* %tmp6 to i8* + %tmp8 = addrspacecast i8* %tmp7 to i8 addrspace(4)* + %tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp3, i8 addrspace(4)* nonnull %tmp8) #2 + ret void +} + +; CHECK: define amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]] +define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0 + !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 3 + %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 4 + store i8 %.fca.4.extract, i8 addrspace(1)* %.fca.3.extract, align 1 + ret void +} + +declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t*, i8 addrspace(4)*) local_unnamed_addr + +; CHECK: define amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]] +define internal 
amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, + i64 addrspace(1)*, i64, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 + !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3 + %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4 + %.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5 + %.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 6 + store i8 %.fca.6.extract, i8 addrspace(1)* %.fca.3.extract, align 1 + store i64 %.fca.5.extract, i64 addrspace(1)* %.fca.4.extract, align 8 + ret void +} + +; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle" +; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle" + +attributes #0 = { "enqueued-block" } + +!3 = !{i32 1, i32 0, i32 1, i32 0} +!4 = !{!"none", !"none", !"none", !"none"} +!5 = !{!"char*", !"char", !"long*", !"long"} +!6 = !{!"", !"", !"", !""} +!14 = !{i32 0} +!15 = !{!"none"} +!16 = !{!"__block_literal"} +!17 = !{!""}
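
In short, running opt -amdgpu-lower-enqueued-block turns an internal enqueued block kernel into something like the following. This is a condensed, illustrative sketch of the CHECK patterns in test/CodeGen/AMDGPU/enqueue-kernel.ll above; the kernel body, block-capture arguments, and kernel metadata are elided, and the names follow the test:

; Runtime handle introduced by the pass; the OpenCL runtime initializes it
; with the kernel address used to build the AQL dispatch packet.
@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*

; The enqueued block kernel is given external linkage and tagged with the
; matching "runtime-handle" attribute. The block literal's invoke pointer is
; rewritten to a pointer cast of the handle instead of the function address.
define amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0 {
entry:
  ret void
}

attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }

The code object metadata streamer then picks up the "runtime-handle" attribute and emits the RuntimeHandle entry shown in code-object-metadata-from-llvm-ir-full.ll.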