[AMDGPU] Lower enqueued blocks and generate runtime metadata

author Yaxun Liu <Yaxun.Liu@amd.com>

Tue, 10 Oct 2017 19:39:48 +0000 (19:39 +0000)

committer Yaxun Liu <Yaxun.Liu@amd.com>

Tue, 10 Oct 2017 19:39:48 +0000 (19:39 +0000)
author Yaxun Liu <Yaxun.Liu@amd.com>
Tue, 10 Oct 2017 19:39:48 +0000 (19:39 +0000)
committer Yaxun Liu <Yaxun.Liu@amd.com>
Tue, 10 Oct 2017 19:39:48 +0000 (19:39 +0000)
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst

index ddcf40d30b5d8a3e29b1efb75e338badf31e77ff..12e97e97f9332432e2dacaaf544c7bd4683d4849 100644 (file)
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -930,6 +930,16 @@ non-AMD key names should be prefixed by "*vendor-name*.".
  
                                                    Corresponds to the OpenCL
                                                    ``vec_type_hint`` attribute.
+
+     "RuntimeHandle"     string                   The external symbol name
+                                                  associated with a kernel.
+                                                  OpenCL runtime allocates a
+                                                  global buffer for the symbol
+                                                  and saves the kernel's address
+                                                  to it, which is used for
+                                                  device side enqueueing. Only
+                                                  available for device side
+                                                  enqueued kernels.
       =================== ============== ========= ==============================
  
  ..
diff --git a/include/llvm/Support/AMDGPUCodeObjectMetadata.h b/include/llvm/Support/AMDGPUCodeObjectMetadata.h

index d274c5ee9184269aae881a9c7f78f3ff4851793b..5f94a950b88f136ea55ba995992fbbb7060187d4 100644 (file)
--- a/include/llvm/Support/AMDGPUCodeObjectMetadata.h
+++ b/include/llvm/Support/AMDGPUCodeObjectMetadata.h
@@ -115,6 +115,8 @@ constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
  constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
  /// \brief Key for Kernel::Attr::Metadata::mVecTypeHint.
  constexpr char VecTypeHint[] = "VecTypeHint";
+/// \brief Key for Kernel::Attr::Metadata::mRuntimeHandle.
+constexpr char RuntimeHandle[] = "RuntimeHandle";
  } // end namespace Key
  
  /// \brief In-memory representation of kernel attributes metadata.
@@ -125,15 +127,17 @@ struct Metadata final {
    std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
    /// \brief 'vec_type_hint' attribute. Optional.
    std::string mVecTypeHint = std::string();
+  /// \brief External symbol created by runtime to store the kernel address
+  /// for enqueued blocks.
+  std::string mRuntimeHandle = std::string();
  
    /// \brief Default constructor.
    Metadata() = default;
  
    /// \returns True if kernel attributes metadata is empty, false otherwise.
    bool empty() const {
-    return mReqdWorkGroupSize.empty() &&
-           mWorkGroupSizeHint.empty() &&
-           mVecTypeHint.empty();
+    return mReqdWorkGroupSize.empty() && mWorkGroupSizeHint.empty() &&
+           mVecTypeHint.empty() && mRuntimeHandle.empty();
    }
  
    /// \returns True if kernel attributes metadata is not empty, false otherwise.
diff --git a/lib/Support/AMDGPUCodeObjectMetadata.cpp b/lib/Support/AMDGPUCodeObjectMetadata.cpp

index 863093ab7def763e7ea8412a21b4c6e3da5d9059..1872a00305835dd7e50d74eba3a821fd2a7d9713 100644 (file)
--- a/lib/Support/AMDGPUCodeObjectMetadata.cpp
+++ b/lib/Support/AMDGPUCodeObjectMetadata.cpp
@@ -96,6 +96,8 @@ struct MappingTraits<Kernel::Attrs::Metadata> {
                      MD.mWorkGroupSizeHint, std::vector<uint32_t>());
      YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint,
                      MD.mVecTypeHint, std::string());
+    YIO.mapOptional(Kernel::Attrs::Key::RuntimeHandle, MD.mRuntimeHandle,
+                    std::string());
    }
  };
  
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h

index 6bf8cdcb849a2a36f068faf33896cea1e7cc31bf..e8f7476dd76ef45bc07073d1b7fe4dc80c8fc4b6 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -186,6 +186,10 @@ void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
  Pass *createAMDGPUFunctionInliningPass();
  void initializeAMDGPUInlinerPass(PassRegistry&);
  
+ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
+void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
+extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
+
  Target &getTheAMDGPUTarget();
  Target &getTheGCNTarget();
  
diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp

new file mode 100644 (file)

index 0000000..68a204f
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -0,0 +1,98 @@
+//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This post-linking pass replaces the function pointer of enqueued
+/// block kernel with a global variable (runtime handle) and adds
+/// "runtime-handle" attribute to the enqueued block kernel.
+//
+// In LLVM CodeGen the runtime-handle attribute will be translated to
+// RuntimeHandle metadata in code object. Runtime allocates a global buffer
+// for each kernel with RuntimeHandle metadata and saves the kernel address
+// required for the AQL packet into the buffer. __enqueue_kernel function
+// in device library knows that the invoke function pointer in the block
+// literal is actually a runtime handle, loads the kernel address from it,
+// and puts it into the AQL packet for dispatching.
+//
+// This cannot be done in FE since FE cannot create a unique global variable
+// with external linkage across LLVM modules. The global variable with internal
+// linkage does not work since optimization passes will try to replace loads
+// of the global variable with its initialization value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Lower enqueued blocks.
+class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
+public:
+  static char ID;
+
+  explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
+
+private:
+  bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
+
+char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
+    AMDGPUOpenCLEnqueuedBlockLowering::ID;
+
+INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
+                "Lower OpenCL enqueued blocks", false, false)
+
+ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
+  return new AMDGPUOpenCLEnqueuedBlockLowering();
+}
+
+bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
+  auto &C = M.getContext();
+  auto AS = AMDGPU::getAMDGPUAS(M);
+  bool Changed = false;
+  for (auto &F : M.functions()) {
+    if (F.hasFnAttribute("enqueued-block")) {
+      if (!F.hasOneUse() || !F.user_begin()->hasOneUse() ||
+          !isa<ConstantExpr>(*F.user_begin()) ||
+          !isa<ConstantExpr>(*F.user_begin()->user_begin())) {
+        continue;
+      }
+      auto *BitCast = cast<ConstantExpr>(*F.user_begin());
+      auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin());
+      auto RuntimeHandle = (F.getName() + "_runtime_handle").str();
+      auto *GV = new GlobalVariable(
+          M, Type::getInt8Ty(C)->getPointerTo(AS.GLOBAL_ADDRESS),
+          /*IsConstant=*/true, GlobalValue::ExternalLinkage,
+          /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
+          GlobalValue::NotThreadLocal, AS.GLOBAL_ADDRESS,
+          /*IsExternallyInitialized=*/true);
+      DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
+      auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType());
+      AddrCast->replaceAllUsesWith(NewPtr);
+      F.addFnAttr("runtime-handle", RuntimeHandle);
+      F.setLinkage(GlobalValue::ExternalLinkage);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 8a6b5aeaebc68902894cb957ff1578471167cb96..2fdb012243a883fd0ad3f9480f804f5850b81c2b 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
    initializeAMDGPUAnnotateUniformValuesPass(*PR);
    initializeAMDGPUArgumentUsageInfoPass(*PR);
    initializeAMDGPULowerIntrinsicsPass(*PR);
+  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
    initializeAMDGPUPromoteAllocaPass(*PR);
    initializeAMDGPUCodeGenPreparePass(*PR);
    initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -610,6 +611,9 @@ void AMDGPUPassConfig::addIRPasses() {
    // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
    addPass(createAMDGPUOpenCLImageTypeLoweringPass());
  
+  // Replace OpenCL enqueued block function pointers with global variables.
+  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+
    if (TM.getOptLevel() > CodeGenOpt::None) {
      addPass(createInferAddressSpacesPass());
      addPass(createAMDGPUPromoteAlloca());
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt

index 450835f414ac2bdf55f9bbc20393f9a3693fa9b2..baefbd3ae057cc00b8536946eeed7bea331b979f 100644 (file)
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_target(AMDGPUCodeGen
    AMDGPUMachineModuleInfo.cpp
    AMDGPUMacroFusion.cpp
    AMDGPUMCInstLower.cpp
+  AMDGPUOpenCLEnqueuedBlockLowering.cpp
    AMDGPUOpenCLImageTypeLoweringPass.cpp
    AMDGPUPromoteAlloca.cpp
    AMDGPURegAsmNames.inc.cpp
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp

index 4e828a791e09f74912106a34f784b5d64940b5e5..4a576ca5c0b83737e071dcbd41e9034e19be4f82 100644 (file)
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -244,6 +244,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) {
          cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
          mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
    }
+  if (Func.hasFnAttribute("runtime-handle")) {
+    Attrs.mRuntimeHandle =
+        Func.getFnAttribute("runtime-handle").getValueAsString().str();
+  }
  }
  
  void MetadataStreamer::emitKernelArgs(const Function &Func) {
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll

index 37fd08242fbaa305fef6e1dd20a9a277c5055138..ae557875959d0d8e8394b4b15058998ba86d2db8 100644 (file)
--- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
@@ -14,6 +14,8 @@
  %struct.B = type { i32 addrspace(1)*}
  %opencl.clk_event_t = type opaque
  
+@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+
  ; CHECK: ---
  ; CHECK:  Version: [ 1, 0 ]
  ; CHECK:  Printf:
@@ -1197,6 +1199,44 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
    ret void
  }
  
+; CHECK:      - Name:            __test_block_invoke_kernel
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       RuntimeHandle:       __test_block_invoke_kernel_runtime_handle
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          25
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      __block_literal
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @__test_block_invoke_kernel(
+    <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #1
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
+    !kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+attributes #1 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+
  !llvm.printf.fmts = !{!100, !101}
  
  !1 = !{i32 0}
@@ -1250,13 +1290,14 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
  !94 = !{!"", !"", !"", !"", !"", !"", !""}
  !100 = !{!"1:1:4:%d\5Cn"}
  !101 = !{!"2:1:8:%g\5Cn"}
+!110 = !{!"__block_literal"}
  
  ; NOTES: Displaying notes found at file offset 0x{{[0-9]+}}
  ; NOTES-NEXT: Owner    Data size    Description
  ; NOTES-NEXT: AMD      0x00000008   Unknown note type: (0x00000001)
  ; NOTES-NEXT: AMD      0x0000001b   Unknown note type: (0x00000003)
-; GFX700:     AMD      0x00008b0a   Unknown note type: (0x0000000a)
-; GFX800:     AMD      0x00008e6e   Unknown note type: (0x0000000a)
-; GFX900:     AMD      0x00008b0a   Unknown note type: (0x0000000a)
+; GFX700:     AMD      0x00008f64   Unknown note type: (0x0000000a)
+; GFX800:     AMD      0x000092e4   Unknown note type: (0x0000000a)
+; GFX900:     AMD      0x00008f64   Unknown note type: (0x0000000a)
  
  ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/enqueue-kernel.ll b/test/CodeGen/AMDGPU/enqueue-kernel.ll

new file mode 100644 (file)

index 0000000..b1b83c2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/enqueue-kernel.ll
@@ -0,0 +1,92 @@
+; RUN: opt -amdgpu-lower-enqueued-block -S < %s | FileCheck %s
+
+; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)*
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn-amdhsa-amd-opencl"
+
+%struct.ndrange_t = type { i32 }
+%opencl.queue_t = type opaque
+
+define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
+  !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+entry:
+  %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
+  %tmp = alloca %struct.ndrange_t, align 4
+  %block2 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8
+  %tmp3 = alloca %struct.ndrange_t, align 4
+  %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 0
+  store i32 25, i32* %block.size, align 8
+  %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 1
+  store i32 8, i32* %block.align, align 4
+  %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 2
+  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 8
+  %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 3
+  store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured, align 8
+  %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 4
+  store i8 %b, i8* %block.captured1, align 8
+  %tmp1 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block to void ()*
+  %tmp2 = bitcast void ()* %tmp1 to i8*
+  %tmp4 = addrspacecast i8* %tmp2 to i8 addrspace(4)*
+  %tmp5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp, i8 addrspace(4)* nonnull %tmp4) #2
+  %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 0
+  store i32 41, i32* %block.size4, align 8
+  %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 1
+  store i32 8, i32* %block.align5, align 4
+  %block.invoke6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 2
+  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke6, align 8
+  %block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 3
+  store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured7, align 8
+  %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 6
+  store i8 %b, i8* %block.captured8, align 8
+  %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 4
+  store i64 addrspace(1)* %c, i64 addrspace(1)** %block.captured9, align 8
+  %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 5
+  store i64 %d, i64* %block.captured10, align 8
+  %tmp6 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2 to void ()*
+  %tmp7 = bitcast void ()* %tmp6 to i8*
+  %tmp8 = addrspacecast i8* %tmp7 to i8 addrspace(4)*
+  %tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp3, i8 addrspace(4)* nonnull %tmp8) #2
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]]
+define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #0
+  !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 3
+  %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 4
+  store i8 %.fca.4.extract, i8 addrspace(1)* %.fca.3.extract, align 1
+  ret void
+}
+
+declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t*, i8 addrspace(4)*) local_unnamed_addr
+
+; CHECK: define amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]]
+define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*,
+  i64 addrspace(1)*, i64, i8 }> %arg) #0 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15
+  !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3
+  %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4
+  %.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5
+  %.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 6
+  store i8 %.fca.6.extract, i8 addrspace(1)* %.fca.3.extract, align 1
+  store i64 %.fca.5.extract, i64 addrspace(1)* %.fca.4.extract, align 8
+  ret void
+}
+
+; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
+; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"
+
+attributes #0 = { "enqueued-block" }
+
+!3 = !{i32 1, i32 0, i32 1, i32 0}
+!4 = !{!"none", !"none", !"none", !"none"}
+!5 = !{!"char*", !"char", !"long*", !"long"}
+!6 = !{!"", !"", !"", !""}
+!14 = !{i32 0}
+!15 = !{!"none"}
+!16 = !{!"__block_literal"}
+!17 = !{!""}
author	Yaxun Liu <Yaxun.Liu@amd.com>
	Tue, 10 Oct 2017 19:39:48 +0000 (19:39 +0000)
committer	Yaxun Liu <Yaxun.Liu@amd.com>
	Tue, 10 Oct 2017 19:39:48 +0000 (19:39 +0000)
docs/AMDGPUUsage.rst		patch \| blob \| history
include/llvm/Support/AMDGPUCodeObjectMetadata.h		patch \| blob \| history
lib/Support/AMDGPUCodeObjectMetadata.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPU.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp	[new file with mode: 0644]	patch \| blob
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch \| blob \| history
lib/Target/AMDGPU/CMakeLists.txt		patch \| blob \| history
lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll		patch \| blob \| history
test/CodeGen/AMDGPU/enqueue-kernel.ll	[new file with mode: 0644]	patch \| blob