[HIP] Support offloading by linker script

author Yaxun Liu <Yaxun.Liu@amd.com>

Fri, 18 May 2018 15:07:56 +0000 (15:07 +0000)

committer Yaxun Liu <Yaxun.Liu@amd.com>

Fri, 18 May 2018 15:07:56 +0000 (15:07 +0000)
author Yaxun Liu <Yaxun.Liu@amd.com>
Fri, 18 May 2018 15:07:56 +0000 (15:07 +0000)
committer Yaxun Liu <Yaxun.Liu@amd.com>
Fri, 18 May 2018 15:07:56 +0000 (15:07 +0000)
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td

index 0b03974cd5555fe0ee819fa409e6e3a48bd5bfe5..5a2b784c92588560b9a2551f968fa3452b5818a0 100644 (file)
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -586,6 +586,8 @@ def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
  def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
    HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
  def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
+def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
+  Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>;
  def dA : Flag<["-"], "dA">, Group<d_Group>;
  def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
    HelpText<"Print macro definitions in -E mode in addition to normal output">;
diff --git a/lib/CodeGen/CGCUDANV.cpp b/lib/CodeGen/CGCUDANV.cpp

index e9e5483e5ac1e64cb526f09cdd24646ef9e70c4a..4c2e10b65cc810b62c10e95014d8032364c4121e 100644 (file)
--- a/lib/CodeGen/CGCUDANV.cpp
+++ b/lib/CodeGen/CGCUDANV.cpp
@@ -27,6 +27,8 @@ using namespace clang;
  using namespace CodeGen;
  
  namespace {
+constexpr unsigned CudaFatMagic = 0x466243b1;
+constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
  
  class CGNVCUDARuntime : public CGCUDARuntime {
  
@@ -310,19 +312,20 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  /// }
  /// \endcode
  llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
+  bool IsHIP = CGM.getLangOpts().HIP;
    // No need to generate ctors/dtors if there is no GPU binary.
-  std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
-  if (GpuBinaryFileName.empty())
+  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+  if (CudaGpuBinaryFileName.empty() && !IsHIP)
      return nullptr;
  
-  // void __cuda_register_globals(void* handle);
+  // void __{cuda|hip}_register_globals(void* handle);
    llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
    // We always need a function to pass in as callback. Create a dummy
    // implementation if we don't need to register anything.
    if (RelocatableDeviceCode && !RegisterGlobalsFunc)
      RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
  
-  // void ** __cudaRegisterFatBinary(void *);
+  // void ** __{cuda|hip}RegisterFatBinary(void *);
    llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
        addUnderscoredPrefixToName("RegisterFatBinary"));
@@ -334,12 +337,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    // global variable and save a reference in GpuBinaryHandle to be cleaned up
    // in destructor on exit. Then associate all known kernels with the GPU binary
    // handle so CUDA runtime can figure out what to call on the GPU side.
-  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
-      llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
-  if (std::error_code EC = GpuBinaryOrErr.getError()) {
-    CGM.getDiags().Report(diag::err_cannot_open_file)
-        << GpuBinaryFileName << EC.message();
-    return nullptr;
+  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
+  if (!IsHIP) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
+        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
+    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
+      CGM.getDiags().Report(diag::err_cannot_open_file)
+          << CudaGpuBinaryFileName << EC.message();
+      return nullptr;
+    }
+    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
    }
  
    llvm::Function *ModuleCtorFunc = llvm::Function::Create(
@@ -353,28 +360,60 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
    CtorBuilder.SetInsertPoint(CtorEntryBB);
  
    const char *FatbinConstantName;
-  if (RelocatableDeviceCode)
+  const char *FatbinSectionName;
+  const char *ModuleIDSectionName;
+  StringRef ModuleIDPrefix;
+  llvm::Constant *FatBinStr;
+  unsigned FatMagic;
+  if (IsHIP) {
+    FatbinConstantName = ".hip_fatbin";
+    FatbinSectionName = ".hipFatBinSegment";
+
+    ModuleIDSectionName = "__hip_module_id";
+    ModuleIDPrefix = "__hip_";
+
+    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
+    // The external symbol is supposed to contain the fat binary but will be
+    // populated somewhere else, e.g. by lld through link script.
+    FatBinStr = new llvm::GlobalVariable(
+        CGM.getModule(), CGM.Int8Ty,
+        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
+        "__hip_fatbin", nullptr,
+        llvm::GlobalVariable::NotThreadLocal);
+    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+
+    FatMagic = HIPFatMagic;
+  } else {
+    if (RelocatableDeviceCode)
+      // TODO: Figure out how this is called on mac OS!
+      FatbinConstantName = "__nv_relfatbin";
+    else
+      FatbinConstantName =
+          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+    // NVIDIA's cuobjdump looks for fatbins in this section.
+    FatbinSectionName =
+        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
      // TODO: Figure out how this is called on mac OS!
-    FatbinConstantName = "__nv_relfatbin";
-  else
-    FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
-  // NVIDIA's cuobjdump looks for fatbins in this section.
-  const char *FatbinSectionName =
-      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
-  // TODO: Figure out how this is called on mac OS!
-  const char *NVModuleIDSectionName = "__nv_module_id";
+    ModuleIDSectionName = "__nv_module_id";
+    ModuleIDPrefix = "__nv_";
+
+    // For CUDA, create a string literal containing the fat binary loaded from
+    // the given file.
+    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+                                   FatbinConstantName, 8);
+    FatMagic = CudaFatMagic;
+  }
  
    // Create initialized wrapper structure that points to the loaded GPU binary
    ConstantInitBuilder Builder(CGM);
    auto Values = Builder.beginStruct(FatbinWrapperTy);
    // Fatbin wrapper magic.
-  Values.addInt(IntTy, 0x466243b1);
+  Values.addInt(IntTy, FatMagic);
    // Fatbin version.
    Values.addInt(IntTy, 1);
    // Data.
-  Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
-                                FatbinConstantName, 8));
+  Values.add(FatBinStr);
    // Unused in fatbin v1.
    Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
    llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
@@ -382,10 +421,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
        /*constant*/ true);
    FatbinWrapper->setSection(FatbinSectionName);
  
-  // Register binary with CUDA runtime. This is substantially different in
+  // Register binary with CUDA/HIP runtime. This is substantially different in
    // default mode vs. separate compilation!
    if (!RelocatableDeviceCode) {
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+    // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
@@ -397,34 +436,34 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
      CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                     CGM.getPointerAlign());
  
-    // Call __cuda_register_globals(GpuBinaryHandle);
+    // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc)
        CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
    } else {
      // Generate a unique module ID.
-    SmallString<64> NVModuleID;
-    llvm::raw_svector_ostream OS(NVModuleID);
-    OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
-    llvm::Constant *NVModuleIDConstant =
-        makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
-
-    // Create an alias for the FatbinWrapper that nvcc will look for.
+    SmallString<64> ModuleID;
+    llvm::raw_svector_ostream OS(ModuleID);
+    OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
+    llvm::Constant *ModuleIDConstant =
+        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
+
+    // Create an alias for the FatbinWrapper that nvcc or hip backend will
+    // look for.
      llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
-                              Twine("__fatbinwrap") + NVModuleID,
-                              FatbinWrapper);
+                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
  
-    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
      // void *, void (*)(void **))
      SmallString<128> RegisterLinkedBinaryName(
          addUnderscoredPrefixToName("RegisterLinkedBinary"));
-    RegisterLinkedBinaryName += NVModuleID;
+    RegisterLinkedBinaryName += ModuleID;
      llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
          getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
  
      assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
      llvm::Value *Args[] = {RegisterGlobalsFunc,
                             CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
-                           NVModuleIDConstant,
+                           ModuleIDConstant,
                             makeDummyFunction(getCallbackFnTy())};
      CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
    }
diff --git a/lib/Driver/ToolChains/CommonArgs.cpp b/lib/Driver/ToolChains/CommonArgs.cpp

index 9a3ab8d4b2fd725e361a2b07b88ab40a63fe2e23..b7e86cb0436d594214709990255ac6304a45591b 100644 (file)
--- a/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/lib/Driver/ToolChains/CommonArgs.cpp
@@ -146,12 +146,14 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
    Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
  
    for (const auto &II : Inputs) {
-    // If the current tool chain refers to an OpenMP offloading host, we should
-    // ignore inputs that refer to OpenMP offloading devices - they will be
-    // embedded according to a proper linker script.
+    // If the current tool chain refers to an OpenMP or HIP offloading host, we
+    // should ignore inputs that refer to OpenMP or HIP offloading devices -
+    // they will be embedded according to a proper linker script.
      if (auto *IA = II.getAction())
-      if (JA.isHostOffloading(Action::OFK_OpenMP) &&
-          IA->isDeviceOffloading(Action::OFK_OpenMP))
+      if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
+           IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
+          (JA.isHostOffloading(Action::OFK_HIP) &&
+           IA->isDeviceOffloading(Action::OFK_HIP)))
          continue;
  
      if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
@@ -1288,6 +1290,124 @@ void tools::AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
    Lksf << LksBuffer;
  }
  
+/// Add HIP linker script arguments at the end of the argument list so that
+/// the fat binary is built by embedding the device images into the host. The
+/// linker script also defines a symbol required by the code generation so that
+/// the image can be retrieved at runtime. This should be used only in tool
+/// chains that support linker scripts.
+void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+                               const InputInfo &Output,
+                               const InputInfoList &Inputs, const ArgList &Args,
+                               ArgStringList &CmdArgs, const JobAction &JA,
+                               const Tool &T) {
+
+  // If this is not a HIP host toolchain, we don't need to do anything.
+  if (!JA.isHostOffloading(Action::OFK_HIP))
+    return;
+
+  // Create temporary linker script. Keep it if save-temps is enabled.
+  const char *LKS;
+  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
+  if (C.getDriver().isSaveTempsEnabled()) {
+    llvm::sys::path::replace_extension(Name, "lk");
+    LKS = C.getArgs().MakeArgString(Name.c_str());
+  } else {
+    llvm::sys::path::replace_extension(Name, "");
+    Name = C.getDriver().GetTemporaryPath(Name, "lk");
+    LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
+  }
+
+  // Add linker script option to the command.
+  CmdArgs.push_back("-T");
+  CmdArgs.push_back(LKS);
+
+  // Create a buffer to write the contents of the linker script.
+  std::string LksBuffer;
+  llvm::raw_string_ostream LksStream(LksBuffer);
+
+  // Get the HIP offload tool chain.
+  auto *HIPTC = static_cast<const toolchains::CudaToolChain *>(
+      C.getSingleOffloadToolChain<Action::OFK_HIP>());
+  assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
+         "Wrong platform");
+
+  // Construct clang-offload-bundler command to bundle object files for
+  // different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // TODO: Remove the dummy host binary entry (hard-coded triple, /dev/null
+  // input) which is required by clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Is this a device linking action?
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                         StringRef(A->getOffloadingArch()).str();
+      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+    }
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  const char *BundleFile =
+      C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+
+  // Add commands to embed the bundled device binary. The .hip_fatbin section
+  // is 16-byte aligned. This is not mandatory, but increases the likelihood
+  // of the data being aligned with a cache block on several common host
+  // machines.
+  LksStream << "/*\n";
+  LksStream << "       HIP Offload Linker Script\n";
+  LksStream << " *** Automatically generated by Clang ***\n";
+  LksStream << "*/\n";
+  LksStream << "TARGET(binary)\n";
+  LksStream << "INPUT(" << BundleFileName << ")\n";
+  LksStream << "SECTIONS\n";
+  LksStream << "{\n";
+  LksStream << "  .hip_fatbin :\n";
+  LksStream << "  ALIGN(0x10)\n";
+  LksStream << "  {\n";
+  LksStream << "    PROVIDE_HIDDEN(__hip_fatbin = .);\n";
+  LksStream << "    " << BundleFileName << "\n";
+  LksStream << "  }\n";
+  LksStream << "}\n";
+  LksStream << "INSERT BEFORE .data\n";
+  LksStream.flush();
+
+  // Dump the contents of the linker script if the user requested that. We
+  // support this option to enable testing of behavior with -###.
+  if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
+    llvm::errs() << LksBuffer;
+
+  // If this is a dry run, do not create the linker script file.
+  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
+    return;
+
+  // Open script file and write the contents.
+  std::error_code EC;
+  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
+
+  if (EC) {
+    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+    return;
+  }
+
+  Lksf << LksBuffer;
+}
+
  SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
                                           const InputInfo &Output,
                                           const InputInfo &Input,
diff --git a/lib/Driver/ToolChains/CommonArgs.h b/lib/Driver/ToolChains/CommonArgs.h

index 00bb2e4ec47ec66c51138595982241f8e1e8f717..e8ebe2225e1cfc568d1bd5c61907741102eeea11 100644 (file)
--- a/lib/Driver/ToolChains/CommonArgs.h
+++ b/lib/Driver/ToolChains/CommonArgs.h
@@ -52,6 +52,12 @@ void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
                             llvm::opt::ArgStringList &CmdArgs,
                             const JobAction &JA);
  
+void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+                        const InputInfo &Output, const InputInfoList &Inputs,
+                        const llvm::opt::ArgList &Args,
+                        llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
+                        const Tool &T);
+
  const char *SplitDebugName(const llvm::opt::ArgList &Args,
                             const InputInfo &Input);
  
diff --git a/lib/Driver/ToolChains/Gnu.cpp b/lib/Driver/ToolChains/Gnu.cpp

index 56c2a97dc4cb0f1d260a60df0329d26e10ad8447..cc925c702760232aca77a44e91d79d8ec481b139 100644 (file)
--- a/lib/Driver/ToolChains/Gnu.cpp
+++ b/lib/Driver/ToolChains/Gnu.cpp
@@ -535,6 +535,10 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
    // Add OpenMP offloading linker script args if required.
    AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
  
+  // Add HIP offloading linker script args if required.
+  AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
+                     *this);
+
    C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
  }
  
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu

index 8339d872ad95b465d87db5304d283fa9d77378ce..894b7205c305595a7f8991b155ecc9d032b28fb7 100644 (file)
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -1,13 +1,13 @@
  // RUN: echo "GPU binary would be here" > %t
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,CUDA
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
-// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN:   | FileCheck %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,CUDA
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
  // RUN:   | FileCheck %s -check-prefix=NOGPUBIN
  
@@ -16,10 +16,10 @@
  // RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,HIP
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS -x hip \
-// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN:   | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
  // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,HIP
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC
  // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
  // RUN:   | FileCheck %s -check-prefix=NOGPUBIN
  
@@ -64,21 +64,26 @@ void use_pointers() {
  // * constant unnamed string with the kernel name
  // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
  // * constant unnamed string with GPU binary
-// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// NORDC-SAME: section ".nv_fatbin", align 8
-// RDC-SAME: section "__nv_relfatbin", align 8
+// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
+// CUDA: @[[FATBIN:.*]] = private unnamed_addr constant{{.*GPU binary would be here.*}}\00",
+// CUDANORDC-SAME: section ".nv_fatbin", align 8
+// CUDARDC-SAME: section "__nv_relfatbin", align 8
  // * constant struct that wraps GPU binary
-// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant
-// CUDA-SAME: { i32, i32, i8*, i8* }
-// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant
-// HIP-SAME:  { i32, i32, i8*, i8* }
-// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
-// ALL-SAME: section ".nvFatBinSegment"
+// ALL: @__[[PREFIX:cuda|hip]]_fatbin_wrapper = internal constant
+// ALL-SAME: { i32, i32, i8*, i8* }
+// CUDA-SAME: { i32 1180844977, i32 1,
+// HIP-SAME: { i32 1212764230, i32 1,
+// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIP-SAME:  i8* @[[FATBIN]],
+// ALL-SAME: i8* null }
+// CUDA-SAME: section ".nvFatBinSegment"
+// HIP-SAME: section ".hipFatBinSegment"
  // * variable to save GPU binary handle after initialization
  // NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
  // * constant unnamed string with NVModuleID
  // RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
-// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
  // * Make sure our constructor was added to global ctor list.
  // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
  // * In separate mode we also register a destructor.
@@ -136,9 +141,10 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
  // There should be no __[[PREFIX]]_register_globals if we have no
  // device-side globals, but we still need to register GPU binary.
  // Skip GPU binary string first.
-// NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
+// CUDANOGLOBALS: @{{.*}} = private unnamed_addr constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
  // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
-// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor
+// NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
  // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
  // NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals
  // NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor
author	Yaxun Liu <Yaxun.Liu@amd.com>
	Fri, 18 May 2018 15:07:56 +0000 (15:07 +0000)
committer	Yaxun Liu <Yaxun.Liu@amd.com>
	Fri, 18 May 2018 15:07:56 +0000 (15:07 +0000)
include/clang/Driver/Options.td		patch \| blob \| history
lib/CodeGen/CGCUDANV.cpp		patch \| blob \| history
lib/Driver/ToolChains/CommonArgs.cpp		patch \| blob \| history
lib/Driver/ToolChains/CommonArgs.h		patch \| blob \| history
lib/Driver/ToolChains/Gnu.cpp		patch \| blob \| history
test/CodeGenCUDA/device-stub.cu		patch \| blob \| history