def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
+def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">,
+ Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>; // Hidden: prints the generated HIP offload linker script to stderr (for testing with -###).
def dA : Flag<["-"], "dA">, Group<d_Group>;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode in addition to normal output">;
using namespace CodeGen;
namespace {
+constexpr unsigned CudaFatMagic = 0x466243b1;
+constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
class CGNVCUDARuntime : public CGCUDARuntime {
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
+ bool IsHIP = CGM.getLangOpts().HIP;
// No need to generate ctors/dtors if there is no GPU binary.
- std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
- if (GpuBinaryFileName.empty())
+ StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+ if (CudaGpuBinaryFileName.empty() && !IsHIP)
return nullptr;
- // void __cuda_register_globals(void* handle);
+ // void __{cuda|hip}_register_globals(void* handle);
llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// We always need a function to pass in as callback. Create a dummy
// implementation if we don't need to register anything.
if (RelocatableDeviceCode && !RegisterGlobalsFunc)
RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
- // void ** __cudaRegisterFatBinary(void *);
+ // void ** __{cuda|hip}RegisterFatBinary(void *);
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
addUnderscoredPrefixToName("RegisterFatBinary"));
// global variable and save a reference in GpuBinaryHandle to be cleaned up
// in destructor on exit. Then associate all known kernels with the GPU binary
// handle so CUDA runtime can figure out what to call on the GPU side.
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
- llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
- if (std::error_code EC = GpuBinaryOrErr.getError()) {
- CGM.getDiags().Report(diag::err_cannot_open_file)
- << GpuBinaryFileName << EC.message();
- return nullptr;
+ std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
+ if (!IsHIP) {
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
+ llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
+ if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
+ CGM.getDiags().Report(diag::err_cannot_open_file)
+ << CudaGpuBinaryFileName << EC.message();
+ return nullptr;
+ }
+ CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
}
llvm::Function *ModuleCtorFunc = llvm::Function::Create(
CtorBuilder.SetInsertPoint(CtorEntryBB);
const char *FatbinConstantName;
- if (RelocatableDeviceCode)
+ const char *FatbinSectionName;
+ const char *ModuleIDSectionName;
+ StringRef ModuleIDPrefix;
+ llvm::Constant *FatBinStr;
+ unsigned FatMagic;
+ if (IsHIP) {
+ FatbinConstantName = ".hip_fatbin";
+ FatbinSectionName = ".hipFatBinSegment";
+
+ ModuleIDSectionName = "__hip_module_id";
+ ModuleIDPrefix = "__hip_";
+
+ // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
+ // The external symbol is supposed to contain the fat binary but will be
+ // populated somewhere else, e.g. by lld through a linker script.
+ FatBinStr = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int8Ty,
+ /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
+ "__hip_fatbin", nullptr,
+ llvm::GlobalVariable::NotThreadLocal);
+ cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+
+ FatMagic = HIPFatMagic;
+ } else {
+ if (RelocatableDeviceCode)
+ // TODO: Figure out how this is called on mac OS!
+ FatbinConstantName = "__nv_relfatbin";
+ else
+ FatbinConstantName =
+ CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+ // NVIDIA's cuobjdump looks for fatbins in this section.
+ FatbinSectionName =
+ CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
// TODO: Figure out how this is called on mac OS!
- FatbinConstantName = "__nv_relfatbin";
- else
- FatbinConstantName =
- CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
- // NVIDIA's cuobjdump looks for fatbins in this section.
- const char *FatbinSectionName =
- CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
- // TODO: Figure out how this is called on mac OS!
- const char *NVModuleIDSectionName = "__nv_module_id";
+ ModuleIDSectionName = "__nv_module_id";
+ ModuleIDPrefix = "__nv_";
+
+ // For CUDA, create a string literal containing the fat binary loaded from
+ // the given file.
+ FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+ FatbinConstantName, 8);
+ FatMagic = CudaFatMagic;
+ }
// Create initialized wrapper structure that points to the loaded GPU binary
ConstantInitBuilder Builder(CGM);
auto Values = Builder.beginStruct(FatbinWrapperTy);
// Fatbin wrapper magic.
- Values.addInt(IntTy, 0x466243b1);
+ Values.addInt(IntTy, FatMagic);
// Fatbin version.
Values.addInt(IntTy, 1);
// Data.
- Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
- FatbinConstantName, 8));
+ Values.add(FatBinStr);
// Unused in fatbin v1.
Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);
- // Register binary with CUDA runtime. This is substantially different in
+ // Register binary with CUDA/HIP runtime. This is substantially different in
// default mode vs. separate compilation!
if (!RelocatableDeviceCode) {
- // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+ // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());
- // Call __cuda_register_globals(GpuBinaryHandle);
+ // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
} else {
// Generate a unique module ID.
- SmallString<64> NVModuleID;
- llvm::raw_svector_ostream OS(NVModuleID);
- OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
- llvm::Constant *NVModuleIDConstant =
- makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
-
- // Create an alias for the FatbinWrapper that nvcc will look for.
+ SmallString<64> ModuleID;
+ llvm::raw_svector_ostream OS(ModuleID);
+ OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
+ llvm::Constant *ModuleIDConstant =
+ makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
+
+ // Create an alias for the FatbinWrapper that nvcc or hip backend will
+ // look for.
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
- Twine("__fatbinwrap") + NVModuleID,
- FatbinWrapper);
+ Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
- // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+ // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
// void *, void (*)(void **))
SmallString<128> RegisterLinkedBinaryName(
addUnderscoredPrefixToName("RegisterLinkedBinary"));
- RegisterLinkedBinaryName += NVModuleID;
+ RegisterLinkedBinaryName += ModuleID;
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
llvm::Value *Args[] = {RegisterGlobalsFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
- NVModuleIDConstant,
+ ModuleIDConstant,
makeDummyFunction(getCallbackFnTy())};
CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
}
Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
for (const auto &II : Inputs) {
- // If the current tool chain refers to an OpenMP offloading host, we should
- // ignore inputs that refer to OpenMP offloading devices - they will be
- // embedded according to a proper linker script.
+ // If the current tool chain refers to an OpenMP or HIP offloading host, we
+ // should ignore inputs that refer to OpenMP or HIP offloading devices -
+ // they will be embedded according to a proper linker script.
if (auto *IA = II.getAction())
- if (JA.isHostOffloading(Action::OFK_OpenMP) &&
- IA->isDeviceOffloading(Action::OFK_OpenMP))
+ if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
+ IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
+ (JA.isHostOffloading(Action::OFK_HIP) &&
+ IA->isDeviceOffloading(Action::OFK_HIP)))
continue;
if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
Lksf << LksBuffer;
}
+/// Add HIP linker script arguments ("-T <script>") at the end of \p CmdArgs
+/// when \p JA is a HIP host-offloading job; otherwise do nothing. Also queues
+/// a clang-offload-bundler command for HIP device link outputs in \p Inputs;
+/// the script embeds that bundle in .hip_fatbin behind the hidden __hip_fatbin
+/// symbol. Use this only in tool chains that support linker scripts.
+void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+ const InputInfo &Output,
+ const InputInfoList &Inputs, const ArgList &Args,
+ ArgStringList &CmdArgs, const JobAction &JA,
+ const Tool &T) {
+ // NOTE: \p TC is not referenced anywhere in this function body.
+ // If this is not a HIP host toolchain, we don't need to do anything.
+ if (!JA.isHostOffloading(Action::OFK_HIP))
+ return;
+
+ // Create the linker script path. With save-temps it is kept as <output>.lk.
+ const char *LKS;
+ SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
+ if (C.getDriver().isSaveTempsEnabled()) {
+ llvm::sys::path::replace_extension(Name, "lk");
+ LKS = C.getArgs().MakeArgString(Name.c_str());
+ } else {
+ llvm::sys::path::replace_extension(Name, "");
+ Name = C.getDriver().GetTemporaryPath(Name, "lk");
+ LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
+ }
+
+ // Add linker script option to the command.
+ CmdArgs.push_back("-T");
+ CmdArgs.push_back(LKS);
+
+ // Create a buffer to write the contents of the linker script.
+ std::string LksBuffer;
+ llvm::raw_string_ostream LksStream(LksBuffer);
+
+ // Get the HIP offload tool chain; it is only read by the assert below.
+ auto *HIPTC = static_cast<const toolchains::CudaToolChain *>(
+ C.getSingleOffloadToolChain<Action::OFK_HIP>());
+ assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
+ "Wrong platform"); // NOTE(review): HIPTC is dereferenced without a null check.
+
+ // Construct clang-offload-bundler command to bundle object files
+ // for different GPU archs.
+ ArgStringList BundlerArgs;
+ BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+ // ToDo: Remove the dummy host binary entry which is required by
+ // clang-offload-bundler.
+ std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+ std::string BundlerInputArg = "-inputs=/dev/null";
+
+ for (const auto &II : Inputs) {
+ const Action *A = II.getAction();
+ // Only HIP device link actions contribute a target/input pair.
+ if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+ BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+ StringRef(A->getOffloadingArch()).str();
+ BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+ }
+ }
+ BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+ BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+ std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+ const char *BundleFile =
+ C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
+ auto BundlerOutputArg =
+ Args.MakeArgString(std::string("-outputs=").append(BundleFile));
+ BundlerArgs.push_back(BundlerOutputArg);
+
+ SmallString<128> BundlerPath(C.getDriver().Dir);
+ llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+ const char *Bundler = Args.MakeArgString(BundlerPath);
+ C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+
+ // Add commands to embed target binaries. We ensure that each section and
+ // image is 16-byte aligned. This is not mandatory, but increases the
+ // likelihood of the data being aligned with a cache block on several main
+ // host machines.
+ LksStream << "/*\n";
+ LksStream << " HIP Offload Linker Script\n";
+ LksStream << " *** Automatically generated by Clang ***\n";
+ LksStream << "*/\n";
+ LksStream << "TARGET(binary)\n";
+ LksStream << "INPUT(" << BundleFileName << ")\n";
+ LksStream << "SECTIONS\n";
+ LksStream << "{\n";
+ LksStream << " .hip_fatbin :\n";
+ LksStream << " ALIGN(0x10)\n";
+ LksStream << " {\n";
+ LksStream << " PROVIDE_HIDDEN(__hip_fatbin = .);\n";
+ LksStream << " " << BundleFileName << "\n";
+ LksStream << " }\n";
+ LksStream << "}\n";
+ LksStream << "INSERT BEFORE .data\n";
+ LksStream.flush();
+
+ // Dump the contents of the linker script if the user requested that. We
+ // support this option to enable testing of behavior with -###.
+ if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
+ llvm::errs() << LksBuffer;
+
+ // If this is a dry run, do not create the linker script file.
+ if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
+ return;
+
+ // Open script file and write the contents.
+ std::error_code EC;
+ llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
+
+ if (EC) {
+ C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+ return;
+ }
+
+ Lksf << LksBuffer; // NOTE(review): stream errors after this write are not checked.
+}
+
SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
const InputInfo &Output,
const InputInfo &Input,
llvm::opt::ArgStringList &CmdArgs,
const JobAction &JA);
+void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+ const InputInfo &Output, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
+ const Tool &T);
+
const char *SplitDebugName(const llvm::opt::ArgList &Args,
const InputInfo &Input);
// Add OpenMP offloading linker script args if required.
AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
+ // Add HIP offloading linker script args if required.
+ AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
+ *this);
+
C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
}
// RUN: echo "GPU binary would be here" > %t
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - \
-// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA
+// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
-// RUN: | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: | FileCheck %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \
-// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA
+// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
// RUN: | FileCheck %s -check-prefix=NOGPUBIN
// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,HIP
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \
-// RUN: | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP
+// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
// RUN: | FileCheck %s -check-prefix=NOGPUBIN
// * constant unnamed string with the kernel name
// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
// * constant unnamed string with GPU binary
-// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00"
-// NORDC-SAME: section ".nv_fatbin", align 8
-// RDC-SAME: section "__nv_relfatbin", align 8
+// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
+// CUDA: @[[FATBIN:.*]] = private unnamed_addr constant{{.*GPU binary would be here.*}}\00",
+// CUDANORDC-SAME: section ".nv_fatbin", align 8
+// CUDARDC-SAME: section "__nv_relfatbin", align 8
// * constant struct that wraps GPU binary
-// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant
-// CUDA-SAME: { i32, i32, i8*, i8* }
-// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant
-// HIP-SAME: { i32, i32, i8*, i8* }
-// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
-// ALL-SAME: section ".nvFatBinSegment"
+// ALL: @__[[PREFIX:cuda|hip]]_fatbin_wrapper = internal constant
+// ALL-SAME: { i32, i32, i8*, i8* }
+// CUDA-SAME: { i32 1180844977, i32 1,
+// HIP-SAME: { i32 1212764230, i32 1,
+// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIP-SAME: i8* @[[FATBIN]],
+// ALL-SAME: i8* null }
+// CUDA-SAME: section ".nvFatBinSegment"
+// HIP-SAME: section ".hipFatBinSegment"
// * variable to save GPU binary handle after initialization
// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
// * constant unnamed string with the module ID
// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
-// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
+// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
// * Make sure our constructor was added to global ctor list.
// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
// * In separate mode we also register a destructor.
// There should be no __[[PREFIX]]_register_globals if we have no
// device-side globals, but we still need to register GPU binary.
// Skip GPU binary string first.
-// NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
+// CUDANOGLOBALS: @{{.*}} = private unnamed_addr constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
-// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor
+// NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
// NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
// NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals
// NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor