LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
-LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
+LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
-def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>,
+def fgpu_rdc : Flag<["-"], "fgpu-rdc">, Flags<[CC1Option]>,
HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
-def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
+def fno_gpu_rdc : Flag<["-"], "fno-gpu-rdc">;
+def : Flag<["-"], "fcuda-rdc">, Alias<fgpu_rdc>;
+def : Flag<["-"], "fno-cuda-rdc">, Alias<fno_gpu_rdc>;
def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>,
HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">;
def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">;
TYPE("dSYM", dSYM, INVALID, "dSYM", "A")
TYPE("dependencies", Dependencies, INVALID, "d", "")
TYPE("cuda-fatbin", CUDA_FATBIN, INVALID, "fatbin","A")
+TYPE("hip-fatbin", HIP_FATBIN, INVALID, "hipfb", "A")
TYPE("none", Nothing, INVALID, nullptr, "u")
//
// With CUDA relocatable device code enabled, these variables don't get
// special handling; they're treated like regular extern variables.
- if (LangOpts.CUDA && !LangOpts.CUDARelocatableDeviceCode &&
+ if (LangOpts.CUDA && !LangOpts.GPURelocatableDeviceCode &&
hasExternalStorage() && hasAttr<CUDASharedAttr>() &&
isa<IncompleteArrayType>(getType()))
return true;
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
: CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
TheModule(CGM.getModule()),
- RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
+ RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) {
CodeGen::CodeGenTypes &Types = CGM.getTypes();
ASTContext &Ctx = CGM.getContext();
// global variable and save a reference in GpuBinaryHandle to be cleaned up
// in destructor on exit. Then associate all known kernels with the GPU binary
// handle so CUDA runtime can figure out what to call on the GPU side.
- std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
- if (!IsHIP) {
+ std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
+ if (!CudaGpuBinaryFileName.empty()) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
ModuleIDSectionName = "__hip_module_id";
ModuleIDPrefix = "__hip_";
- // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
- // The external symbol is supposed to contain the fat binary but will be
- // populated somewhere else, e.g. by lld through link script.
- FatBinStr = new llvm::GlobalVariable(
+ if (CudaGpuBinary) {
+ // If fatbin is available from early finalization, create a string
+ // literal containing the fat binary loaded from the given file.
+ FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+ FatbinConstantName, 8);
+ } else {
+ // If fatbin is not available, create an external symbol
+ // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
+ // to contain the fat binary but will be populated somewhere else,
+ // e.g. by lld through link script.
+ FatBinStr = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty,
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
"__hip_fatbin", nullptr,
llvm::GlobalVariable::NotThreadLocal);
- cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+ cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+ }
FatMagic = HIPFatMagic;
} else {
// thread safety of the loaded program. Therefore we can assume sequential
// execution of constructor functions here.
if (IsHIP) {
+ auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
+ llvm::GlobalValue::LinkOnceAnyLinkage;
llvm::BasicBlock *IfBlock =
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
llvm::BasicBlock *ExitBlock =
// of HIP ABI.
GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, /*isConstant=*/false,
- llvm::GlobalValue::LinkOnceAnyLinkage,
+ Linkage,
/*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
"__hip_gpubin_handle");
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
// Prevent the weak symbol in different shared libraries being merged.
- GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
+ if (Linkage != llvm::GlobalValue::InternalLinkage)
+ GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
Address GpuBinaryAddr(
GpuBinaryHandle,
CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
class HIPActionBuilder final : public CudaActionBuilderBase {
/// The linker inputs obtained for each device arch.
SmallVector<ActionList, 8> DeviceLinkerInputs;
+ bool Relocatable;
public:
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
- : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+ : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+ Relocatable(false) {}
bool canUseBundlerUnbundler() const override { return true; }
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
// amdgcn does not support linking of object files, therefore we skip
- // backend and assemble phases to output LLVM IR.
- if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+ // backend and assemble phases to output LLVM IR. Except for generating
+ // non-relocatable device coee, where we generate fat binary for device
+ // code and pass to host in Backend phase.
+ if (CudaDeviceActions.empty() ||
+ (CurPhase == phases::Backend && Relocatable) ||
CurPhase == phases::Assemble)
return ABRT_Success;
- assert((CurPhase == phases::Link ||
+ assert(((CurPhase == phases::Link && Relocatable) ||
CudaDeviceActions.size() == GpuArchList.size()) &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");
- // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
- // This happens to each device action originated from each input file.
- // Later on, device actions in DeviceLinkerInputs are used to create
- // device link actions in appendLinkDependences and the created device
- // link actions are passed to the offload action as device dependence.
- if (CurPhase == phases::Link) {
+ if (!Relocatable && CurPhase == phases::Backend) {
+ // If we are in backend phase, we attempt to generate the fat binary.
+ // We compile each arch to IR and use a link action to generate code
+ // object containing ISA. Then we use a special "link" action to create
+ // a fat binary containing all the code objects for different GPU's.
+ // The fat binary is then an input to the host action.
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ // Create a link action to link device IR with device library
+ // and generate ISA.
+ ActionList AL;
+ AL.push_back(CudaDeviceActions[I]);
+ CudaDeviceActions[I] =
+ C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+ // OffloadingActionBuilder propagates device arch until an offload
+ // action. Since the next action for creating fatbin does
+ // not have device arch, whereas the above link action and its input
+ // have device arch, an offload action is needed to stop the null
+ // device arch of the next action being propagated to the above link
+ // action.
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+ CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+ CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+ DDep, CudaDeviceActions[I]->getType());
+ }
+ // Create HIP fat binary with a special "link" action.
+ CudaFatBinary =
+ C.MakeAction<LinkJobAction>(CudaDeviceActions,
+ types::TY_HIP_FATBIN);
+
+ DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+ AssociatedOffloadKind);
+ // Clear the fat binary, it is already a dependence to an host
+ // action.
+ CudaFatBinary = nullptr;
+
+ // Remove the CUDA actions as they are already connected to an host
+ // action or fat binary.
+ CudaDeviceActions.clear();
+
+ return ABRT_Success;
+ } else if (CurPhase == phases::Link) {
+ // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+ // This happens to each device action originated from each input file.
+ // Later on, device actions in DeviceLinkerInputs are used to create
+ // device link actions in appendLinkDependences and the created device
+ // link actions are passed to the offload action as device dependence.
DeviceLinkerInputs.resize(CudaDeviceActions.size());
auto LI = DeviceLinkerInputs.begin();
for (auto *A : CudaDeviceActions) {
++I;
}
}
+
+ bool initialize() override {
+ Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+ options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+ return CudaActionBuilderBase::initialize();
+ }
};
/// OpenMP action builder. The host bitcode is passed to the device frontend
CmdArgs.push_back(Args.MakeArgString(Flags));
}
- if (IsCuda) {
- // Host-side cuda compilation receives all device-side outputs in a single
- // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
- if (CudaDeviceInput) {
+ // Host-side cuda compilation receives all device-side outputs in a single
+ // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
+ if ((IsCuda || IsHIP) && CudaDeviceInput) {
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
- }
+ if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ CmdArgs.push_back("-fgpu-rdc");
+ }
- if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
- CmdArgs.push_back("-fcuda-rdc");
+ if (IsCuda) {
if (Args.hasFlag(options::OPT_fcuda_short_ptr,
options::OPT_fno_cuda_short_ptr, false))
CmdArgs.push_back("-fcuda-short-ptr");
#include "Arch/SystemZ.h"
#include "Arch/X86.h"
#include "Hexagon.h"
+#include "HIP.h"
#include "InputInfo.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LangOptions.h"
if (!JA.isHostOffloading(Action::OFK_HIP))
return;
+ InputInfoList DeviceInputs;
+ for (const auto &II : Inputs) {
+ const Action *A = II.getAction();
+ // Is this a device linking action?
+ if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+ DeviceInputs.push_back(II);
+ }
+ }
+
+ if (DeviceInputs.empty())
+ return;
+
// Create temporary linker script. Keep it if save-temps is enabled.
const char *LKS;
SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
"Wrong platform");
(void)HIPTC;
- // Construct clang-offload-bundler command to bundle object files for
- // for different GPU archs.
- ArgStringList BundlerArgs;
- BundlerArgs.push_back(Args.MakeArgString("-type=o"));
-
- // ToDo: Remove the dummy host binary entry which is required by
- // clang-offload-bundler.
- std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
- std::string BundlerInputArg = "-inputs=/dev/null";
-
- for (const auto &II : Inputs) {
- const Action *A = II.getAction();
- // Is this a device linking action?
- if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
- BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
- StringRef(A->getOffloadingArch()).str();
- BundlerInputArg = BundlerInputArg + "," + II.getFilename();
- }
- }
- BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
- BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
-
- std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+ // The output file name needs to persist through the compilation, therefore
+ // it needs to be created through MakeArgString.
+ std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb");
const char *BundleFile =
C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
- auto BundlerOutputArg =
- Args.MakeArgString(std::string("-outputs=").append(BundleFile));
- BundlerArgs.push_back(BundlerOutputArg);
-
- SmallString<128> BundlerPath(C.getDriver().Dir);
- llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
- const char *Bundler = Args.MakeArgString(BundlerPath);
- C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+ AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);
// Add commands to embed target binaries. We ensure that each section and
// image is 16-byte aligned. This is not mandatory, but increases the
options::OPT_fnoopenmp_relocatable_target,
/*Default=*/true);
else if (JA.isOffloading(Action::OFK_Cuda))
- Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
- options::OPT_fno_cuda_rdc, /*Default=*/false);
+ Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+ options::OPT_fno_gpu_rdc, /*Default=*/false);
if (Relocatable)
CmdArgs.push_back("-c");
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
- if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+ if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
- CC1Args.push_back("-fcuda-rdc");
+ CC1Args.push_back("-fgpu-rdc");
}
if (DriverArgs.hasArg(options::OPT_nocudalib))
C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs));
}
+// Construct a clang-offload-bundler command to bundle code objects for
+// different GPU's into a HIP fat binary.
+void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+ StringRef OutputFileName, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &Args, const Tool& T) {
+ // Construct clang-offload-bundler command to bundle object files for
+ // for different GPU archs.
+ ArgStringList BundlerArgs;
+ BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+ // ToDo: Remove the dummy host binary entry which is required by
+ // clang-offload-bundler.
+ std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+ std::string BundlerInputArg = "-inputs=/dev/null";
+
+ for (const auto &II : Inputs) {
+ const auto* A = II.getAction();
+ BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+ StringRef(A->getOffloadingArch()).str();
+ BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+ }
+ BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+ BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+ auto BundlerOutputArg =
+ Args.MakeArgString(std::string("-outputs=").append(OutputFileName));
+ BundlerArgs.push_back(BundlerOutputArg);
+
+ SmallString<128> BundlerPath(C.getDriver().Dir);
+ llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+ const char *Bundler = Args.MakeArgString(BundlerPath);
+ C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+}
+
// For amdgcn the inputs of the linker job are device bitcode and output is
// object file. It calls llvm-link, opt, llc, then lld steps.
void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
const ArgList &Args,
const char *LinkingOutput) const {
+ if (JA.getType() == types::TY_HIP_FATBIN)
+ return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);
+
assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn &&
"Unsupported target");
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
- if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+ if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
- CC1Args.push_back("-fcuda-rdc");
+ CC1Args.push_back("-fgpu-rdc");
// Default to "hidden" visibility, as object level linking will not be
// supported for the foreseeable future.
namespace tools {
namespace AMDGCN {
+ // Construct command for creating HIP fatbin.
+ void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+ StringRef OutputFileName, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &TCArgs, const Tool& T);
+
// Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with
// device library, then compiles it to ISA in a shared object.
class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
Opts.CUDADeviceApproxTranscendentals = 1;
- Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+ Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc);
if (Opts.ObjC1) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
const auto *VD = cast<VarDecl>(D);
// extern __shared__ is only allowed on arrays with no length (e.g.
// "int x[]").
- if (!S.getLangOpts().CUDARelocatableDeviceCode && VD->hasExternalStorage() &&
+ if (!S.getLangOpts().GPURelocatableDeviceCode && VD->hasExternalStorage() &&
!isa<IncompleteArrayType>(VD->getType())) {
S.Diag(AL.getLoc(), diag::err_cuda_extern_shared) << VD;
return;
// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \
+// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -x hip\
-// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
+// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP,HIPEF
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
-// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,NORDC,HIP,HIPNEF
#include "Inputs/cuda.h"
// * constant unnamed string with the kernel name
// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
// * constant unnamed string with GPU binary
-// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
// CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
+// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
// CUDANORDC-SAME: section ".nv_fatbin", align 8
// CUDARDC-SAME: section "__nv_relfatbin", align 8
// * constant struct that wraps GPU binary
// CUDA-SAME: { i32 1180844977, i32 1,
// HIP-SAME: { i32 1212764230, i32 1,
// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
-// HIP-SAME: i8* @[[FATBIN]],
+// HIPEF-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0),
+// HIPNEF-SAME: i8* @[[FATBIN]],
// ALL-SAME: i8* null }
// CUDA-SAME: section ".nvFatBinSegment"
// HIP-SAME: section ".hipFatBinSegment"
// * variable to save GPU binary handle after initialization
// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
-// HIP: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
+// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
// * constant unnamed string with NVModuleID
// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
// device-side globals, but we still need to register GPU binary.
// Skip GPU binary string first.
// CUDANOGLOBALS: @{{.*}} = private constant{{.*}}
-// HIPNOGLOBALS: @{{.*}} = external constant{{.*}}
+// HIPNOGLOBALS: @{{.*}} = internal constant{{.*}}
// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
// NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor
// NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
// Generating relocatable device code
-// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// With debugging enabled, ptxas should be run with with no ptxas optimizations.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
// Separate compilation targeting sm_35.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
// 32-bit compile.
// RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
// 32-bit compile when generating relocatable device code.
-// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
// Check that we still pass -c when generating relocatable device code.
-// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// Check -Xcuda-ptxas and -Xcuda-fatbinary
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
// Check relocatable device code generation on MacOS.
-// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
-// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
-// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: %clang -### -target i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
// Check that CLANG forwards the -v flag to PTXAS.
// CHECK: "-cc1"
// ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
// ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
// SM20-SAME: "-target-cpu" "sm_20"
// SM35-SAME: "-target-cpu" "sm_35"
// SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
// SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
// Match the call to ptxas (which assembles PTX to SASS).
// CHECK: ptxas
// ARCH64-SAME: "-triple" "x86_64-
// ARCH32-SAME: "-triple" "i386-
// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
-// RDC-SAME: "-fcuda-rdc"
-// CHECK-NOT: "-fcuda-rdc"
+// RDC-SAME: "-fgpu-rdc"
+// CHECK-NOT: "-fgpu-rdc"
// CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
//
// Test single gpu architecture with complete compilation.
//
+// Test CUDA NVPTX phases.
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
// RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+//
+// Test HIP AMDGPU -fgpu-rdc phases.
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_RDC %s
+//
+// Test HIP AMDGPU -fno-gpu-rdc phases (default).
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
// RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD,BIN_AMD_NRDC %s
+//
// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
// BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
// BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
// BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN_AMD_NRDC-DAG: [[P6:[0-9]+]]: linker, {[[P5]]}, image, (device-hip, [[ARCH]])
+// BIN_AMD_NRDC-DAG: [[P7:[0-9]+]]: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, image
+// BIN_AMD_NRDC-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, hip-fatbin, (device-hip)
+// BIN_AMD_NRDC-DAG: [[P11:[0-9]+]]: offload, "host-hip (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-hip (amdgcn-amd-amdhsa)" {[[P8]]}, ir
+// BIN_AMD_NRDC-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
-// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
-// BIN_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
+// BIN_AMD_RDC-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD_RDC-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD_RDC-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
//
// Test single gpu architecture up to the assemble phase.
// RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 -fgpu-rdc %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 -fcuda-rdc %s -S 2>&1 \
// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s 2>&1 \
// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %s -S 2>&1 \
// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target
-// RUN: %clang -### -c -target x86_64-linux-gnu \
+// RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \
// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
// RUN: 2>&1 | FileCheck %s
--- /dev/null
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu -fno-gpu-rdc \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
+// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
+// RUN: -fuse-ld=lld \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+
+//
+// Compile device code in a.cu to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_A_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+
+//
+// Compile device code in a.cu to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[A_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[A_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_A_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_A_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_A_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_A_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+
+//
+// Bundle and embed device code in host object for a.cu.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Compile device code in b.hip to code object for gfx803.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_803:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_803]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_803:".*-gfx803-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx803"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_803:".*-gfx803-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_803]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx803" "-o" [[OBJ_DEV_B_803:".*-gfx803-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+
+//
+// Compile device code in b.hip to code object for gfx900.
+//
+
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
+// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: {{.*}} "-o" [[B_BC_900:".*bc"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+
+// CHECK: [[LLVM_LINK:"*.llvm-link"]] [[B_BC_900]]
+// CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
+// CHECK-SAME: "-o" [[LINKED_BC_DEV_B_900:".*-gfx900-linked-.*bc"]]
+
+// CHECK: [[OPT:".*opt"]] [[LINKED_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-mcpu=gfx900"
+// CHECK-SAME: "-o" [[OPT_BC_DEV_B_900:".*-gfx900-optimized.*bc"]]
+
+// CHECK: [[LLC: ".*llc"]] [[OPT_BC_DEV_B_900]] "-mtriple=amdgcn-amd-amdhsa"
+// CHECK-SAME: "-filetype=obj" "-mcpu=gfx900" "-o" [[OBJ_DEV_B_900:".*-gfx900-.*o"]]
+
+// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+
+//
+// Bundle and embed device code in host object for b.hip.
+//
+
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC]]
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+
+//
+// Link host objects.
+//
+
+// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
+// CHECK-NOT: "-T" "{{.*}}.lk"
// RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
-// RUN: -fuse-ld=lld \
+// RUN: -fuse-ld=lld -fgpu-rdc \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
// CHECK-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
// CHECK-SAME: {{.*}} "-o" [[A_BC:".*bc"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
// CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc"
// CHECK-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
-// CHECK-SAME: "-fcuda-is-device" "-fvisibility" "hidden"
+// CHECK-SAME: "-fcuda-is-device" "-fgpu-rdc" "-fvisibility" "hidden"
// CHECK-SAME: {{.*}} "-o" [[B_BC:".*bc"]] "-x" "hip"
// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*o]]"
+// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
// CHECK-SAME: {{.*}} "-T" "{{.*}}.lk"
// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -verify %s
// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -verify %s
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-rdc -verify=rdc %s
-// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fcuda-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -fsyntax-only -Wundefined-internal -fcuda-is-device -fgpu-rdc -verify=rdc %s
// Most of these declarations are fine in separate compilation mode.