class CudaDeviceAction : public Action {
virtual void anchor();
- /// GPU architecture to bind. Always of the form /sm_\d+/.
+ /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the
+ /// action applies to multiple architectures).
const char *GpuArchName;
/// True when action results are not consumed by the host action (e.g when
/// -fsyntax-only or --cuda-device-only options are used).
const char *getGpuArchName() const { return GpuArchName; }
- /// Gets the compute_XX that corresponds to getGpuArchName().
+ /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null
+ /// when getGpuArchName() is null.
const char *getComputeArchName() const;
bool isAtTopLevel() const { return AtTopLevel; }
def Xclang : Separate<["-"], "Xclang">,
HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
Flags<[DriverOption, CoreOption]>;
+def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
+ HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
+def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
+ HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">;
def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
virtual bool IsIntegratedAssemblerDefault() const { return false; }
/// \brief Check if the toolchain should use the integrated assembler.
- bool useIntegratedAs() const;
+ virtual bool useIntegratedAs() const;
/// IsMathErrnoDefault - Does this tool chain use -fmath-errno by default.
virtual bool IsMathErrnoDefault() const { return true; }
TYPE("image", Image, INVALID, "out", "")
TYPE("dSYM", dSYM, INVALID, "dSYM", "A")
TYPE("dependencies", Dependencies, INVALID, "d", "")
+TYPE("cuda-fatbin", CUDA_FATBIN, INVALID, "fatbin","A")
TYPE("none", Nothing, INVALID, nullptr, "u")
TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
llvm::ConstantStruct::get(FatbinWrapperTy, Values),
"__cuda_fatbin_wrapper");
+ // NVIDIA's cuobjdump looks for fatbins in this section.
+ FatbinWrapper->setSection(".nvFatBinSegment");
// GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
bool AtTopLevel)
: Action(CudaDeviceClass, Input), GpuArchName(ArchName),
AtTopLevel(AtTopLevel) {
- assert(IsValidGpuArchName(GpuArchName));
+ assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
}
const char *CudaDeviceAction::getComputeArchName() const {
os << '"' << BIA->getArchName() << '"' << ", {"
<< PrintActions1(C, *BIA->begin(), Ids) << "}";
} else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
- os << '"' << CDA->getGpuArchName() << '"' << ", {"
- << PrintActions1(C, *CDA->begin(), Ids) << "}";
+ os << '"'
+ << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+ << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
} else {
const ActionList *AL;
if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
// Check whether any of device actions stopped before they could generate PTX.
bool PartialCompilation =
llvm::any_of(CudaDeviceActions, [](const Action *a) {
- return a->getKind() != Action::BackendJobClass;
+ return a->getKind() != Action::AssembleJobClass;
});
// Figure out what to do with device actions -- pass them as inputs to the
return HostAction;
}
- // Outputs of device actions during complete CUDA compilation get created
- // with AtTopLevel=false and become inputs for the host action.
+ // If we're not a partial or device-only compilation, we compile each arch to
+ // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host compilation.
ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- DeviceActions.push_back(
- C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
- /* AtTopLevel */ false));
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ Action* AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action* BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (const auto& A : {AssembleAction, BackendAction}) {
+ DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+ A, GpuArchList[I], /* AtTopLevel */ false));
+ }
+ }
+ auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+ /* GpuArchName = */ nullptr,
+ /* AtTopLevel = */ false);
// Return a new host action that incorporates original host action and all
// device actions.
- return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+ return C.MakeAction<CudaHostAction>(std::move(HostAction),
+ ActionList({FatbinAction}));
}
void Driver::BuildActions(Compilation &C, const ToolChain &TC,
return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
}
case phases::Assemble:
- return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+ return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object);
}
llvm_unreachable("invalid phase in ConstructPhaseAction");
if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
// Initial processing of CudaDeviceAction carries host params.
// Call BuildJobsForAction() again, now with correct device parameters.
- assert(CDA->getGpuArchName() && "No GPU name in device action.");
- return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
- CDA->getGpuArchName(), CDA->isAtTopLevel(),
- /*MultipleArchs*/ true, LinkingOutput,
- CachedResults);
+ InputInfo II = BuildJobsForAction(
+ C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+ CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+ CachedResults);
+ // Currently II's Action is *CDA->begin(). Set it to CDA instead, so that
+ // one can retrieve II's GPU arch.
+ II.setAction(A);
+ return II;
}
const ActionList *Inputs = &A->getInputs();
continue;
CudaInstallPath = CudaPath;
+ CudaBinPath = CudaPath + "/bin";
CudaIncludePath = CudaInstallPath + "/include";
CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
CudaLibPath =
CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
if (!(D.getVFS().exists(CudaIncludePath) &&
- D.getVFS().exists(CudaLibPath) &&
+ D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
D.getVFS().exists(CudaLibDevicePath)))
continue;
return new tools::dragonfly::Linker(*this);
}
-/// Stub for CUDA toolchain. At the moment we don't have assembler or
-/// linker and need toolchain mainly to propagate device-side options
-/// to CC1.
+/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                             const ArgList &Args)
-    : Linux(D, Triple, Args) {}
+    : Linux(D, Triple, Args) {
+  // Register the detected CUDA bin directory as a program search path so that
+  // GetProgramPath() can locate ptxas and fatbinary for the device jobs.
+  if (CudaInstallation.isValid())
+    getProgramPaths().push_back(CudaInstallation.getBinPath());
+}
void
CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
for (Arg *A : Args) {
if (A->getOption().matches(options::OPT_Xarch__)) {
// Skip this argument unless the architecture matches BoundArch
- if (A->getValue(0) != StringRef(BoundArch))
+ if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
continue;
unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
DAL->append(A);
}
- DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+ if (BoundArch)
+ DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
return DAL;
}
+// Device-side assembler for CUDA is ptxas (wrapped by NVPTX::Assembler).
+Tool *CudaToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+// Device-side "linker" for CUDA is fatbinary (wrapped by NVPTX::Linker); it
+// combines cubins/PTX into a single fatbin rather than linking objects.
+Tool *CudaToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
/// XCore tool chain
XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
bool IsValid;
const Driver &D;
std::string CudaInstallPath;
+ std::string CudaBinPath;
std::string CudaLibPath;
std::string CudaLibDevicePath;
std::string CudaIncludePath;
/// \brief Get the detected Cuda installation path.
StringRef getInstallPath() const { return CudaInstallPath; }
+ /// \brief Get the detected path to Cuda's bin directory.
+ StringRef getBinPath() const { return CudaBinPath; }
/// \brief Get the detected Cuda Include path.
StringRef getIncludePath() const { return CudaIncludePath; }
/// \brief Get the detected Cuda library path.
const char *BoundArch) const override;
void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const override;
+
+ // Never try to use the integrated assembler with CUDA; always fork out to
+ // ptxas.
+ bool useIntegratedAs() const override { return false; }
+
+protected:
+ Tool *buildAssembler() const override; // ptxas
+ Tool *buildLinker() const override; // fatbinary (ok, not really a linker)
};
class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux {
else
ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
}
+
+// Build the ptxas invocation that assembles one PTX input into a cubin for
+// the single GPU architecture bound to this job via -march (asserted below).
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  // The bound arch arrives as -march=sm_XX; ptxas handles exactly one arch
+  // per invocation.
+  std::vector<std::string> gpu_archs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string& gpu_arch = gpu_archs[0];
+
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+
+  // Clang's default optimization level is -O0, but ptxas's default is -O3.
+  CmdArgs.push_back(Args.MakeArgString(
+      llvm::Twine("-O") +
+      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+  // not supported at other optimization levels.
+
+  CmdArgs.push_back("--gpu-name");
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back("--output-file");
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  for (const auto& II : Inputs)
+    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+
+  // Forward any -Xcuda-ptxas arguments verbatim, after our own flags so the
+  // user can override them.
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+//
+// Builds the fatbinary invocation that stitches the per-arch cubin and PTX
+// inputs into a single fatbin (the Output).
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back("--cuda");
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back(Args.MakeArgString("--create"));
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+  for (const auto& II : Inputs) {
+    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    // We need to pass an Arch of the form "sm_XX" for cubin files and
+    // "compute_XX" for ptx.
+    const char *Arch = (II.getType() == types::TY_PP_Asm)
+                           ? A->getComputeArchName()
+                           : A->getGpuArchName();
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+                                         Arch + ",file=" + II.getFilename()));
+  }
+
+  // Forward any -Xcuda-fatbinary arguments verbatim.
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
};
} // end namespace PS4cpu
+namespace NVPTX {
+
+// Run ptxas, the NVPTX assembler.
+// Instantiated by CudaToolChain::buildAssembler(); supports response files
+// via "--options-file".
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+ public:
+  Assembler(const ToolChain &TC)
+      : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
+// assembly into a single output file.
+// Instantiated by CudaToolChain::buildLinker(); not a real linker, but it
+// fills the "link" step of the device-side pipeline.
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+ public:
+  Linker(const ToolChain &TC)
+      : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+             "--options-file") {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+} // end namespace NVPTX
+
+
} // end namespace tools
} // end namespace driver
} // end namespace clang
P.push_back(phases::Compile);
P.push_back(phases::Backend);
}
- if (Id != TY_CUDA_DEVICE)
- P.push_back(phases::Assemble);
+ P.push_back(phases::Assemble);
}
}
--- /dev/null
+// Tests that "sm_XX" gets correctly converted to "compute_YY" when we invoke
+// fatbinary.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// CHECK:fatbinary
+
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_21 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM21 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM30 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_32 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM32 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_37 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM37 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_50 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM50 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_52 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM52 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_53 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM53 %s
+
+// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20
+// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20
+// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30
+// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32
+// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35
+// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37
+// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50
+// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52
+// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53
--- /dev/null
+// Tests that ptxas and fatbinary are correctly invoked during CUDA compilation.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Regular compile with -O2.
+// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile without -O. This should result in us passing -O0 to ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Regular compile targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+
+// 32-bit compile.
+// RUN: %clang -### -target x86_32-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
+// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Check -Xcuda-ptxas and -Xcuda-fatbinary
+// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
+// RUN: -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
+// RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
+// RUN: -check-prefix FATBINARY-EXTRA %s
+
+// Match clang job that produces PTX assembly.
+// CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20: "-target-cpu" "sm_20"
+// SM35: "-target-cpu" "sm_35"
+// SM20: "-o" "[[PTXFILE:[^"]*]]"
+// SM35: "-o" "[[PTXFILE:[^"]*]]"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// CHECK:ptxas
+// ARCH64: "-m64"
+// ARCH32: "-m32"
+// OPT0: "-O0"
+// OPT2: "-O2"
+// SM20: "--gpu-name" "sm_20"
+// SM35: "--gpu-name" "sm_35"
+// SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
+// SM35: "--output-file" "[[CUBINFILE:[^"]*]]"
+// PTXAS-EXTRA: "-foo1"
+// PTXAS-EXTRA-SAME: "-foo2"
+// CHECK-SAME: "[[PTXFILE]]"
+
+// Match the call to fatbinary (which combines all our PTX and SASS into one
+// blob).
+// CHECK:fatbinary
+// CHECK-DAG: "--cuda"
+// ARCH64-DAG: "-64"
+// ARCH32-DAG: "-32"
+// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// SM20-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
+// SM35-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
+// SM20-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
+// SM35-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
+// FATBINARY-EXTRA: "-bar1"
+// FATBINARY-EXTRA-SAME: "-bar2"
+
+// Match the clang job for host compilation.
+// CHECK: "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix NOHOST -check-prefix NOLINK %s
-// Verify that with -S we compile host and device sides to assembly and
-// incorporate device code into the host side.
-// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \
-// RUN: -check-prefix NOLINK %s
-
// Verify that --cuda-gpu-arch option passes the correct GPU archtecture to
// device compilation.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: -check-prefix DEVICE2 -check-prefix DEVICE-SM35 \
// RUN: -check-prefix DEVICE2-SM30 -check-prefix HOST \
// RUN: -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN: -check-prefix INCLUDES-DEVICE2 -check-prefix NOLINK %s
+// RUN: -check-prefix NOLINK %s
// Verify that device-side results are passed to the correct tool when
// -save-temps is used.
// DEVICE-NOSAVE-SAME: "-aux-triple" "x86_64--linux-gnu"
// DEVICE-SAME: "-fcuda-is-device"
// DEVICE-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
+// DEVICE-SAME: "-o" "[[PTXFILE:[^"]*]]"
// DEVICE-NOSAVE-SAME: "-x" "cuda"
// DEVICE-SAVE-SAME: "-x" "ir"
+// Match the call to ptxas (which assembles PTX to SASS).
+// DEVICE:ptxas
+// DEVICE-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
+// DEVICE-DAG: "[[PTXFILE]]"
+
// Match another device-side compilation.
// DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
// NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// NODEVICE-SAME-NOT: "-fcuda-is-device"
+// INCLUDES-DEVICE:fatbinary
+// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+
// Match host-side preprocessor job with -save-temps.
// HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
// HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
// HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
// HOST-NOSAVE-SAME: "-x" "cuda"
// HOST-SAVE-SAME: "-x" "cuda-cpp-output"
-// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
-// INCLUDES-DEVICE2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
// Match external assembler that uses compilation output.
// HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"